{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 1632, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012254901960784314, "grad_norm": 1.6059832937437346, "learning_rate": 6.0975609756097564e-06, "loss": 1.3541, "step": 1 }, { "epoch": 0.006127450980392157, "grad_norm": 1.5281382037952618, "learning_rate": 3.048780487804878e-05, "loss": 1.3583, "step": 5 }, { "epoch": 0.012254901960784314, "grad_norm": 0.5993529609680898, "learning_rate": 6.097560975609756e-05, "loss": 1.3375, "step": 10 }, { "epoch": 0.01838235294117647, "grad_norm": 0.7098761948973563, "learning_rate": 9.146341463414634e-05, "loss": 1.2793, "step": 15 }, { "epoch": 0.024509803921568627, "grad_norm": 0.4448928165735333, "learning_rate": 0.00012195121951219512, "loss": 1.2037, "step": 20 }, { "epoch": 0.030637254901960783, "grad_norm": 0.40161131951951523, "learning_rate": 0.0001524390243902439, "loss": 1.1336, "step": 25 }, { "epoch": 0.03676470588235294, "grad_norm": 0.2541806870693389, "learning_rate": 0.00018292682926829268, "loss": 1.1063, "step": 30 }, { "epoch": 0.0428921568627451, "grad_norm": 0.1491059867354642, "learning_rate": 0.00021341463414634146, "loss": 1.0795, "step": 35 }, { "epoch": 0.049019607843137254, "grad_norm": 0.14162648176415202, "learning_rate": 0.00024390243902439024, "loss": 1.0729, "step": 40 }, { "epoch": 0.05514705882352941, "grad_norm": 0.14138145278805817, "learning_rate": 0.00027439024390243905, "loss": 1.0504, "step": 45 }, { "epoch": 0.061274509803921566, "grad_norm": 0.11598136180137979, "learning_rate": 0.0003048780487804878, "loss": 1.0378, "step": 50 }, { "epoch": 0.06740196078431372, "grad_norm": 0.14589294245833054, "learning_rate": 0.0003353658536585366, "loss": 1.0446, "step": 55 }, { "epoch": 0.07352941176470588, "grad_norm": 0.14730649378552943, "learning_rate": 0.00036585365853658537, "loss": 1.0333, "step": 60 }, { "epoch": 0.07965686274509803, "grad_norm": 0.11759388636190853, "learning_rate": 0.0003963414634146342, "loss": 1.0143, "step": 65 }, { "epoch": 0.0857843137254902, "grad_norm": 0.12311842814802663, "learning_rate": 0.0004268292682926829, "loss": 1.0148, "step": 70 }, { "epoch": 0.09191176470588236, "grad_norm": 0.11751325803408062, "learning_rate": 0.00045731707317073173, "loss": 1.0176, "step": 75 }, { "epoch": 0.09803921568627451, "grad_norm": 0.13732149342206665, "learning_rate": 0.0004878048780487805, "loss": 1.0042, "step": 80 }, { "epoch": 0.10416666666666667, "grad_norm": 0.11309991089195273, "learning_rate": 0.0005182926829268293, "loss": 1.0065, "step": 85 }, { "epoch": 0.11029411764705882, "grad_norm": 0.12583526404678766, "learning_rate": 0.0005487804878048781, "loss": 1.0149, "step": 90 }, { "epoch": 0.11642156862745098, "grad_norm": 0.12017272173906957, "learning_rate": 0.0005792682926829268, "loss": 0.9948, "step": 95 }, { "epoch": 0.12254901960784313, "grad_norm": 0.11719746827044987, "learning_rate": 0.0006097560975609756, "loss": 0.9905, "step": 100 }, { "epoch": 0.12867647058823528, "grad_norm": 0.13729668356371552, "learning_rate": 0.0006402439024390244, "loss": 0.9977, "step": 105 }, { "epoch": 0.13480392156862744, "grad_norm": 0.12981775917103477, "learning_rate": 0.0006707317073170732, "loss": 1.0017, "step": 110 }, { "epoch": 0.1409313725490196, "grad_norm": 0.13870526825011106, "learning_rate": 0.0007012195121951219, "loss": 0.9997, "step": 115 }, { "epoch": 0.14705882352941177, "grad_norm": 0.12747253256734123, "learning_rate": 0.0007317073170731707, "loss": 0.9877, "step": 120 }, { "epoch": 0.15318627450980393, "grad_norm": 0.12869096982052203, "learning_rate": 0.0007621951219512195, "loss": 0.9823, "step": 125 }, { "epoch": 0.15931372549019607, "grad_norm": 0.12889388640011323, "learning_rate": 0.0007926829268292683, "loss": 0.9774, "step": 130 }, { "epoch": 0.16544117647058823, "grad_norm": 0.13890909917337924, "learning_rate": 0.000823170731707317, "loss": 0.9874, "step": 135 }, { "epoch": 0.1715686274509804, "grad_norm": 0.167350127299137, "learning_rate": 0.0008536585365853659, "loss": 0.9855, "step": 140 }, { "epoch": 0.17769607843137256, "grad_norm": 0.12637500439423752, "learning_rate": 0.0008841463414634147, "loss": 0.9837, "step": 145 }, { "epoch": 0.18382352941176472, "grad_norm": 0.13036804871454152, "learning_rate": 0.0009146341463414635, "loss": 0.9846, "step": 150 }, { "epoch": 0.18995098039215685, "grad_norm": 0.13716320964191064, "learning_rate": 0.0009451219512195122, "loss": 0.9742, "step": 155 }, { "epoch": 0.19607843137254902, "grad_norm": 0.1505410439189676, "learning_rate": 0.000975609756097561, "loss": 0.9657, "step": 160 }, { "epoch": 0.20220588235294118, "grad_norm": 0.13939847463577426, "learning_rate": 0.0009999988550474805, "loss": 0.982, "step": 165 }, { "epoch": 0.20833333333333334, "grad_norm": 0.1723912783342495, "learning_rate": 0.000999958782259877, "loss": 0.9771, "step": 170 }, { "epoch": 0.21446078431372548, "grad_norm": 0.20339647033601935, "learning_rate": 0.0009998614670898504, "loss": 0.9742, "step": 175 }, { "epoch": 0.22058823529411764, "grad_norm": 0.1805520187680447, "learning_rate": 0.0009997069206794246, "loss": 0.9724, "step": 180 }, { "epoch": 0.2267156862745098, "grad_norm": 0.14101202234864954, "learning_rate": 0.000999495160723267, "loss": 0.9699, "step": 185 }, { "epoch": 0.23284313725490197, "grad_norm": 0.14408773000983136, "learning_rate": 0.0009992262114666653, "loss": 0.9721, "step": 190 }, { "epoch": 0.23897058823529413, "grad_norm": 0.12211770445331499, "learning_rate": 0.0009989001037027502, "loss": 0.9638, "step": 195 }, { "epoch": 0.24509803921568626, "grad_norm": 0.11058292325672553, "learning_rate": 0.0009985168747689707, "loss": 0.9613, "step": 200 }, { "epoch": 0.2512254901960784, "grad_norm": 0.151326097792135, "learning_rate": 0.0009980765685428175, "loss": 0.9784, "step": 205 }, { "epoch": 0.25735294117647056, "grad_norm": 0.13748406413323222, "learning_rate": 0.0009975792354368017, "loss": 0.9578, "step": 210 }, { "epoch": 0.26348039215686275, "grad_norm": 0.13529789016829663, "learning_rate": 0.000997024932392681, "loss": 0.9561, "step": 215 }, { "epoch": 0.2696078431372549, "grad_norm": 0.11913360946345405, "learning_rate": 0.0009964137228749407, "loss": 0.9587, "step": 220 }, { "epoch": 0.2757352941176471, "grad_norm": 0.11311287494378686, "learning_rate": 0.0009957456768635274, "loss": 0.9534, "step": 225 }, { "epoch": 0.2818627450980392, "grad_norm": 0.1687787460034738, "learning_rate": 0.000995020870845837, "loss": 0.9483, "step": 230 }, { "epoch": 0.28799019607843135, "grad_norm": 0.13894852921173279, "learning_rate": 0.000994239387807957, "loss": 0.9465, "step": 235 }, { "epoch": 0.29411764705882354, "grad_norm": 0.15098532770679463, "learning_rate": 0.0009934013172251653, "loss": 0.9512, "step": 240 }, { "epoch": 0.3002450980392157, "grad_norm": 0.1603820773467054, "learning_rate": 0.0009925067550516852, "loss": 0.9364, "step": 245 }, { "epoch": 0.30637254901960786, "grad_norm": 0.20531045303348816, "learning_rate": 0.0009915558037097002, "loss": 0.9436, "step": 250 }, { "epoch": 0.3125, "grad_norm": 0.1373133786573118, "learning_rate": 0.0009905485720776265, "loss": 0.934, "step": 255 }, { "epoch": 0.31862745098039214, "grad_norm": 1.8470429009125096, "learning_rate": 0.0009894851754776472, "loss": 0.9526, "step": 260 }, { "epoch": 0.3247549019607843, "grad_norm": 0.11771908587868915, "learning_rate": 0.000988365735662509, "loss": 0.9513, "step": 265 }, { "epoch": 0.33088235294117646, "grad_norm": 0.14600554728304563, "learning_rate": 0.0009871903808015812, "loss": 0.9424, "step": 270 }, { "epoch": 0.33700980392156865, "grad_norm": 0.12078857521986237, "learning_rate": 0.0009859592454661823, "loss": 0.9501, "step": 275 }, { "epoch": 0.3431372549019608, "grad_norm": 0.12236086569601477, "learning_rate": 0.0009846724706141716, "loss": 0.9445, "step": 280 }, { "epoch": 0.3492647058823529, "grad_norm": 0.13112435126866281, "learning_rate": 0.0009833302035738107, "loss": 0.9363, "step": 285 }, { "epoch": 0.3553921568627451, "grad_norm": 0.14235863463223497, "learning_rate": 0.0009819325980268945, "loss": 0.9485, "step": 290 }, { "epoch": 0.36151960784313725, "grad_norm": 0.12952693104841728, "learning_rate": 0.0009804798139911568, "loss": 0.9421, "step": 295 }, { "epoch": 0.36764705882352944, "grad_norm": 0.11889525565491425, "learning_rate": 0.0009789720178019483, "loss": 0.9321, "step": 300 }, { "epoch": 0.3737745098039216, "grad_norm": 0.10497064780361466, "learning_rate": 0.0009774093820931922, "loss": 0.9383, "step": 305 }, { "epoch": 0.3799019607843137, "grad_norm": 0.1297312332951932, "learning_rate": 0.0009757920857776188, "loss": 0.9315, "step": 310 }, { "epoch": 0.3860294117647059, "grad_norm": 0.12939401003239853, "learning_rate": 0.0009741203140262813, "loss": 0.931, "step": 315 }, { "epoch": 0.39215686274509803, "grad_norm": 0.1724687096774714, "learning_rate": 0.0009723942582473544, "loss": 0.9244, "step": 320 }, { "epoch": 0.39828431372549017, "grad_norm": 0.1440746949158844, "learning_rate": 0.000970614116064219, "loss": 0.9186, "step": 325 }, { "epoch": 0.40441176470588236, "grad_norm": 0.11601098074712127, "learning_rate": 0.0009687800912928362, "loss": 0.9331, "step": 330 }, { "epoch": 0.4105392156862745, "grad_norm": 0.16676922492027155, "learning_rate": 0.0009668923939184109, "loss": 0.9282, "step": 335 }, { "epoch": 0.4166666666666667, "grad_norm": 0.16264723211095694, "learning_rate": 0.0009649512400713498, "loss": 0.931, "step": 340 }, { "epoch": 0.4227941176470588, "grad_norm": 0.12835068212058262, "learning_rate": 0.000962956852002516, "loss": 0.9212, "step": 345 }, { "epoch": 0.42892156862745096, "grad_norm": 0.11722967905017813, "learning_rate": 0.0009609094580577824, "loss": 0.9301, "step": 350 }, { "epoch": 0.43504901960784315, "grad_norm": 0.11753159563989828, "learning_rate": 0.0009588092926518875, "loss": 0.9181, "step": 355 }, { "epoch": 0.4411764705882353, "grad_norm": 0.3058794099499992, "learning_rate": 0.0009566565962415959, "loss": 0.9276, "step": 360 }, { "epoch": 0.44730392156862747, "grad_norm": 0.11638001282284417, "learning_rate": 0.0009544516152981679, "loss": 0.9187, "step": 365 }, { "epoch": 0.4534313725490196, "grad_norm": 0.0979888983879341, "learning_rate": 0.0009521946022791401, "loss": 0.9178, "step": 370 }, { "epoch": 0.45955882352941174, "grad_norm": 0.12474491987381439, "learning_rate": 0.0009498858155994194, "loss": 0.9248, "step": 375 }, { "epoch": 0.46568627450980393, "grad_norm": 0.11539100379640589, "learning_rate": 0.0009475255196016972, "loss": 0.9082, "step": 380 }, { "epoch": 0.47181372549019607, "grad_norm": 0.11060768973607205, "learning_rate": 0.0009451139845261834, "loss": 0.913, "step": 385 }, { "epoch": 0.47794117647058826, "grad_norm": 0.10933884625046772, "learning_rate": 0.0009426514864796647, "loss": 0.9164, "step": 390 }, { "epoch": 0.4840686274509804, "grad_norm": 0.12110040734071437, "learning_rate": 0.000940138307403893, "loss": 0.918, "step": 395 }, { "epoch": 0.49019607843137253, "grad_norm": 0.12516912467922992, "learning_rate": 0.0009375747350433044, "loss": 0.9099, "step": 400 }, { "epoch": 0.4963235294117647, "grad_norm": 0.15405339011075656, "learning_rate": 0.0009349610629120733, "loss": 0.9153, "step": 405 }, { "epoch": 0.5024509803921569, "grad_norm": 0.15983105881560317, "learning_rate": 0.0009322975902605082, "loss": 0.9139, "step": 410 }, { "epoch": 0.508578431372549, "grad_norm": 0.12456698491529061, "learning_rate": 0.000929584622040788, "loss": 0.9196, "step": 415 }, { "epoch": 0.5147058823529411, "grad_norm": 0.39391857600951824, "learning_rate": 0.0009268224688720474, "loss": 0.911, "step": 420 }, { "epoch": 0.5208333333333334, "grad_norm": 0.135863544001222, "learning_rate": 0.0009240114470048129, "loss": 0.9082, "step": 425 }, { "epoch": 0.5269607843137255, "grad_norm": 0.16266736226237394, "learning_rate": 0.0009211518782847931, "loss": 0.9208, "step": 430 }, { "epoch": 0.5330882352941176, "grad_norm": 0.14823148129955332, "learning_rate": 0.0009182440901160307, "loss": 0.9243, "step": 435 }, { "epoch": 0.5392156862745098, "grad_norm": 0.14300453285294948, "learning_rate": 0.0009152884154234145, "loss": 0.9082, "step": 440 }, { "epoch": 0.5453431372549019, "grad_norm": 0.11464976577167021, "learning_rate": 0.0009122851926145641, "loss": 0.9066, "step": 445 }, { "epoch": 0.5514705882352942, "grad_norm": 0.1380022527629278, "learning_rate": 0.0009092347655410818, "loss": 0.9059, "step": 450 }, { "epoch": 0.5575980392156863, "grad_norm": 0.12505924658066125, "learning_rate": 0.0009061374834591849, "loss": 0.9106, "step": 455 }, { "epoch": 0.5637254901960784, "grad_norm": 0.14573814560545809, "learning_rate": 0.0009029937009897176, "loss": 0.9085, "step": 460 }, { "epoch": 0.5698529411764706, "grad_norm": 0.12549980653285886, "learning_rate": 0.0008998037780775488, "loss": 0.9134, "step": 465 }, { "epoch": 0.5759803921568627, "grad_norm": 1.0091372919881914, "learning_rate": 0.0008965680799503608, "loss": 0.9012, "step": 470 }, { "epoch": 0.5821078431372549, "grad_norm": 0.12482100803406715, "learning_rate": 0.0008932869770768326, "loss": 0.9083, "step": 475 }, { "epoch": 0.5882352941176471, "grad_norm": 0.16842450336753725, "learning_rate": 0.0008899608451242233, "loss": 0.906, "step": 480 }, { "epoch": 0.5943627450980392, "grad_norm": 0.37435753717247394, "learning_rate": 0.0008865900649153606, "loss": 0.9116, "step": 485 }, { "epoch": 0.6004901960784313, "grad_norm": 0.1504221848097725, "learning_rate": 0.0008831750223850389, "loss": 0.9069, "step": 490 }, { "epoch": 0.6066176470588235, "grad_norm": 0.6779799933680686, "learning_rate": 0.0008797161085358317, "loss": 0.8935, "step": 495 }, { "epoch": 0.6127450980392157, "grad_norm": 0.4033225235584718, "learning_rate": 0.0008762137193933241, "loss": 0.898, "step": 500 }, { "epoch": 0.6188725490196079, "grad_norm": 0.12597786237207237, "learning_rate": 0.0008726682559607706, "loss": 0.8965, "step": 505 }, { "epoch": 0.625, "grad_norm": 0.12434445433374085, "learning_rate": 0.0008690801241731818, "loss": 0.9195, "step": 510 }, { "epoch": 0.6311274509803921, "grad_norm": 0.11180950505927977, "learning_rate": 0.0008654497348508476, "loss": 0.8856, "step": 515 }, { "epoch": 0.6372549019607843, "grad_norm": 0.11471149278729899, "learning_rate": 0.0008617775036523015, "loss": 0.888, "step": 520 }, { "epoch": 0.6433823529411765, "grad_norm": 0.10661278460678202, "learning_rate": 0.000858063851026728, "loss": 0.8844, "step": 525 }, { "epoch": 0.6495098039215687, "grad_norm": 0.13323867388840102, "learning_rate": 0.0008543092021658259, "loss": 0.8917, "step": 530 }, { "epoch": 0.6556372549019608, "grad_norm": 0.11051797161585711, "learning_rate": 0.0008505139869551248, "loss": 0.8966, "step": 535 }, { "epoch": 0.6617647058823529, "grad_norm": 0.10201940027044504, "learning_rate": 0.0008466786399247663, "loss": 0.9086, "step": 540 }, { "epoch": 0.6678921568627451, "grad_norm": 0.11341681063010137, "learning_rate": 0.000842803600199753, "loss": 0.885, "step": 545 }, { "epoch": 0.6740196078431373, "grad_norm": 0.11249706294845149, "learning_rate": 0.0008388893114496705, "loss": 0.8761, "step": 550 }, { "epoch": 0.6801470588235294, "grad_norm": 0.10372930065250463, "learning_rate": 0.0008349362218378904, "loss": 0.8875, "step": 555 }, { "epoch": 0.6862745098039216, "grad_norm": 0.1473106555936528, "learning_rate": 0.0008309447839702582, "loss": 0.8813, "step": 560 }, { "epoch": 0.6924019607843137, "grad_norm": 0.11251651358057665, "learning_rate": 0.0008269154548432722, "loss": 0.8856, "step": 565 }, { "epoch": 0.6985294117647058, "grad_norm": 0.18983642337639475, "learning_rate": 0.0008228486957917607, "loss": 0.8893, "step": 570 }, { "epoch": 0.7046568627450981, "grad_norm": 0.1926017574087406, "learning_rate": 0.0008187449724360605, "loss": 0.8853, "step": 575 }, { "epoch": 0.7107843137254902, "grad_norm": 0.10232469720003605, "learning_rate": 0.0008146047546287076, "loss": 0.8786, "step": 580 }, { "epoch": 0.7169117647058824, "grad_norm": 0.1029258945252611, "learning_rate": 0.0008104285164006415, "loss": 0.8799, "step": 585 }, { "epoch": 0.7230392156862745, "grad_norm": 0.14002711821151018, "learning_rate": 0.0008062167359069301, "loss": 0.8827, "step": 590 }, { "epoch": 0.7291666666666666, "grad_norm": 0.11144023976662691, "learning_rate": 0.0008019698953720256, "loss": 0.8832, "step": 595 }, { "epoch": 0.7352941176470589, "grad_norm": 0.12878152634910894, "learning_rate": 0.000797688481034551, "loss": 0.8852, "step": 600 }, { "epoch": 0.741421568627451, "grad_norm": 0.12427328372120441, "learning_rate": 0.0007933729830916297, "loss": 0.8807, "step": 605 }, { "epoch": 0.7475490196078431, "grad_norm": 0.10823708822180528, "learning_rate": 0.00078902389564276, "loss": 0.8839, "step": 610 }, { "epoch": 0.7536764705882353, "grad_norm": 0.106769228793293, "learning_rate": 0.0007846417166332445, "loss": 0.8756, "step": 615 }, { "epoch": 0.7598039215686274, "grad_norm": 0.10610630308145001, "learning_rate": 0.0007802269477971771, "loss": 0.8786, "step": 620 }, { "epoch": 0.7659313725490197, "grad_norm": 0.11560264922747754, "learning_rate": 0.000775780094599998, "loss": 0.8769, "step": 625 }, { "epoch": 0.7720588235294118, "grad_norm": 0.14907985785369013, "learning_rate": 0.0007713016661806211, "loss": 0.8795, "step": 630 }, { "epoch": 0.7781862745098039, "grad_norm": 0.11129324500895409, "learning_rate": 0.00076679217529314, "loss": 0.875, "step": 635 }, { "epoch": 0.7843137254901961, "grad_norm": 0.14839037572821354, "learning_rate": 0.0007622521382481208, "loss": 0.8703, "step": 640 }, { "epoch": 0.7904411764705882, "grad_norm": 0.13899413539068264, "learning_rate": 0.0007576820748534875, "loss": 0.8763, "step": 645 }, { "epoch": 0.7965686274509803, "grad_norm": 0.11601000036057739, "learning_rate": 0.0007530825083550073, "loss": 0.887, "step": 650 }, { "epoch": 0.8026960784313726, "grad_norm": 0.10442455243797781, "learning_rate": 0.0007484539653763815, "loss": 0.8751, "step": 655 }, { "epoch": 0.8088235294117647, "grad_norm": 0.12520557374066849, "learning_rate": 0.0007437969758589507, "loss": 0.8673, "step": 660 }, { "epoch": 0.8149509803921569, "grad_norm": 0.11448612465079618, "learning_rate": 0.0007391120730010193, "loss": 0.8694, "step": 665 }, { "epoch": 0.821078431372549, "grad_norm": 0.13153196077249837, "learning_rate": 0.0007343997931968067, "loss": 0.87, "step": 670 }, { "epoch": 0.8272058823529411, "grad_norm": 0.11840806919932775, "learning_rate": 0.0007296606759750351, "loss": 0.8672, "step": 675 }, { "epoch": 0.8333333333333334, "grad_norm": 0.09757556694809401, "learning_rate": 0.0007248952639371542, "loss": 0.8676, "step": 680 }, { "epoch": 0.8394607843137255, "grad_norm": 0.09022546372550531, "learning_rate": 0.0007201041026952188, "loss": 0.8664, "step": 685 }, { "epoch": 0.8455882352941176, "grad_norm": 0.10331609924034306, "learning_rate": 0.0007152877408094178, "loss": 0.8616, "step": 690 }, { "epoch": 0.8517156862745098, "grad_norm": 0.12505516849755346, "learning_rate": 0.0007104467297252677, "loss": 0.8652, "step": 695 }, { "epoch": 0.8578431372549019, "grad_norm": 0.09465790589490469, "learning_rate": 0.0007055816237104753, "loss": 0.8699, "step": 700 }, { "epoch": 0.8639705882352942, "grad_norm": 0.12150502920638975, "learning_rate": 0.0007006929797914775, "loss": 0.8597, "step": 705 }, { "epoch": 0.8700980392156863, "grad_norm": 0.10318640070426116, "learning_rate": 0.0006957813576896647, "loss": 0.8603, "step": 710 }, { "epoch": 0.8762254901960784, "grad_norm": 0.11768219039546857, "learning_rate": 0.000690847319757296, "loss": 0.8653, "step": 715 }, { "epoch": 0.8823529411764706, "grad_norm": 0.13084092032465588, "learning_rate": 0.000685891430913113, "loss": 0.8599, "step": 720 }, { "epoch": 0.8884803921568627, "grad_norm": 0.1378907468788972, "learning_rate": 0.0006809142585776604, "loss": 0.8625, "step": 725 }, { "epoch": 0.8946078431372549, "grad_norm": 0.1137277604032764, "learning_rate": 0.0006759163726083191, "loss": 0.8626, "step": 730 }, { "epoch": 0.9007352941176471, "grad_norm": 0.10001539507212627, "learning_rate": 0.0006708983452340609, "loss": 0.849, "step": 735 }, { "epoch": 0.9068627450980392, "grad_norm": 0.10484673231574948, "learning_rate": 0.0006658607509899319, "loss": 0.8682, "step": 740 }, { "epoch": 0.9129901960784313, "grad_norm": 0.15431021500080286, "learning_rate": 0.0006608041666512712, "loss": 0.8645, "step": 745 }, { "epoch": 0.9191176470588235, "grad_norm": 0.11219846062082936, "learning_rate": 0.0006557291711676738, "loss": 0.8541, "step": 750 }, { "epoch": 0.9252450980392157, "grad_norm": 0.11968297539967529, "learning_rate": 0.0006506363455967037, "loss": 0.8645, "step": 755 }, { "epoch": 0.9313725490196079, "grad_norm": 0.1421862868626235, "learning_rate": 0.0006455262730373672, "loss": 0.8628, "step": 760 }, { "epoch": 0.9375, "grad_norm": 0.09419342625695648, "learning_rate": 0.0006403995385633503, "loss": 0.859, "step": 765 }, { "epoch": 0.9436274509803921, "grad_norm": 0.10436946711990462, "learning_rate": 0.0006352567291560318, "loss": 0.8564, "step": 770 }, { "epoch": 0.9497549019607843, "grad_norm": 0.11378998198564375, "learning_rate": 0.0006300984336372771, "loss": 0.8552, "step": 775 }, { "epoch": 0.9558823529411765, "grad_norm": 0.11204753073504012, "learning_rate": 0.0006249252426020216, "loss": 0.8567, "step": 780 }, { "epoch": 0.9620098039215687, "grad_norm": 0.13969543336270682, "learning_rate": 0.000619737748350651, "loss": 0.8521, "step": 785 }, { "epoch": 0.9681372549019608, "grad_norm": 0.10178000324550646, "learning_rate": 0.0006145365448211866, "loss": 0.849, "step": 790 }, { "epoch": 0.9742647058823529, "grad_norm": 0.28123512574152176, "learning_rate": 0.0006093222275212822, "loss": 0.8539, "step": 795 }, { "epoch": 0.9803921568627451, "grad_norm": 0.11687237794199178, "learning_rate": 0.0006040953934600423, "loss": 0.8466, "step": 800 }, { "epoch": 0.9865196078431373, "grad_norm": 0.08664193399778343, "learning_rate": 0.0005988566410796687, "loss": 0.8408, "step": 805 }, { "epoch": 0.9926470588235294, "grad_norm": 0.1332025987779646, "learning_rate": 0.0005936065701869403, "loss": 0.8545, "step": 810 }, { "epoch": 0.9987745098039216, "grad_norm": 2.6257846647757304, "learning_rate": 0.0005883457818845414, "loss": 0.8575, "step": 815 }, { "epoch": 1.0, "eval_loss": 1.2250986099243164, "eval_runtime": 111.2787, "eval_samples_per_second": 188.185, "eval_steps_per_second": 5.886, "step": 816 }, { "epoch": 1.0049019607843137, "grad_norm": 1.2727033946666415, "learning_rate": 0.0005830748785022368, "loss": 0.7791, "step": 820 }, { "epoch": 1.0110294117647058, "grad_norm": 0.1911261436318345, "learning_rate": 0.0005777944635279099, "loss": 0.7643, "step": 825 }, { "epoch": 1.017156862745098, "grad_norm": 0.1586922694379843, "learning_rate": 0.0005725051415384657, "loss": 0.7516, "step": 830 }, { "epoch": 1.0232843137254901, "grad_norm": 0.11186631342494315, "learning_rate": 0.0005672075181306108, "loss": 0.7526, "step": 835 }, { "epoch": 1.0294117647058822, "grad_norm": 0.13268928121380016, "learning_rate": 0.0005619021998515165, "loss": 0.7699, "step": 840 }, { "epoch": 1.0355392156862746, "grad_norm": 0.11229206406982119, "learning_rate": 0.0005565897941293721, "loss": 0.7813, "step": 845 }, { "epoch": 1.0416666666666667, "grad_norm": 0.12012493868077331, "learning_rate": 0.000551270909203838, "loss": 0.7606, "step": 850 }, { "epoch": 1.0477941176470589, "grad_norm": 0.11528601274141542, "learning_rate": 0.0005459461540564057, "loss": 0.7597, "step": 855 }, { "epoch": 1.053921568627451, "grad_norm": 0.10690663798544056, "learning_rate": 0.0005406161383406731, "loss": 0.7595, "step": 860 }, { "epoch": 1.0600490196078431, "grad_norm": 0.13779006322984594, "learning_rate": 0.000535281472312543, "loss": 0.7604, "step": 865 }, { "epoch": 1.0661764705882353, "grad_norm": 0.1286321993738431, "learning_rate": 0.0005299427667603515, "loss": 0.7591, "step": 870 }, { "epoch": 1.0723039215686274, "grad_norm": 0.11691581398274645, "learning_rate": 0.0005246006329349376, "loss": 0.7539, "step": 875 }, { "epoch": 1.0784313725490196, "grad_norm": 0.1359966051449655, "learning_rate": 0.0005192556824796568, "loss": 0.7478, "step": 880 }, { "epoch": 1.0845588235294117, "grad_norm": 0.11784308765572785, "learning_rate": 0.0005139085273603527, "loss": 0.7526, "step": 885 }, { "epoch": 1.0906862745098038, "grad_norm": 0.08243234125547984, "learning_rate": 0.0005085597797952905, "loss": 0.7503, "step": 890 }, { "epoch": 1.0968137254901962, "grad_norm": 0.10687199207970824, "learning_rate": 0.0005032100521850608, "loss": 0.7639, "step": 895 }, { "epoch": 1.1029411764705883, "grad_norm": 0.09754168316674887, "learning_rate": 0.0004978599570424639, "loss": 0.7648, "step": 900 }, { "epoch": 1.1090686274509804, "grad_norm": 0.09597242384268918, "learning_rate": 0.0004925101069223802, "loss": 0.7618, "step": 905 }, { "epoch": 1.1151960784313726, "grad_norm": 0.16042928535903608, "learning_rate": 0.0004871611143516367, "loss": 0.7488, "step": 910 }, { "epoch": 1.1213235294117647, "grad_norm": 0.11558221571064126, "learning_rate": 0.00048181359175887594, "loss": 0.758, "step": 915 }, { "epoch": 1.1274509803921569, "grad_norm": 0.10187875911094818, "learning_rate": 0.0004764681514044362, "loss": 0.7548, "step": 920 }, { "epoch": 1.133578431372549, "grad_norm": 0.10845733289411896, "learning_rate": 0.0004711254053102521, "loss": 0.7447, "step": 925 }, { "epoch": 1.1397058823529411, "grad_norm": 0.12423589225823602, "learning_rate": 0.0004657859651897806, "loss": 0.7567, "step": 930 }, { "epoch": 1.1458333333333333, "grad_norm": 0.101823111875683, "learning_rate": 0.0004604504423779639, "loss": 0.7496, "step": 935 }, { "epoch": 1.1519607843137254, "grad_norm": 0.1490251916250933, "learning_rate": 0.00045511944776123513, "loss": 0.7476, "step": 940 }, { "epoch": 1.1580882352941178, "grad_norm": 0.0909899046261831, "learning_rate": 0.00044979359170757555, "loss": 0.7557, "step": 945 }, { "epoch": 1.1642156862745099, "grad_norm": 0.10504026034830631, "learning_rate": 0.00044447348399663056, "loss": 0.7551, "step": 950 }, { "epoch": 1.170343137254902, "grad_norm": 0.10154008572100809, "learning_rate": 0.00043915973374989326, "loss": 0.7553, "step": 955 }, { "epoch": 1.1764705882352942, "grad_norm": 0.0948668753989249, "learning_rate": 0.0004338529493609647, "loss": 0.7529, "step": 960 }, { "epoch": 1.1825980392156863, "grad_norm": 0.12214475449625242, "learning_rate": 0.0004285537384258951, "loss": 0.7544, "step": 965 }, { "epoch": 1.1887254901960784, "grad_norm": 0.10021333779503093, "learning_rate": 0.00042326270767361815, "loss": 0.7561, "step": 970 }, { "epoch": 1.1948529411764706, "grad_norm": 0.10089690279960518, "learning_rate": 0.0004179804628964839, "loss": 0.7473, "step": 975 }, { "epoch": 1.2009803921568627, "grad_norm": 0.09480462930649244, "learning_rate": 0.00041270760888089997, "loss": 0.7543, "step": 980 }, { "epoch": 1.2071078431372548, "grad_norm": 0.13593486404780272, "learning_rate": 0.000407444749338085, "loss": 0.7447, "step": 985 }, { "epoch": 1.213235294117647, "grad_norm": 0.09373417852720169, "learning_rate": 0.00040219248683494925, "loss": 0.7516, "step": 990 }, { "epoch": 1.219362745098039, "grad_norm": 0.09327325503229338, "learning_rate": 0.00039695142272510334, "loss": 0.7443, "step": 995 }, { "epoch": 1.2254901960784315, "grad_norm": 0.10034654814592268, "learning_rate": 0.0003917221570800065, "loss": 0.7475, "step": 1000 }, { "epoch": 1.2316176470588236, "grad_norm": 0.09837253795265176, "learning_rate": 0.0003865052886202621, "loss": 0.7438, "step": 1005 }, { "epoch": 1.2377450980392157, "grad_norm": 0.09051967236076873, "learning_rate": 0.000381301414647068, "loss": 0.7537, "step": 1010 }, { "epoch": 1.2438725490196079, "grad_norm": 0.08928603737077674, "learning_rate": 0.0003761111309738285, "loss": 0.7372, "step": 1015 }, { "epoch": 1.25, "grad_norm": 0.10153157943466443, "learning_rate": 0.0003709350318579371, "loss": 0.748, "step": 1020 }, { "epoch": 1.2561274509803921, "grad_norm": 0.09687550081111064, "learning_rate": 0.0003657737099327378, "loss": 0.7445, "step": 1025 }, { "epoch": 1.2622549019607843, "grad_norm": 0.08345067550046266, "learning_rate": 0.0003606277561396726, "loss": 0.7459, "step": 1030 }, { "epoch": 1.2683823529411764, "grad_norm": 0.09822605701018818, "learning_rate": 0.0003554977596606203, "loss": 0.7473, "step": 1035 }, { "epoch": 1.2745098039215685, "grad_norm": 0.08897185232535443, "learning_rate": 0.00035038430785044053, "loss": 0.7485, "step": 1040 }, { "epoch": 1.280637254901961, "grad_norm": 0.09625591066370137, "learning_rate": 0.00034528798616972434, "loss": 0.739, "step": 1045 }, { "epoch": 1.2867647058823528, "grad_norm": 0.09405754837547836, "learning_rate": 0.00034020937811776156, "loss": 0.7558, "step": 1050 }, { "epoch": 1.2928921568627452, "grad_norm": 0.10283887592029052, "learning_rate": 0.0003351490651657347, "loss": 0.7576, "step": 1055 }, { "epoch": 1.2990196078431373, "grad_norm": 0.09668902027134954, "learning_rate": 0.00033010762669014347, "loss": 0.7339, "step": 1060 }, { "epoch": 1.3051470588235294, "grad_norm": 0.09491296282202684, "learning_rate": 0.00032508563990646925, "loss": 0.74, "step": 1065 }, { "epoch": 1.3112745098039216, "grad_norm": 0.08614797698276408, "learning_rate": 0.00032008367980308734, "loss": 0.7491, "step": 1070 }, { "epoch": 1.3174019607843137, "grad_norm": 0.11385248252735118, "learning_rate": 0.0003151023190754343, "loss": 0.7424, "step": 1075 }, { "epoch": 1.3235294117647058, "grad_norm": 0.09914180185658039, "learning_rate": 0.0003101421280604379, "loss": 0.7386, "step": 1080 }, { "epoch": 1.329656862745098, "grad_norm": 0.23046737698236164, "learning_rate": 0.000305203674671216, "loss": 0.7429, "step": 1085 }, { "epoch": 1.3357843137254901, "grad_norm": 0.08999049028390338, "learning_rate": 0.00030028752433205476, "loss": 0.7504, "step": 1090 }, { "epoch": 1.3419117647058822, "grad_norm": 0.08798822285342148, "learning_rate": 0.0002953942399136702, "loss": 0.7475, "step": 1095 }, { "epoch": 1.3480392156862746, "grad_norm": 0.11440187719573272, "learning_rate": 0.00029052438166876307, "loss": 0.745, "step": 1100 }, { "epoch": 1.3541666666666667, "grad_norm": 0.09680996894090088, "learning_rate": 0.00028567850716787257, "loss": 0.7493, "step": 1105 }, { "epoch": 1.3602941176470589, "grad_norm": 0.08285996425508975, "learning_rate": 0.0002808571712355389, "loss": 0.7503, "step": 1110 }, { "epoch": 1.366421568627451, "grad_norm": 0.09255457708748827, "learning_rate": 0.0002760609258867784, "loss": 0.7318, "step": 1115 }, { "epoch": 1.3725490196078431, "grad_norm": 0.09572569889952495, "learning_rate": 0.00027129032026388045, "loss": 0.7348, "step": 1120 }, { "epoch": 1.3786764705882353, "grad_norm": 0.0909286336173143, "learning_rate": 0.00026654590057353467, "loss": 0.7403, "step": 1125 }, { "epoch": 1.3848039215686274, "grad_norm": 0.08684037452287134, "learning_rate": 0.00026182821002429345, "loss": 0.7492, "step": 1130 }, { "epoch": 1.3909313725490196, "grad_norm": 0.09039702206887036, "learning_rate": 0.00025713778876437744, "loss": 0.7271, "step": 1135 }, { "epoch": 1.3970588235294117, "grad_norm": 0.10259305111827258, "learning_rate": 0.00025247517381983136, "loss": 0.7334, "step": 1140 }, { "epoch": 1.403186274509804, "grad_norm": 0.09305514839850722, "learning_rate": 0.00024784089903303854, "loss": 0.7342, "step": 1145 }, { "epoch": 1.409313725490196, "grad_norm": 0.09362025462459474, "learning_rate": 0.00024323549500159802, "loss": 0.7287, "step": 1150 }, { "epoch": 1.4154411764705883, "grad_norm": 0.09297020814808991, "learning_rate": 0.0002386594890175749, "loss": 0.7424, "step": 1155 }, { "epoch": 1.4215686274509804, "grad_norm": 0.08353530769016755, "learning_rate": 0.0002341134050071283, "loss": 0.7485, "step": 1160 }, { "epoch": 1.4276960784313726, "grad_norm": 0.08590929777722105, "learning_rate": 0.00022959776347052509, "loss": 0.7347, "step": 1165 }, { "epoch": 1.4338235294117647, "grad_norm": 0.09926401747517187, "learning_rate": 0.00022511308142254488, "loss": 0.7529, "step": 1170 }, { "epoch": 1.4399509803921569, "grad_norm": 0.08954546869195683, "learning_rate": 0.00022065987233328528, "loss": 0.741, "step": 1175 }, { "epoch": 1.446078431372549, "grad_norm": 0.08807524417797259, "learning_rate": 0.000216238646069373, "loss": 0.7409, "step": 1180 }, { "epoch": 1.4522058823529411, "grad_norm": 0.09243225975101937, "learning_rate": 0.00021184990883558658, "loss": 0.7358, "step": 1185 }, { "epoch": 1.4583333333333333, "grad_norm": 0.09464124110688574, "learning_rate": 0.00020749416311689845, "loss": 0.7346, "step": 1190 }, { "epoch": 1.4644607843137254, "grad_norm": 0.09316779183905147, "learning_rate": 0.0002031719076209445, "loss": 0.7313, "step": 1195 }, { "epoch": 1.4705882352941178, "grad_norm": 0.09732126328149814, "learning_rate": 0.00019888363722092372, "loss": 0.7341, "step": 1200 }, { "epoch": 1.4767156862745099, "grad_norm": 0.08886419656539418, "learning_rate": 0.0001946298428989386, "loss": 0.7375, "step": 1205 }, { "epoch": 1.482843137254902, "grad_norm": 0.07850103787876181, "learning_rate": 0.00019041101168978093, "loss": 0.7287, "step": 1210 }, { "epoch": 1.4889705882352942, "grad_norm": 0.08830903517712305, "learning_rate": 0.00018622762662516868, "loss": 0.735, "step": 1215 }, { "epoch": 1.4950980392156863, "grad_norm": 0.08702420978835, "learning_rate": 0.00018208016667844152, "loss": 0.7393, "step": 1220 }, { "epoch": 1.5012254901960784, "grad_norm": 0.08438557069126026, "learning_rate": 0.00017796910670972132, "loss": 0.7423, "step": 1225 }, { "epoch": 1.5073529411764706, "grad_norm": 0.0874124625381899, "learning_rate": 0.00017389491741154372, "loss": 0.7417, "step": 1230 }, { "epoch": 1.5134803921568627, "grad_norm": 0.08983974571361636, "learning_rate": 0.0001698580652549665, "loss": 0.7284, "step": 1235 }, { "epoch": 1.5196078431372548, "grad_norm": 0.09655404504961941, "learning_rate": 0.00016585901243616042, "loss": 0.732, "step": 1240 }, { "epoch": 1.5257352941176472, "grad_norm": 0.08637672179778699, "learning_rate": 0.00016189821682349205, "loss": 0.7293, "step": 1245 }, { "epoch": 1.531862745098039, "grad_norm": 0.08546099516549688, "learning_rate": 0.0001579761319050991, "loss": 0.7356, "step": 1250 }, { "epoch": 1.5379901960784315, "grad_norm": 0.07907834707353141, "learning_rate": 0.00015409320673696902, "loss": 0.731, "step": 1255 }, { "epoch": 1.5441176470588234, "grad_norm": 0.08633766548134404, "learning_rate": 0.00015024988589152537, "loss": 0.7254, "step": 1260 }, { "epoch": 1.5502450980392157, "grad_norm": 0.09255543384492604, "learning_rate": 0.00014644660940672628, "loss": 0.7352, "step": 1265 }, { "epoch": 1.5563725490196079, "grad_norm": 0.07948954260097911, "learning_rate": 0.0001426838127356823, "loss": 0.7281, "step": 1270 }, { "epoch": 1.5625, "grad_norm": 0.0858902101428744, "learning_rate": 0.0001389619266968002, "loss": 0.7404, "step": 1275 }, { "epoch": 1.5686274509803921, "grad_norm": 0.08099715928187635, "learning_rate": 0.0001352813774244565, "loss": 0.729, "step": 1280 }, { "epoch": 1.5747549019607843, "grad_norm": 0.08675937769241077, "learning_rate": 0.0001316425863202078, "loss": 0.7289, "step": 1285 }, { "epoch": 1.5808823529411766, "grad_norm": 0.08712801365461889, "learning_rate": 0.00012804597000454215, "loss": 0.7368, "step": 1290 }, { "epoch": 1.5870098039215685, "grad_norm": 0.09259317971708025, "learning_rate": 0.00012449194026917883, "loss": 0.7254, "step": 1295 }, { "epoch": 1.593137254901961, "grad_norm": 0.0821781596676491, "learning_rate": 0.00012098090402992085, "loss": 0.7307, "step": 1300 }, { "epoch": 1.5992647058823528, "grad_norm": 0.08514490094990651, "learning_rate": 0.00011751326328006473, "loss": 0.7226, "step": 1305 }, { "epoch": 1.6053921568627452, "grad_norm": 0.08798702323560283, "learning_rate": 0.00011408941504437532, "loss": 0.7274, "step": 1310 }, { "epoch": 1.6115196078431373, "grad_norm": 0.08477334907464082, "learning_rate": 0.00011070975133362842, "loss": 0.7351, "step": 1315 }, { "epoch": 1.6176470588235294, "grad_norm": 0.08440769123912943, "learning_rate": 0.00010737465909972776, "loss": 0.7322, "step": 1320 }, { "epoch": 1.6237745098039216, "grad_norm": 0.08816338400851521, "learning_rate": 0.00010408452019140119, "loss": 0.7257, "step": 1325 }, { "epoch": 1.6299019607843137, "grad_norm": 0.0889764540813042, "learning_rate": 0.00010083971131048159, "loss": 0.7285, "step": 1330 }, { "epoch": 1.6360294117647058, "grad_norm": 0.08514875374173311, "learning_rate": 9.764060396877661e-05, "loss": 0.7323, "step": 1335 }, { "epoch": 1.642156862745098, "grad_norm": 0.08728092386538332, "learning_rate": 9.448756444553224e-05, "loss": 0.7256, "step": 1340 }, { "epoch": 1.6482843137254903, "grad_norm": 0.08430901480786084, "learning_rate": 9.138095374549633e-05, "loss": 0.7278, "step": 1345 }, { "epoch": 1.6544117647058822, "grad_norm": 0.08407206279064601, "learning_rate": 8.832112755758598e-05, "loss": 0.7232, "step": 1350 }, { "epoch": 1.6605392156862746, "grad_norm": 0.09172008766801877, "learning_rate": 8.530843621416234e-05, "loss": 0.7262, "step": 1355 }, { "epoch": 1.6666666666666665, "grad_norm": 0.088740049378598, "learning_rate": 8.234322465092047e-05, "loss": 0.7294, "step": 1360 }, { "epoch": 1.6727941176470589, "grad_norm": 0.08733287037829676, "learning_rate": 7.942583236739581e-05, "loss": 0.7326, "step": 1365 }, { "epoch": 1.678921568627451, "grad_norm": 0.08946029254025908, "learning_rate": 7.655659338809329e-05, "loss": 0.7302, "step": 1370 }, { "epoch": 1.6850490196078431, "grad_norm": 0.08125225656437084, "learning_rate": 7.373583622424358e-05, "loss": 0.7243, "step": 1375 }, { "epoch": 1.6911764705882353, "grad_norm": 0.08262402915263288, "learning_rate": 7.096388383619079e-05, "loss": 0.722, "step": 1380 }, { "epoch": 1.6973039215686274, "grad_norm": 0.0983748828611407, "learning_rate": 6.824105359641513e-05, "loss": 0.7224, "step": 1385 }, { "epoch": 1.7034313725490198, "grad_norm": 0.09511486159825465, "learning_rate": 6.556765725319525e-05, "loss": 0.7353, "step": 1390 }, { "epoch": 1.7095588235294117, "grad_norm": 0.08406547135240365, "learning_rate": 6.294400089491526e-05, "loss": 0.7249, "step": 1395 }, { "epoch": 1.715686274509804, "grad_norm": 0.09349436979698456, "learning_rate": 6.037038491501978e-05, "loss": 0.7199, "step": 1400 }, { "epoch": 1.721813725490196, "grad_norm": 0.0859858185739073, "learning_rate": 5.7847103977619555e-05, "loss": 0.7231, "step": 1405 }, { "epoch": 1.7279411764705883, "grad_norm": 0.08083261397597263, "learning_rate": 5.53744469837551e-05, "loss": 0.7331, "step": 1410 }, { "epoch": 1.7340686274509802, "grad_norm": 0.08197105924202848, "learning_rate": 5.295269703831901e-05, "loss": 0.725, "step": 1415 }, { "epoch": 1.7401960784313726, "grad_norm": 0.07809550724924257, "learning_rate": 5.058213141764151e-05, "loss": 0.718, "step": 1420 }, { "epoch": 1.7463235294117647, "grad_norm": 0.08030689228502133, "learning_rate": 4.826302153774448e-05, "loss": 0.7171, "step": 1425 }, { "epoch": 1.7524509803921569, "grad_norm": 0.08468169044962492, "learning_rate": 4.599563292326592e-05, "loss": 0.7267, "step": 1430 }, { "epoch": 1.758578431372549, "grad_norm": 0.0820979728898519, "learning_rate": 4.3780225177058766e-05, "loss": 0.7166, "step": 1435 }, { "epoch": 1.7647058823529411, "grad_norm": 0.07927753918146413, "learning_rate": 4.161705195046761e-05, "loss": 0.718, "step": 1440 }, { "epoch": 1.7708333333333335, "grad_norm": 0.0840926492467221, "learning_rate": 3.9506360914287386e-05, "loss": 0.728, "step": 1445 }, { "epoch": 1.7769607843137254, "grad_norm": 0.08436514170292973, "learning_rate": 3.744839373040682e-05, "loss": 0.7214, "step": 1450 }, { "epoch": 1.7830882352941178, "grad_norm": 0.08999651739438058, "learning_rate": 3.5443386024138605e-05, "loss": 0.7296, "step": 1455 }, { "epoch": 1.7892156862745097, "grad_norm": 0.08092101714415846, "learning_rate": 3.349156735724274e-05, "loss": 0.7196, "step": 1460 }, { "epoch": 1.795343137254902, "grad_norm": 0.08061728201126263, "learning_rate": 3.1593161201642354e-05, "loss": 0.7284, "step": 1465 }, { "epoch": 1.8014705882352942, "grad_norm": 0.07834911210493091, "learning_rate": 2.9748384913837522e-05, "loss": 0.7325, "step": 1470 }, { "epoch": 1.8075980392156863, "grad_norm": 0.0790057426287298, "learning_rate": 2.7957449710019512e-05, "loss": 0.7286, "step": 1475 }, { "epoch": 1.8137254901960784, "grad_norm": 0.07695722244990424, "learning_rate": 2.622056064188738e-05, "loss": 0.7276, "step": 1480 }, { "epoch": 1.8198529411764706, "grad_norm": 0.09150941908302032, "learning_rate": 2.4537916573171337e-05, "loss": 0.7239, "step": 1485 }, { "epoch": 1.8259803921568627, "grad_norm": 0.09154350521364844, "learning_rate": 2.2909710156863274e-05, "loss": 0.7312, "step": 1490 }, { "epoch": 1.8321078431372548, "grad_norm": 0.08002699190491802, "learning_rate": 2.1336127813159355e-05, "loss": 0.7254, "step": 1495 }, { "epoch": 1.8382352941176472, "grad_norm": 0.08021887418189992, "learning_rate": 1.981734970811644e-05, "loss": 0.7222, "step": 1500 }, { "epoch": 1.844362745098039, "grad_norm": 0.08930992030073814, "learning_rate": 1.8353549733023333e-05, "loss": 0.7383, "step": 1505 }, { "epoch": 1.8504901960784315, "grad_norm": 0.08303320455844895, "learning_rate": 1.6944895484492072e-05, "loss": 0.7133, "step": 1510 }, { "epoch": 1.8566176470588234, "grad_norm": 0.08729988838797742, "learning_rate": 1.5591548245268428e-05, "loss": 0.7315, "step": 1515 }, { "epoch": 1.8627450980392157, "grad_norm": 0.08890946616187734, "learning_rate": 1.429366296576623e-05, "loss": 0.7197, "step": 1520 }, { "epoch": 1.8688725490196079, "grad_norm": 0.08151959163399713, "learning_rate": 1.30513882463264e-05, "loss": 0.7263, "step": 1525 }, { "epoch": 1.875, "grad_norm": 0.09562764992899297, "learning_rate": 1.1864866320203115e-05, "loss": 0.7188, "step": 1530 }, { "epoch": 1.8811274509803921, "grad_norm": 0.07846982174735893, "learning_rate": 1.073423303727894e-05, "loss": 0.7237, "step": 1535 }, { "epoch": 1.8872549019607843, "grad_norm": 0.07663001458145507, "learning_rate": 9.659617848510882e-06, "loss": 0.7252, "step": 1540 }, { "epoch": 1.8933823529411766, "grad_norm": 0.08080118998190498, "learning_rate": 8.64114379110853e-06, "loss": 0.7112, "step": 1545 }, { "epoch": 1.8995098039215685, "grad_norm": 0.08484892902062172, "learning_rate": 7.678927474447817e-06, "loss": 0.7264, "step": 1550 }, { "epoch": 1.905637254901961, "grad_norm": 0.08516868212628753, "learning_rate": 6.77307906671909e-06, "loss": 0.7271, "step": 1555 }, { "epoch": 1.9117647058823528, "grad_norm": 0.08133640776409674, "learning_rate": 5.923702282314092e-06, "loss": 0.7324, "step": 1560 }, { "epoch": 1.9178921568627452, "grad_norm": 0.08097758322239303, "learning_rate": 5.130894369951011e-06, "loss": 0.7273, "step": 1565 }, { "epoch": 1.9240196078431373, "grad_norm": 0.08298253969927805, "learning_rate": 4.394746101540115e-06, "loss": 0.7298, "step": 1570 }, { "epoch": 1.9301470588235294, "grad_norm": 0.0776674873026092, "learning_rate": 3.7153417617907802e-06, "loss": 0.7199, "step": 1575 }, { "epoch": 1.9362745098039216, "grad_norm": 0.08136684261699478, "learning_rate": 3.092759138561607e-06, "loss": 0.727, "step": 1580 }, { "epoch": 1.9424019607843137, "grad_norm": 0.21064147236672387, "learning_rate": 2.5270695139539833e-06, "loss": 0.7305, "step": 1585 }, { "epoch": 1.9485294117647058, "grad_norm": 0.07856752844029326, "learning_rate": 2.018337656150726e-06, "loss": 0.7259, "step": 1590 }, { "epoch": 1.954656862745098, "grad_norm": 0.07818237444191369, "learning_rate": 1.5666218120005682e-06, "loss": 0.724, "step": 1595 }, { "epoch": 1.9607843137254903, "grad_norm": 0.07498630792024302, "learning_rate": 1.1719737003492159e-06, "loss": 0.7205, "step": 1600 }, { "epoch": 1.9669117647058822, "grad_norm": 0.07974040781241311, "learning_rate": 8.344385061176962e-07, "loss": 0.7228, "step": 1605 }, { "epoch": 1.9730392156862746, "grad_norm": 0.08674324039372262, "learning_rate": 5.540548751292173e-07, "loss": 0.7364, "step": 1610 }, { "epoch": 1.9791666666666665, "grad_norm": 0.08126471002081247, "learning_rate": 3.3085490968409737e-07, "loss": 0.7232, "step": 1615 }, { "epoch": 1.9852941176470589, "grad_norm": 0.08409713343077478, "learning_rate": 1.6486416488459277e-07, "loss": 0.7251, "step": 1620 }, { "epoch": 1.991421568627451, "grad_norm": 0.08609959331430181, "learning_rate": 5.6101645708850346e-08, "loss": 0.7326, "step": 1625 }, { "epoch": 1.9975490196078431, "grad_norm": 0.0748443586344328, "learning_rate": 4.579804834703438e-09, "loss": 0.7238, "step": 1630 }, { "epoch": 2.0, "eval_loss": 1.1972506046295166, "eval_runtime": 112.8302, "eval_samples_per_second": 185.598, "eval_steps_per_second": 5.805, "step": 1632 }, { "epoch": 2.0, "step": 1632, "total_flos": 160150899916800.0, "train_loss": 0.8347952699690473, "train_runtime": 3713.7533, "train_samples_per_second": 56.241, "train_steps_per_second": 0.439 } ], "logging_steps": 5, "max_steps": 1632, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 160150899916800.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }