{
  "best_metric": 1.9664931297302246,
  "best_model_checkpoint": "./lora_bn_resume/checkpoint-3000",
  "epoch": 1.9292604501607717,
  "eval_steps": 200,
  "global_step": 3000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.006430868167202572,
      "grad_norm": 0.7529953718185425,
      "learning_rate": 2.9999999999999997e-05,
      "loss": 2.01,
      "step": 10
    },
    {
      "epoch": 0.012861736334405145,
      "grad_norm": 0.8143910765647888,
      "learning_rate": 5.9999999999999995e-05,
      "loss": 1.9794,
      "step": 20
    },
    {
      "epoch": 0.01929260450160772,
      "grad_norm": 0.7554563283920288,
      "learning_rate": 8.999999999999999e-05,
      "loss": 1.9687,
      "step": 30
    },
    {
      "epoch": 0.02572347266881029,
      "grad_norm": 0.701172411441803,
      "learning_rate": 0.00011999999999999999,
      "loss": 2.0374,
      "step": 40
    },
    {
      "epoch": 0.03215434083601286,
      "grad_norm": 0.7426002621650696,
      "learning_rate": 0.00015,
      "loss": 1.8484,
      "step": 50
    },
    {
      "epoch": 0.03858520900321544,
      "grad_norm": 0.7900332808494568,
      "learning_rate": 0.00017999999999999998,
      "loss": 1.91,
      "step": 60
    },
    {
      "epoch": 0.04501607717041801,
      "grad_norm": 0.7825136184692383,
      "learning_rate": 0.00020999999999999998,
      "loss": 1.9625,
      "step": 70
    },
    {
      "epoch": 0.05144694533762058,
      "grad_norm": 0.9338003993034363,
      "learning_rate": 0.00023999999999999998,
      "loss": 1.9668,
      "step": 80
    },
    {
      "epoch": 0.05787781350482315,
      "grad_norm": 0.8660485148429871,
      "learning_rate": 0.00027,
      "loss": 2.0447,
      "step": 90
    },
    {
      "epoch": 0.06430868167202572,
      "grad_norm": 0.8631746768951416,
      "learning_rate": 0.0003,
      "loss": 2.0347,
      "step": 100
    },
    {
      "epoch": 0.0707395498392283,
      "grad_norm": 0.9202760457992554,
      "learning_rate": 0.00029934282584884994,
      "loss": 2.0218,
      "step": 110
    },
    {
      "epoch": 0.07717041800643087,
      "grad_norm": 0.8508992791175842,
      "learning_rate": 0.00029868565169769985,
      "loss": 1.9808,
      "step": 120
    },
    {
      "epoch": 0.08360128617363344,
      "grad_norm": 0.9962050914764404,
      "learning_rate": 0.0002980284775465498,
      "loss": 1.9586,
      "step": 130
    },
    {
      "epoch": 0.09003215434083602,
      "grad_norm": 0.9159810543060303,
      "learning_rate": 0.00029737130339539973,
      "loss": 2.0257,
      "step": 140
    },
    {
      "epoch": 0.09646302250803858,
      "grad_norm": 0.8135138750076294,
      "learning_rate": 0.0002967141292442497,
      "loss": 2.0103,
      "step": 150
    },
    {
      "epoch": 0.10289389067524116,
      "grad_norm": 0.7933633327484131,
      "learning_rate": 0.00029605695509309966,
      "loss": 2.028,
      "step": 160
    },
    {
      "epoch": 0.10932475884244373,
      "grad_norm": 0.9258368611335754,
      "learning_rate": 0.00029539978094194957,
      "loss": 2.0654,
      "step": 170
    },
    {
      "epoch": 0.1157556270096463,
      "grad_norm": 0.8758969902992249,
      "learning_rate": 0.00029474260679079954,
      "loss": 1.9928,
      "step": 180
    },
    {
      "epoch": 0.12218649517684887,
      "grad_norm": 0.8316165804862976,
      "learning_rate": 0.00029408543263964945,
      "loss": 1.9748,
      "step": 190
    },
    {
      "epoch": 0.12861736334405144,
      "grad_norm": 0.8353763222694397,
      "learning_rate": 0.0002934282584884994,
      "loss": 2.0167,
      "step": 200
    },
    {
      "epoch": 0.12861736334405144,
      "eval_loss": 2.0699551105499268,
      "eval_runtime": 131.8406,
      "eval_samples_per_second": 15.17,
      "eval_steps_per_second": 1.896,
      "step": 200
    },
    {
      "epoch": 0.13504823151125403,
      "grad_norm": 0.8024882078170776,
      "learning_rate": 0.0002927710843373494,
      "loss": 2.1039,
      "step": 210
    },
    {
      "epoch": 0.1414790996784566,
      "grad_norm": 0.861377477645874,
      "learning_rate": 0.0002921139101861993,
      "loss": 2.023,
      "step": 220
    },
    {
      "epoch": 0.14790996784565916,
      "grad_norm": 0.8247071504592896,
      "learning_rate": 0.00029145673603504926,
      "loss": 1.9341,
      "step": 230
    },
    {
      "epoch": 0.15434083601286175,
      "grad_norm": 0.8182681202888489,
      "learning_rate": 0.0002907995618838992,
      "loss": 2.0137,
      "step": 240
    },
    {
      "epoch": 0.1607717041800643,
      "grad_norm": 0.8556217551231384,
      "learning_rate": 0.00029014238773274913,
      "loss": 2.0638,
      "step": 250
    },
    {
      "epoch": 0.16720257234726688,
      "grad_norm": 0.7721512913703918,
      "learning_rate": 0.0002894852135815991,
      "loss": 2.0061,
      "step": 260
    },
    {
      "epoch": 0.17363344051446947,
      "grad_norm": 0.7948784828186035,
      "learning_rate": 0.000288828039430449,
      "loss": 1.9751,
      "step": 270
    },
    {
      "epoch": 0.18006430868167203,
      "grad_norm": 0.7582404613494873,
      "learning_rate": 0.000288170865279299,
      "loss": 2.0254,
      "step": 280
    },
    {
      "epoch": 0.1864951768488746,
      "grad_norm": 0.9620535969734192,
      "learning_rate": 0.00028751369112814894,
      "loss": 1.9978,
      "step": 290
    },
    {
      "epoch": 0.19292604501607716,
      "grad_norm": 0.7374221682548523,
      "learning_rate": 0.00028685651697699885,
      "loss": 2.0631,
      "step": 300
    },
    {
      "epoch": 0.19935691318327975,
      "grad_norm": 0.794651210308075,
      "learning_rate": 0.0002861993428258488,
      "loss": 1.9507,
      "step": 310
    },
    {
      "epoch": 0.2057877813504823,
      "grad_norm": 0.7450920939445496,
      "learning_rate": 0.00028554216867469873,
      "loss": 2.0363,
      "step": 320
    },
    {
      "epoch": 0.21221864951768488,
      "grad_norm": 0.7574348449707031,
      "learning_rate": 0.0002848849945235487,
      "loss": 2.0508,
      "step": 330
    },
    {
      "epoch": 0.21864951768488747,
      "grad_norm": 0.9118533134460449,
      "learning_rate": 0.00028422782037239866,
      "loss": 2.0118,
      "step": 340
    },
    {
      "epoch": 0.22508038585209003,
      "grad_norm": 0.8136394023895264,
      "learning_rate": 0.0002835706462212486,
      "loss": 2.1211,
      "step": 350
    },
    {
      "epoch": 0.2315112540192926,
      "grad_norm": 0.9099079966545105,
      "learning_rate": 0.00028291347207009854,
      "loss": 2.0346,
      "step": 360
    },
    {
      "epoch": 0.2379421221864952,
      "grad_norm": 0.830896258354187,
      "learning_rate": 0.0002822562979189485,
      "loss": 2.0494,
      "step": 370
    },
    {
      "epoch": 0.24437299035369775,
      "grad_norm": 0.789002001285553,
      "learning_rate": 0.0002815991237677984,
      "loss": 1.9791,
      "step": 380
    },
    {
      "epoch": 0.2508038585209003,
      "grad_norm": 0.8194644451141357,
      "learning_rate": 0.0002809419496166484,
      "loss": 2.0106,
      "step": 390
    },
    {
      "epoch": 0.2572347266881029,
      "grad_norm": 0.8226191401481628,
      "learning_rate": 0.00028028477546549835,
      "loss": 2.0268,
      "step": 400
    },
    {
      "epoch": 0.2572347266881029,
      "eval_loss": 2.057727575302124,
      "eval_runtime": 127.2637,
      "eval_samples_per_second": 15.715,
      "eval_steps_per_second": 1.964,
      "step": 400
    },
    {
      "epoch": 0.26366559485530544,
      "grad_norm": 0.796454668045044,
      "learning_rate": 0.00027962760131434826,
      "loss": 2.0376,
      "step": 410
    },
    {
      "epoch": 0.27009646302250806,
      "grad_norm": 0.8327352404594421,
      "learning_rate": 0.0002789704271631982,
      "loss": 2.0481,
      "step": 420
    },
    {
      "epoch": 0.2765273311897106,
      "grad_norm": 0.8051420450210571,
      "learning_rate": 0.0002783132530120482,
      "loss": 1.99,
      "step": 430
    },
    {
      "epoch": 0.2829581993569132,
      "grad_norm": 0.7519128322601318,
      "learning_rate": 0.0002776560788608981,
      "loss": 2.0339,
      "step": 440
    },
    {
      "epoch": 0.28938906752411575,
      "grad_norm": 0.8251495957374573,
      "learning_rate": 0.00027699890470974807,
      "loss": 2.0289,
      "step": 450
    },
    {
      "epoch": 0.2958199356913183,
      "grad_norm": 0.7058277130126953,
      "learning_rate": 0.000276341730558598,
      "loss": 2.0669,
      "step": 460
    },
    {
      "epoch": 0.3022508038585209,
      "grad_norm": 0.8475114107131958,
      "learning_rate": 0.00027568455640744795,
      "loss": 2.0506,
      "step": 470
    },
    {
      "epoch": 0.3086816720257235,
      "grad_norm": 0.7855744957923889,
      "learning_rate": 0.0002750273822562979,
      "loss": 1.97,
      "step": 480
    },
    {
      "epoch": 0.31511254019292606,
      "grad_norm": 0.727988064289093,
      "learning_rate": 0.0002743702081051478,
      "loss": 2.0705,
      "step": 490
    },
    {
      "epoch": 0.3215434083601286,
      "grad_norm": 0.7662935853004456,
      "learning_rate": 0.0002737130339539978,
      "loss": 1.9678,
      "step": 500
    },
    {
      "epoch": 0.3279742765273312,
      "grad_norm": 0.9171555638313293,
      "learning_rate": 0.00027305585980284776,
      "loss": 1.9818,
      "step": 510
    },
    {
      "epoch": 0.33440514469453375,
      "grad_norm": 0.7959179282188416,
      "learning_rate": 0.00027239868565169767,
      "loss": 2.0014,
      "step": 520
    },
    {
      "epoch": 0.3408360128617363,
      "grad_norm": 0.9359775185585022,
      "learning_rate": 0.00027174151150054763,
      "loss": 2.0244,
      "step": 530
    },
    {
      "epoch": 0.34726688102893893,
      "grad_norm": 0.7740966081619263,
      "learning_rate": 0.0002710843373493976,
      "loss": 2.0883,
      "step": 540
    },
    {
      "epoch": 0.3536977491961415,
      "grad_norm": 0.868601381778717,
      "learning_rate": 0.0002704271631982475,
      "loss": 2.0226,
      "step": 550
    },
    {
      "epoch": 0.36012861736334406,
      "grad_norm": 0.8721134662628174,
      "learning_rate": 0.0002697699890470975,
      "loss": 2.0965,
      "step": 560
    },
    {
      "epoch": 0.3665594855305466,
      "grad_norm": 0.8080394268035889,
      "learning_rate": 0.00026911281489594744,
      "loss": 2.0082,
      "step": 570
    },
    {
      "epoch": 0.3729903536977492,
      "grad_norm": 1.7169413566589355,
      "learning_rate": 0.00026845564074479735,
      "loss": 2.039,
      "step": 580
    },
    {
      "epoch": 0.37942122186495175,
      "grad_norm": 0.8220880031585693,
      "learning_rate": 0.0002677984665936473,
      "loss": 2.0696,
      "step": 590
    },
    {
      "epoch": 0.3858520900321543,
      "grad_norm": 0.7639694213867188,
      "learning_rate": 0.00026714129244249723,
      "loss": 2.0014,
      "step": 600
    },
    {
      "epoch": 0.3858520900321543,
      "eval_loss": 2.0443177223205566,
      "eval_runtime": 133.8726,
      "eval_samples_per_second": 14.94,
      "eval_steps_per_second": 1.867,
      "step": 600
    },
    {
      "epoch": 0.39228295819935693,
      "grad_norm": 0.817965567111969,
      "learning_rate": 0.0002664841182913472,
      "loss": 2.0553,
      "step": 610
    },
    {
      "epoch": 0.3987138263665595,
      "grad_norm": 0.871166467666626,
      "learning_rate": 0.00026582694414019716,
      "loss": 2.0027,
      "step": 620
    },
    {
      "epoch": 0.40514469453376206,
      "grad_norm": 0.7483948469161987,
      "learning_rate": 0.00026516976998904707,
      "loss": 2.0355,
      "step": 630
    },
    {
      "epoch": 0.4115755627009646,
      "grad_norm": 0.8223303556442261,
      "learning_rate": 0.00026451259583789704,
      "loss": 2.0076,
      "step": 640
    },
    {
      "epoch": 0.4180064308681672,
      "grad_norm": 0.80986088514328,
      "learning_rate": 0.00026385542168674695,
      "loss": 2.0781,
      "step": 650
    },
    {
      "epoch": 0.42443729903536975,
      "grad_norm": 0.7527362704277039,
      "learning_rate": 0.0002631982475355969,
      "loss": 1.9727,
      "step": 660
    },
    {
      "epoch": 0.43086816720257237,
      "grad_norm": 0.7571489810943604,
      "learning_rate": 0.0002625410733844469,
      "loss": 2.0205,
      "step": 670
    },
    {
      "epoch": 0.43729903536977494,
      "grad_norm": 0.7976600527763367,
      "learning_rate": 0.0002618838992332968,
      "loss": 2.0505,
      "step": 680
    },
    {
      "epoch": 0.4437299035369775,
      "grad_norm": 0.8057394623756409,
      "learning_rate": 0.00026122672508214676,
      "loss": 2.0351,
      "step": 690
    },
    {
      "epoch": 0.45016077170418006,
      "grad_norm": 0.8420009016990662,
      "learning_rate": 0.0002605695509309967,
      "loss": 1.9655,
      "step": 700
    },
    {
      "epoch": 0.4565916398713826,
      "grad_norm": 0.853597104549408,
      "learning_rate": 0.00025991237677984664,
      "loss": 1.9939,
      "step": 710
    },
    {
      "epoch": 0.4630225080385852,
      "grad_norm": 0.7588443160057068,
      "learning_rate": 0.0002592552026286966,
      "loss": 2.032,
      "step": 720
    },
    {
      "epoch": 0.4694533762057878,
      "grad_norm": 0.8099080920219421,
      "learning_rate": 0.0002585980284775465,
      "loss": 1.9817,
      "step": 730
    },
    {
      "epoch": 0.4758842443729904,
      "grad_norm": 0.7894070148468018,
      "learning_rate": 0.0002579408543263965,
      "loss": 2.0001,
      "step": 740
    },
    {
      "epoch": 0.48231511254019294,
      "grad_norm": 0.7474116683006287,
      "learning_rate": 0.00025728368017524644,
      "loss": 2.0077,
      "step": 750
    },
    {
      "epoch": 0.4887459807073955,
      "grad_norm": 0.8076878786087036,
      "learning_rate": 0.00025662650602409636,
      "loss": 2.0394,
      "step": 760
    },
    {
      "epoch": 0.49517684887459806,
      "grad_norm": 0.7559667825698853,
      "learning_rate": 0.0002559693318729463,
      "loss": 1.9753,
      "step": 770
    },
    {
      "epoch": 0.5016077170418006,
      "grad_norm": 0.7402215600013733,
      "learning_rate": 0.00025531215772179623,
      "loss": 2.0353,
      "step": 780
    },
    {
      "epoch": 0.5080385852090032,
      "grad_norm": 0.7112523317337036,
      "learning_rate": 0.0002546549835706462,
      "loss": 1.989,
      "step": 790
    },
    {
      "epoch": 0.5144694533762058,
      "grad_norm": 0.7255666255950928,
      "learning_rate": 0.00025399780941949616,
      "loss": 1.9912,
      "step": 800
    },
    {
      "epoch": 0.5144694533762058,
      "eval_loss": 2.0358893871307373,
      "eval_runtime": 131.9747,
      "eval_samples_per_second": 15.154,
      "eval_steps_per_second": 1.894,
      "step": 800
    },
    {
      "epoch": 0.5209003215434084,
      "grad_norm": 0.7614848613739014,
      "learning_rate": 0.0002533406352683461,
      "loss": 1.9507,
      "step": 810
    },
    {
      "epoch": 0.5273311897106109,
      "grad_norm": 0.7834282517433167,
      "learning_rate": 0.00025268346111719604,
      "loss": 2.0572,
      "step": 820
    },
    {
      "epoch": 0.5337620578778135,
      "grad_norm": 0.8642615079879761,
      "learning_rate": 0.00025202628696604595,
      "loss": 1.9766,
      "step": 830
    },
    {
      "epoch": 0.5401929260450161,
      "grad_norm": 0.7937222123146057,
      "learning_rate": 0.0002513691128148959,
      "loss": 1.9718,
      "step": 840
    },
    {
      "epoch": 0.5466237942122186,
      "grad_norm": 0.7922580242156982,
      "learning_rate": 0.0002507119386637459,
      "loss": 2.0098,
      "step": 850
    },
    {
      "epoch": 0.5530546623794212,
      "grad_norm": 0.7464605569839478,
      "learning_rate": 0.0002500547645125958,
      "loss": 1.9529,
      "step": 860
    },
    {
      "epoch": 0.5594855305466238,
      "grad_norm": 0.7568275332450867,
      "learning_rate": 0.00024939759036144576,
      "loss": 1.989,
      "step": 870
    },
    {
      "epoch": 0.5659163987138264,
      "grad_norm": 0.7011362910270691,
      "learning_rate": 0.00024874041621029573,
      "loss": 2.031,
      "step": 880
    },
    {
      "epoch": 0.572347266881029,
      "grad_norm": 0.7106270790100098,
      "learning_rate": 0.00024808324205914564,
      "loss": 2.022,
      "step": 890
    },
    {
      "epoch": 0.5787781350482315,
      "grad_norm": 0.7415210604667664,
      "learning_rate": 0.0002474260679079956,
      "loss": 2.0595,
      "step": 900
    },
    {
      "epoch": 0.5852090032154341,
      "grad_norm": 0.7313567399978638,
      "learning_rate": 0.0002467688937568455,
      "loss": 2.0293,
      "step": 910
    },
    {
      "epoch": 0.5916398713826366,
      "grad_norm": 0.692523181438446,
      "learning_rate": 0.0002461117196056955,
      "loss": 2.0746,
      "step": 920
    },
    {
      "epoch": 0.5980707395498392,
      "grad_norm": 0.6929277181625366,
      "learning_rate": 0.00024545454545454545,
      "loss": 1.955,
      "step": 930
    },
    {
      "epoch": 0.6045016077170418,
      "grad_norm": 0.7199161648750305,
      "learning_rate": 0.00024479737130339536,
      "loss": 2.0454,
      "step": 940
    },
    {
      "epoch": 0.6109324758842444,
      "grad_norm": 0.767314076423645,
      "learning_rate": 0.00024414019715224533,
      "loss": 2.0428,
      "step": 950
    },
    {
      "epoch": 0.617363344051447,
      "grad_norm": 0.8044443130493164,
      "learning_rate": 0.00024348302300109526,
      "loss": 1.9423,
      "step": 960
    },
    {
      "epoch": 0.6237942122186495,
      "grad_norm": 0.702936589717865,
      "learning_rate": 0.0002428258488499452,
      "loss": 1.9271,
      "step": 970
    },
    {
      "epoch": 0.6302250803858521,
      "grad_norm": 0.7394160032272339,
      "learning_rate": 0.00024216867469879517,
      "loss": 1.9674,
      "step": 980
    },
    {
      "epoch": 0.6366559485530546,
      "grad_norm": 0.7981842160224915,
      "learning_rate": 0.0002415115005476451,
      "loss": 1.9932,
      "step": 990
    },
    {
      "epoch": 0.6430868167202572,
      "grad_norm": 0.871896505355835,
      "learning_rate": 0.00024085432639649505,
      "loss": 2.0182,
      "step": 1000
    },
    {
      "epoch": 0.6430868167202572,
      "eval_loss": 2.024224281311035,
      "eval_runtime": 130.1041,
      "eval_samples_per_second": 15.372,
      "eval_steps_per_second": 1.922,
      "step": 1000
    },
    {
      "epoch": 0.6495176848874598,
      "grad_norm": 0.7123499512672424,
      "learning_rate": 0.00024019715224534498,
      "loss": 2.0923,
      "step": 1010
    },
    {
      "epoch": 0.6559485530546624,
      "grad_norm": 0.7226546406745911,
      "learning_rate": 0.00023953997809419495,
      "loss": 2.0035,
      "step": 1020
    },
    {
      "epoch": 0.662379421221865,
      "grad_norm": 0.7627468109130859,
      "learning_rate": 0.0002388828039430449,
      "loss": 1.9667,
      "step": 1030
    },
    {
      "epoch": 0.6688102893890675,
      "grad_norm": 0.8175467252731323,
      "learning_rate": 0.00023822562979189483,
      "loss": 1.948,
      "step": 1040
    },
    {
      "epoch": 0.6752411575562701,
      "grad_norm": 0.690073549747467,
      "learning_rate": 0.0002375684556407448,
      "loss": 2.0498,
      "step": 1050
    },
    {
      "epoch": 0.6816720257234726,
      "grad_norm": 0.9848446249961853,
      "learning_rate": 0.0002369112814895947,
      "loss": 1.9874,
      "step": 1060
    },
    {
      "epoch": 0.6881028938906752,
      "grad_norm": 0.7157571315765381,
      "learning_rate": 0.00023625410733844467,
      "loss": 2.0488,
      "step": 1070
    },
    {
      "epoch": 0.6945337620578779,
      "grad_norm": 0.8503302931785583,
      "learning_rate": 0.00023559693318729464,
      "loss": 1.9958,
      "step": 1080
    },
    {
      "epoch": 0.7009646302250804,
      "grad_norm": 0.7864677906036377,
      "learning_rate": 0.00023493975903614455,
      "loss": 2.0212,
      "step": 1090
    },
    {
      "epoch": 0.707395498392283,
      "grad_norm": 1.7837698459625244,
      "learning_rate": 0.0002342825848849945,
      "loss": 1.9828,
      "step": 1100
    },
    {
      "epoch": 0.7138263665594855,
      "grad_norm": 0.7183972001075745,
      "learning_rate": 0.00023362541073384445,
      "loss": 2.0652,
      "step": 1110
    },
    {
      "epoch": 0.7202572347266881,
      "grad_norm": 0.7377676963806152,
      "learning_rate": 0.0002329682365826944,
      "loss": 2.0123,
      "step": 1120
    },
    {
      "epoch": 0.7266881028938906,
      "grad_norm": 0.7170071601867676,
      "learning_rate": 0.00023231106243154436,
      "loss": 1.9759,
      "step": 1130
    },
    {
      "epoch": 0.7331189710610932,
      "grad_norm": 0.6442170143127441,
      "learning_rate": 0.00023165388828039427,
      "loss": 2.047,
      "step": 1140
    },
    {
      "epoch": 0.7395498392282959,
      "grad_norm": 0.7356306910514832,
      "learning_rate": 0.00023099671412924423,
      "loss": 2.0438,
      "step": 1150
    },
    {
      "epoch": 0.7459807073954984,
      "grad_norm": 0.7483031153678894,
      "learning_rate": 0.0002303395399780942,
      "loss": 2.0274,
      "step": 1160
    },
    {
      "epoch": 0.752411575562701,
      "grad_norm": 0.7624642848968506,
      "learning_rate": 0.0002296823658269441,
      "loss": 1.9938,
      "step": 1170
    },
    {
      "epoch": 0.7588424437299035,
      "grad_norm": 0.7435073256492615,
      "learning_rate": 0.00022902519167579408,
      "loss": 1.9848,
      "step": 1180
    },
    {
      "epoch": 0.7652733118971061,
      "grad_norm": 0.7327163219451904,
      "learning_rate": 0.000228368017524644,
      "loss": 2.0286,
      "step": 1190
    },
    {
      "epoch": 0.7717041800643086,
      "grad_norm": 0.8398700952529907,
      "learning_rate": 0.00022771084337349395,
      "loss": 1.999,
      "step": 1200
    },
    {
      "epoch": 0.7717041800643086,
      "eval_loss": 2.0166773796081543,
      "eval_runtime": 129.989,
      "eval_samples_per_second": 15.386,
      "eval_steps_per_second": 1.923,
      "step": 1200
    },
    {
      "epoch": 0.7781350482315113,
      "grad_norm": 0.6727181673049927,
      "learning_rate": 0.00022705366922234392,
      "loss": 2.0044,
      "step": 1210
    },
    {
      "epoch": 0.7845659163987139,
      "grad_norm": 0.8738404512405396,
      "learning_rate": 0.00022639649507119383,
      "loss": 2.0246,
      "step": 1220
    },
    {
      "epoch": 0.7909967845659164,
      "grad_norm": 0.760010302066803,
      "learning_rate": 0.0002257393209200438,
      "loss": 2.0058,
      "step": 1230
    },
    {
      "epoch": 0.797427652733119,
      "grad_norm": 0.701081395149231,
      "learning_rate": 0.00022508214676889373,
      "loss": 1.9974,
      "step": 1240
    },
    {
      "epoch": 0.8038585209003215,
      "grad_norm": 0.7346913814544678,
      "learning_rate": 0.00022442497261774367,
      "loss": 2.0884,
      "step": 1250
    },
    {
      "epoch": 0.8102893890675241,
      "grad_norm": 0.7433114647865295,
      "learning_rate": 0.00022376779846659364,
      "loss": 1.9927,
      "step": 1260
    },
    {
      "epoch": 0.8167202572347267,
      "grad_norm": 0.7781444787979126,
      "learning_rate": 0.00022311062431544358,
      "loss": 2.001,
      "step": 1270
    },
    {
      "epoch": 0.8231511254019293,
      "grad_norm": 0.7538995742797852,
      "learning_rate": 0.00022245345016429352,
      "loss": 1.9947,
      "step": 1280
    },
    {
      "epoch": 0.8295819935691319,
      "grad_norm": 0.7132537961006165,
      "learning_rate": 0.00022179627601314345,
      "loss": 1.9781,
      "step": 1290
    },
    {
      "epoch": 0.8360128617363344,
      "grad_norm": 0.7174340486526489,
      "learning_rate": 0.0002211391018619934,
      "loss": 1.9848,
      "step": 1300
    },
    {
      "epoch": 0.842443729903537,
      "grad_norm": 0.7245258092880249,
      "learning_rate": 0.00022048192771084336,
      "loss": 2.005,
      "step": 1310
    },
    {
      "epoch": 0.8488745980707395,
      "grad_norm": 0.667892336845398,
      "learning_rate": 0.0002198247535596933,
      "loss": 1.9939,
      "step": 1320
    },
    {
      "epoch": 0.8553054662379421,
      "grad_norm": 0.7173146605491638,
      "learning_rate": 0.00021916757940854324,
      "loss": 2.0636,
      "step": 1330
    },
    {
      "epoch": 0.8617363344051447,
      "grad_norm": 0.7765901684761047,
      "learning_rate": 0.0002185104052573932,
      "loss": 1.9966,
      "step": 1340
    },
    {
      "epoch": 0.8681672025723473,
      "grad_norm": 0.7077351808547974,
      "learning_rate": 0.00021785323110624314,
      "loss": 2.0078,
      "step": 1350
    },
    {
      "epoch": 0.8745980707395499,
      "grad_norm": 0.736723780632019,
      "learning_rate": 0.00021719605695509308,
      "loss": 2.0292,
      "step": 1360
    },
    {
      "epoch": 0.8810289389067524,
      "grad_norm": 0.732185959815979,
      "learning_rate": 0.00021653888280394302,
      "loss": 2.0223,
      "step": 1370
    },
    {
      "epoch": 0.887459807073955,
      "grad_norm": 0.7002454400062561,
      "learning_rate": 0.00021588170865279298,
      "loss": 2.0068,
      "step": 1380
    },
    {
      "epoch": 0.8938906752411575,
      "grad_norm": 0.75859534740448,
      "learning_rate": 0.00021522453450164292,
      "loss": 1.9556,
      "step": 1390
    },
    {
      "epoch": 0.9003215434083601,
      "grad_norm": 0.7475289106369019,
      "learning_rate": 0.00021456736035049286,
      "loss": 1.9792,
      "step": 1400
    },
    {
      "epoch": 0.9003215434083601,
      "eval_loss": 2.0089023113250732,
      "eval_runtime": 130.0325,
      "eval_samples_per_second": 15.381,
      "eval_steps_per_second": 1.923,
      "step": 1400
    },
    {
      "epoch": 0.9067524115755627,
      "grad_norm": 0.7917546629905701,
      "learning_rate": 0.00021391018619934283,
      "loss": 1.9999,
      "step": 1410
    },
    {
      "epoch": 0.9131832797427653,
      "grad_norm": 0.7062447667121887,
      "learning_rate": 0.00021325301204819274,
      "loss": 1.9779,
      "step": 1420
    },
    {
      "epoch": 0.9196141479099679,
      "grad_norm": 0.6973288655281067,
      "learning_rate": 0.0002125958378970427,
      "loss": 2.0511,
      "step": 1430
    },
    {
      "epoch": 0.9260450160771704,
      "grad_norm": 0.7297340035438538,
      "learning_rate": 0.00021193866374589267,
      "loss": 1.9764,
      "step": 1440
    },
    {
      "epoch": 0.932475884244373,
      "grad_norm": 0.9256350994110107,
      "learning_rate": 0.00021128148959474258,
      "loss": 1.9559,
      "step": 1450
    },
    {
      "epoch": 0.9389067524115756,
      "grad_norm": 0.6994000673294067,
      "learning_rate": 0.00021062431544359255,
      "loss": 2.0152,
      "step": 1460
    },
    {
      "epoch": 0.9453376205787781,
      "grad_norm": 0.7412806749343872,
      "learning_rate": 0.00020996714129244246,
      "loss": 1.9494,
      "step": 1470
    },
    {
      "epoch": 0.9517684887459807,
      "grad_norm": 0.729680061340332,
      "learning_rate": 0.00020930996714129242,
      "loss": 2.0272,
      "step": 1480
    },
    {
      "epoch": 0.9581993569131833,
      "grad_norm": 0.7601342797279358,
      "learning_rate": 0.0002086527929901424,
      "loss": 1.9714,
      "step": 1490
    },
    {
      "epoch": 0.9646302250803859,
      "grad_norm": 0.6875161528587341,
      "learning_rate": 0.0002079956188389923,
      "loss": 1.993,
      "step": 1500
    },
    {
      "epoch": 0.9710610932475884,
      "grad_norm": 0.7520968317985535,
      "learning_rate": 0.00020733844468784227,
      "loss": 2.0471,
      "step": 1510
    },
    {
      "epoch": 0.977491961414791,
      "grad_norm": 0.8061411380767822,
      "learning_rate": 0.00020668127053669218,
      "loss": 2.0145,
      "step": 1520
    },
    {
      "epoch": 0.9839228295819936,
      "grad_norm": 0.7837228775024414,
      "learning_rate": 0.00020602409638554214,
      "loss": 1.9889,
      "step": 1530
    },
    {
      "epoch": 0.9903536977491961,
      "grad_norm": 0.744296133518219,
      "learning_rate": 0.0002053669222343921,
      "loss": 1.9834,
      "step": 1540
    },
    {
      "epoch": 0.9967845659163987,
      "grad_norm": 0.7137749791145325,
      "learning_rate": 0.00020470974808324202,
      "loss": 2.0582,
      "step": 1550
    },
    {
      "epoch": 1.0032154340836013,
      "grad_norm": 0.718320906162262,
      "learning_rate": 0.000204052573932092,
      "loss": 1.9576,
      "step": 1560
    },
    {
      "epoch": 1.0096463022508038,
      "grad_norm": 0.719998836517334,
      "learning_rate": 0.00020339539978094195,
      "loss": 1.9138,
      "step": 1570
    },
    {
      "epoch": 1.0160771704180065,
      "grad_norm": 0.7154316306114197,
      "learning_rate": 0.00020273822562979186,
      "loss": 1.875,
      "step": 1580
    },
    {
      "epoch": 1.022508038585209,
      "grad_norm": 0.6565534472465515,
      "learning_rate": 0.00020208105147864183,
      "loss": 1.9994,
      "step": 1590
    },
    {
      "epoch": 1.0289389067524115,
      "grad_norm": 0.7222368121147156,
      "learning_rate": 0.00020142387732749177,
      "loss": 1.9591,
      "step": 1600
    },
    {
      "epoch": 1.0289389067524115,
      "eval_loss": 2.002497673034668,
      "eval_runtime": 131.2869,
      "eval_samples_per_second": 15.234,
      "eval_steps_per_second": 1.904,
      "step": 1600
    },
    {
      "epoch": 1.0353697749196142,
      "grad_norm": 0.7213057279586792,
      "learning_rate": 0.0002007667031763417,
      "loss": 1.9464,
      "step": 1610
    },
    {
      "epoch": 1.0418006430868167,
      "grad_norm": 0.6436830163002014,
      "learning_rate": 0.00020010952902519167,
      "loss": 1.8951,
      "step": 1620
    },
    {
      "epoch": 1.0482315112540193,
      "grad_norm": 0.7160071134567261,
      "learning_rate": 0.00019945235487404158,
      "loss": 1.9062,
      "step": 1630
    },
    {
      "epoch": 1.0546623794212218,
      "grad_norm": 0.6585739850997925,
      "learning_rate": 0.00019879518072289155,
      "loss": 1.9514,
      "step": 1640
    },
    {
      "epoch": 1.0610932475884245,
      "grad_norm": 0.7445241808891296,
      "learning_rate": 0.0001981380065717415,
      "loss": 1.8301,
      "step": 1650
    },
    {
      "epoch": 1.067524115755627,
      "grad_norm": 0.6654142141342163,
      "learning_rate": 0.00019748083242059143,
      "loss": 1.9048,
      "step": 1660
    },
    {
      "epoch": 1.0739549839228295,
      "grad_norm": 0.7550114393234253,
      "learning_rate": 0.0001968236582694414,
      "loss": 1.9266,
      "step": 1670
    },
    {
      "epoch": 1.0803858520900322,
      "grad_norm": 0.7276896834373474,
      "learning_rate": 0.00019616648411829133,
      "loss": 1.8942,
      "step": 1680
    },
    {
      "epoch": 1.0868167202572347,
      "grad_norm": 0.7431575059890747,
      "learning_rate": 0.00019550930996714127,
      "loss": 1.9148,
      "step": 1690
    },
    {
      "epoch": 1.0932475884244373,
      "grad_norm": 0.74256831407547,
      "learning_rate": 0.0001948521358159912,
      "loss": 1.942,
      "step": 1700
    },
    {
      "epoch": 1.09967845659164,
      "grad_norm": 0.7295734286308289,
      "learning_rate": 0.00019419496166484117,
      "loss": 1.9331,
      "step": 1710
    },
    {
      "epoch": 1.1061093247588425,
      "grad_norm": 0.7749672532081604,
      "learning_rate": 0.0001935377875136911,
      "loss": 1.9373,
      "step": 1720
    },
    {
      "epoch": 1.112540192926045,
      "grad_norm": 0.6896611452102661,
      "learning_rate": 0.00019288061336254105,
      "loss": 1.8813,
      "step": 1730
    },
    {
      "epoch": 1.1189710610932475,
      "grad_norm": 0.7282217741012573,
      "learning_rate": 0.00019222343921139102,
      "loss": 1.9634,
      "step": 1740
    },
    {
      "epoch": 1.1254019292604502,
      "grad_norm": 0.7761743068695068,
      "learning_rate": 0.00019156626506024093,
      "loss": 1.8708,
      "step": 1750
    },
    {
      "epoch": 1.1318327974276527,
      "grad_norm": 0.7596757411956787,
      "learning_rate": 0.0001909090909090909,
      "loss": 1.9446,
      "step": 1760
    },
    {
      "epoch": 1.1382636655948553,
      "grad_norm": 0.7023797631263733,
      "learning_rate": 0.00019025191675794086,
      "loss": 1.8837,
      "step": 1770
    },
    {
      "epoch": 1.144694533762058,
      "grad_norm": 0.7191573977470398,
      "learning_rate": 0.00018959474260679077,
      "loss": 1.9141,
      "step": 1780
    },
    {
      "epoch": 1.1511254019292605,
      "grad_norm": 0.784885048866272,
      "learning_rate": 0.00018893756845564074,
      "loss": 1.9506,
      "step": 1790
    },
    {
      "epoch": 1.157556270096463,
      "grad_norm": 0.710903525352478,
      "learning_rate": 0.00018828039430449068,
      "loss": 1.9157,
      "step": 1800
    },
    {
      "epoch": 1.157556270096463,
      "eval_loss": 1.998835563659668,
      "eval_runtime": 121.0458,
      "eval_samples_per_second": 16.523,
      "eval_steps_per_second": 2.065,
      "step": 1800
    },
    {
      "epoch": 1.1639871382636655,
      "grad_norm": 0.7552351355552673,
      "learning_rate": 0.00018762322015334062,
      "loss": 1.9139,
      "step": 1810
    },
    {
      "epoch": 1.1704180064308682,
      "grad_norm": 0.7722271084785461,
      "learning_rate": 0.00018696604600219058,
      "loss": 1.863,
      "step": 1820
    },
    {
      "epoch": 1.1768488745980707,
      "grad_norm": 0.7195548415184021,
      "learning_rate": 0.0001863088718510405,
      "loss": 1.8697,
      "step": 1830
    },
    {
      "epoch": 1.1832797427652733,
      "grad_norm": 0.7423893809318542,
      "learning_rate": 0.00018565169769989046,
      "loss": 1.9772,
      "step": 1840
    },
    {
      "epoch": 1.189710610932476,
      "grad_norm": 0.7222315073013306,
      "learning_rate": 0.00018499452354874042,
      "loss": 1.9308,
      "step": 1850
    },
    {
      "epoch": 1.1961414790996785,
      "grad_norm": 0.6815035939216614,
      "learning_rate": 0.00018433734939759034,
      "loss": 1.9675,
      "step": 1860
    },
    {
      "epoch": 1.202572347266881,
      "grad_norm": 0.7621594071388245,
      "learning_rate": 0.0001836801752464403,
      "loss": 1.9295,
      "step": 1870
    },
    {
      "epoch": 1.2090032154340835,
      "grad_norm": 0.7405025959014893,
      "learning_rate": 0.0001830230010952902,
      "loss": 1.9088,
      "step": 1880
    },
    {
      "epoch": 1.2154340836012862,
      "grad_norm": 0.6729809641838074,
      "learning_rate": 0.00018236582694414018,
      "loss": 1.9446,
      "step": 1890
    },
    {
      "epoch": 1.2218649517684887,
      "grad_norm": 0.7389471530914307,
      "learning_rate": 0.00018170865279299014,
      "loss": 1.8841,
      "step": 1900
    },
    {
      "epoch": 1.2282958199356913,
      "grad_norm": 0.6453628540039062,
      "learning_rate": 0.00018105147864184006,
      "loss": 1.8661,
      "step": 1910
    },
    {
      "epoch": 1.234726688102894,
      "grad_norm": 0.6971079111099243,
      "learning_rate": 0.00018039430449069002,
      "loss": 1.9807,
      "step": 1920
    },
    {
      "epoch": 1.2411575562700965,
      "grad_norm": 0.7807840704917908,
      "learning_rate": 0.00017973713033953996,
      "loss": 1.9475,
      "step": 1930
    },
    {
      "epoch": 1.247588424437299,
      "grad_norm": 0.78909832239151,
      "learning_rate": 0.0001790799561883899,
      "loss": 1.8439,
      "step": 1940
    },
    {
      "epoch": 1.2540192926045015,
      "grad_norm": 0.7715321183204651,
      "learning_rate": 0.00017842278203723986,
      "loss": 1.9478,
      "step": 1950
    },
    {
      "epoch": 1.2604501607717042,
      "grad_norm": 0.7786479592323303,
      "learning_rate": 0.0001777656078860898,
      "loss": 1.8773,
      "step": 1960
    },
    {
      "epoch": 1.2668810289389068,
      "grad_norm": 0.6935726404190063,
      "learning_rate": 0.00017710843373493974,
      "loss": 1.94,
      "step": 1970
    },
    {
      "epoch": 1.2733118971061093,
      "grad_norm": 0.7824066877365112,
      "learning_rate": 0.00017645125958378968,
      "loss": 1.8996,
      "step": 1980
    },
    {
      "epoch": 1.279742765273312,
      "grad_norm": 0.7019379138946533,
      "learning_rate": 0.00017579408543263962,
      "loss": 1.9114,
      "step": 1990
    },
    {
      "epoch": 1.2861736334405145,
      "grad_norm": 0.8215466737747192,
      "learning_rate": 0.00017513691128148958,
      "loss": 1.8294,
      "step": 2000
    },
    {
      "epoch": 1.2861736334405145,
      "eval_loss": 1.9947528839111328,
      "eval_runtime": 132.3397,
      "eval_samples_per_second": 15.113,
      "eval_steps_per_second": 1.889,
      "step": 2000
    },
    {
      "epoch": 1.292604501607717,
      "grad_norm": 0.7088531851768494,
      "learning_rate": 0.00017447973713033952,
      "loss": 1.9497,
      "step": 2010
    },
    {
      "epoch": 1.2990353697749195,
      "grad_norm": 0.7754150032997131,
      "learning_rate": 0.00017382256297918946,
      "loss": 1.9047,
      "step": 2020
    },
    {
      "epoch": 1.3054662379421222,
      "grad_norm": 0.7185202836990356,
      "learning_rate": 0.00017316538882803943,
      "loss": 1.8529,
      "step": 2030
    },
    {
      "epoch": 1.3118971061093248,
      "grad_norm": 0.7496573328971863,
      "learning_rate": 0.00017250821467688937,
      "loss": 1.8618,
      "step": 2040
    },
    {
      "epoch": 1.3183279742765273,
      "grad_norm": 0.6794284582138062,
      "learning_rate": 0.0001718510405257393,
      "loss": 1.898,
      "step": 2050
    },
    {
      "epoch": 1.32475884244373,
      "grad_norm": 0.7059448957443237,
      "learning_rate": 0.00017119386637458924,
      "loss": 1.9594,
      "step": 2060
    },
    {
      "epoch": 1.3311897106109325,
      "grad_norm": 0.7007871866226196,
      "learning_rate": 0.0001705366922234392,
      "loss": 1.9476,
      "step": 2070
    },
    {
      "epoch": 1.337620578778135,
      "grad_norm": 0.6973986029624939,
      "learning_rate": 0.00016987951807228915,
      "loss": 1.9567,
      "step": 2080
    },
    {
      "epoch": 1.3440514469453375,
      "grad_norm": 0.7169969081878662,
      "learning_rate": 0.00016922234392113909,
      "loss": 1.9685,
      "step": 2090
    },
    {
      "epoch": 1.3504823151125402,
      "grad_norm": 0.7009272575378418,
      "learning_rate": 0.00016856516976998905,
      "loss": 1.9714,
      "step": 2100
    },
    {
      "epoch": 1.3569131832797428,
      "grad_norm": 0.7070193290710449,
      "learning_rate": 0.00016790799561883896,
      "loss": 1.9695,
      "step": 2110
    },
    {
      "epoch": 1.3633440514469453,
      "grad_norm": 0.7268947958946228,
      "learning_rate": 0.00016725082146768893,
      "loss": 1.9107,
      "step": 2120
    },
    {
      "epoch": 1.369774919614148,
      "grad_norm": 0.7544928789138794,
      "learning_rate": 0.00016659364731653887,
      "loss": 1.8658,
      "step": 2130
    },
    {
      "epoch": 1.3762057877813505,
      "grad_norm": 0.6320627927780151,
      "learning_rate": 0.0001659364731653888,
      "loss": 1.8917,
      "step": 2140
    },
    {
      "epoch": 1.382636655948553,
      "grad_norm": 0.6863923668861389,
      "learning_rate": 0.00016527929901423877,
      "loss": 1.9237,
      "step": 2150
    },
    {
      "epoch": 1.3890675241157555,
      "grad_norm": 0.7775669097900391,
      "learning_rate": 0.00016462212486308868,
      "loss": 1.8548,
      "step": 2160
    },
    {
      "epoch": 1.3954983922829582,
      "grad_norm": 0.7198719382286072,
      "learning_rate": 0.00016396495071193865,
      "loss": 1.9145,
      "step": 2170
    },
    {
      "epoch": 1.4019292604501608,
      "grad_norm": 0.7938317656517029,
      "learning_rate": 0.00016330777656078861,
      "loss": 1.8939,
      "step": 2180
    },
    {
      "epoch": 1.4083601286173635,
      "grad_norm": 0.7361711263656616,
      "learning_rate": 0.00016265060240963853,
      "loss": 1.9642,
      "step": 2190
    },
    {
      "epoch": 1.414790996784566,
      "grad_norm": 0.7385576963424683,
      "learning_rate": 0.0001619934282584885,
      "loss": 1.9134,
      "step": 2200
    },
    {
      "epoch": 1.414790996784566,
      "eval_loss": 1.9883830547332764,
      "eval_runtime": 130.0767,
      "eval_samples_per_second": 15.376,
      "eval_steps_per_second": 1.922,
      "step": 2200
    },
    {
      "epoch": 1.4212218649517685,
      "grad_norm": 0.7863461971282959,
      "learning_rate": 0.0001613362541073384,
      "loss": 2.0157,
      "step": 2210
    },
    {
      "epoch": 1.427652733118971,
      "grad_norm": 0.7755898237228394,
      "learning_rate": 0.00016067907995618837,
      "loss": 1.8973,
      "step": 2220
    },
    {
      "epoch": 1.4340836012861735,
      "grad_norm": 0.7090388536453247,
      "learning_rate": 0.00016002190580503833,
      "loss": 1.9034,
      "step": 2230
    },
    {
      "epoch": 1.4405144694533762,
      "grad_norm": 0.6487644910812378,
      "learning_rate": 0.00015936473165388825,
      "loss": 1.906,
      "step": 2240
    },
    {
      "epoch": 1.4469453376205788,
      "grad_norm": 0.6597898006439209,
      "learning_rate": 0.0001587075575027382,
      "loss": 1.843,
      "step": 2250
    },
    {
      "epoch": 1.4533762057877815,
      "grad_norm": 0.7069796323776245,
      "learning_rate": 0.00015805038335158818,
      "loss": 1.9554,
      "step": 2260
    },
    {
      "epoch": 1.459807073954984,
      "grad_norm": 0.7358680367469788,
      "learning_rate": 0.0001573932092004381,
      "loss": 1.9268,
      "step": 2270
    },
    {
      "epoch": 1.4662379421221865,
      "grad_norm": 0.675457775592804,
      "learning_rate": 0.00015673603504928806,
      "loss": 1.8981,
      "step": 2280
    },
    {
      "epoch": 1.472668810289389,
      "grad_norm": 0.7369397878646851,
      "learning_rate": 0.000156078860898138,
      "loss": 1.9535,
      "step": 2290
    },
    {
      "epoch": 1.4790996784565915,
      "grad_norm": 0.666994035243988,
      "learning_rate": 0.00015542168674698793,
      "loss": 1.8657,
      "step": 2300
    },
    {
      "epoch": 1.4855305466237942,
      "grad_norm": 0.7241340279579163,
      "learning_rate": 0.0001547645125958379,
      "loss": 1.8097,
      "step": 2310
    },
    {
      "epoch": 1.4919614147909968,
      "grad_norm": 0.7224936485290527,
      "learning_rate": 0.0001541073384446878,
      "loss": 1.8397,
      "step": 2320
    },
    {
      "epoch": 1.4983922829581995,
      "grad_norm": 0.7167637348175049,
      "learning_rate": 0.00015345016429353778,
      "loss": 1.9225,
      "step": 2330
    },
    {
      "epoch": 1.504823151125402,
      "grad_norm": 0.7176666259765625,
      "learning_rate": 0.00015279299014238771,
      "loss": 1.8764,
      "step": 2340
    },
    {
      "epoch": 1.5112540192926045,
      "grad_norm": 0.735252857208252,
      "learning_rate": 0.00015213581599123765,
      "loss": 1.8935,
      "step": 2350
    },
    {
      "epoch": 1.517684887459807,
      "grad_norm": 0.6805827021598816,
      "learning_rate": 0.00015147864184008762,
      "loss": 1.9212,
      "step": 2360
    },
    {
      "epoch": 1.5241157556270095,
      "grad_norm": 0.7019375562667847,
      "learning_rate": 0.00015082146768893756,
      "loss": 1.9318,
      "step": 2370
    },
    {
      "epoch": 1.5305466237942122,
      "grad_norm": 0.6795372366905212,
      "learning_rate": 0.0001501642935377875,
      "loss": 1.9023,
      "step": 2380
    },
    {
      "epoch": 1.5369774919614148,
      "grad_norm": 0.6497982144355774,
      "learning_rate": 0.00014950711938663743,
      "loss": 1.9721,
      "step": 2390
    },
    {
      "epoch": 1.5434083601286175,
      "grad_norm": 0.7713346481323242,
      "learning_rate": 0.0001488499452354874,
      "loss": 1.9906,
      "step": 2400
    },
    {
      "epoch": 1.5434083601286175,
      "eval_loss": 1.9822700023651123,
      "eval_runtime": 130.376,
      "eval_samples_per_second": 15.34,
      "eval_steps_per_second": 1.918,
      "step": 2400
    },
    {
      "epoch": 1.54983922829582,
      "grad_norm": 0.7202898263931274,
      "learning_rate": 0.00014819277108433734,
      "loss": 1.8816,
      "step": 2410
    },
    {
      "epoch": 1.5562700964630225,
      "grad_norm": 0.7167313694953918,
      "learning_rate": 0.00014753559693318728,
      "loss": 1.9316,
      "step": 2420
    },
    {
      "epoch": 1.562700964630225,
      "grad_norm": 0.7133712768554688,
      "learning_rate": 0.00014687842278203724,
      "loss": 2.0053,
      "step": 2430
    },
    {
      "epoch": 1.5691318327974275,
      "grad_norm": 0.76304692029953,
      "learning_rate": 0.00014622124863088718,
      "loss": 1.8718,
      "step": 2440
    },
    {
      "epoch": 1.5755627009646302,
      "grad_norm": 0.667654812335968,
      "learning_rate": 0.00014556407447973712,
      "loss": 1.8727,
      "step": 2450
    },
    {
      "epoch": 1.5819935691318328,
      "grad_norm": 0.7308873534202576,
      "learning_rate": 0.00014490690032858706,
      "loss": 1.8918,
      "step": 2460
    },
    {
      "epoch": 1.5884244372990355,
      "grad_norm": 0.9376251697540283,
      "learning_rate": 0.00014424972617743702,
      "loss": 1.96,
      "step": 2470
    },
    {
      "epoch": 1.594855305466238,
      "grad_norm": 0.6924982666969299,
      "learning_rate": 0.00014359255202628696,
      "loss": 1.8744,
      "step": 2480
    },
    {
      "epoch": 1.6012861736334405,
      "grad_norm": 0.7420899868011475,
      "learning_rate": 0.0001429353778751369,
      "loss": 1.9112,
      "step": 2490
    },
    {
      "epoch": 1.607717041800643,
      "grad_norm": 0.7384818196296692,
      "learning_rate": 0.00014227820372398684,
      "loss": 1.9562,
      "step": 2500
    },
    {
      "epoch": 1.6141479099678455,
      "grad_norm": 0.7550799250602722,
      "learning_rate": 0.0001416210295728368,
      "loss": 1.891,
      "step": 2510
    },
    {
      "epoch": 1.6205787781350482,
      "grad_norm": 0.7184371948242188,
      "learning_rate": 0.00014096385542168674,
      "loss": 1.9361,
      "step": 2520
    },
    {
      "epoch": 1.6270096463022508,
      "grad_norm": 0.770914614200592,
      "learning_rate": 0.00014030668127053668,
      "loss": 1.9132,
      "step": 2530
    },
    {
      "epoch": 1.6334405144694535,
      "grad_norm": 0.7566716074943542,
      "learning_rate": 0.00013964950711938662,
      "loss": 1.8982,
      "step": 2540
    },
    {
      "epoch": 1.639871382636656,
      "grad_norm": 0.6670147776603699,
      "learning_rate": 0.00013899233296823656,
      "loss": 1.9211,
      "step": 2550
    },
    {
      "epoch": 1.6463022508038585,
      "grad_norm": 0.7093060612678528,
      "learning_rate": 0.00013833515881708653,
      "loss": 1.8881,
      "step": 2560
    },
    {
      "epoch": 1.652733118971061,
      "grad_norm": 0.6549977660179138,
      "learning_rate": 0.00013767798466593646,
      "loss": 1.9187,
      "step": 2570
    },
    {
      "epoch": 1.6591639871382635,
      "grad_norm": 0.7039531469345093,
      "learning_rate": 0.0001370208105147864,
      "loss": 1.9165,
      "step": 2580
    },
    {
      "epoch": 1.6655948553054662,
      "grad_norm": 0.7216307520866394,
      "learning_rate": 0.00013636363636363634,
      "loss": 1.9228,
      "step": 2590
    },
    {
      "epoch": 1.6720257234726688,
      "grad_norm": 0.6866537928581238,
      "learning_rate": 0.00013570646221248628,
      "loss": 1.9003,
      "step": 2600
    },
    {
      "epoch": 1.6720257234726688,
      "eval_loss": 1.977206826210022,
      "eval_runtime": 131.9243,
      "eval_samples_per_second": 15.16,
      "eval_steps_per_second": 1.895,
      "step": 2600
    },
    {
      "epoch": 1.6784565916398715,
      "grad_norm": 0.7328875660896301,
      "learning_rate": 0.00013504928806133625,
      "loss": 1.9,
      "step": 2610
    },
    {
      "epoch": 1.684887459807074,
      "grad_norm": 0.7623500227928162,
      "learning_rate": 0.00013439211391018618,
      "loss": 1.9117,
      "step": 2620
    },
    {
      "epoch": 1.6913183279742765,
      "grad_norm": 0.6996557712554932,
      "learning_rate": 0.00013373493975903612,
      "loss": 1.8342,
      "step": 2630
    },
    {
      "epoch": 1.697749196141479,
      "grad_norm": 0.6597011685371399,
      "learning_rate": 0.00013307776560788606,
      "loss": 1.911,
      "step": 2640
    },
    {
      "epoch": 1.7041800643086815,
      "grad_norm": 0.7154627442359924,
      "learning_rate": 0.00013242059145673603,
      "loss": 1.8955,
      "step": 2650
    },
    {
      "epoch": 1.7106109324758842,
      "grad_norm": 0.6822642087936401,
      "learning_rate": 0.00013176341730558597,
      "loss": 1.928,
      "step": 2660
    },
    {
      "epoch": 1.717041800643087,
      "grad_norm": 0.6770340204238892,
      "learning_rate": 0.0001311062431544359,
      "loss": 1.934,
      "step": 2670
    },
    {
      "epoch": 1.7234726688102895,
      "grad_norm": 0.7235671877861023,
      "learning_rate": 0.00013044906900328584,
      "loss": 1.9248,
      "step": 2680
    },
    {
      "epoch": 1.729903536977492,
      "grad_norm": 0.6428620219230652,
      "learning_rate": 0.0001297918948521358,
      "loss": 1.8998,
      "step": 2690
    },
    {
      "epoch": 1.7363344051446945,
      "grad_norm": 0.7132564783096313,
      "learning_rate": 0.00012913472070098575,
      "loss": 1.9353,
      "step": 2700
    },
    {
      "epoch": 1.742765273311897,
      "grad_norm": 0.7110019326210022,
      "learning_rate": 0.0001284775465498357,
      "loss": 1.8877,
      "step": 2710
    },
    {
      "epoch": 1.7491961414790995,
      "grad_norm": 0.7546197772026062,
      "learning_rate": 0.00012782037239868565,
      "loss": 1.9219,
      "step": 2720
    },
    {
      "epoch": 1.7556270096463023,
      "grad_norm": 0.8485615253448486,
      "learning_rate": 0.0001271631982475356,
      "loss": 1.9238,
      "step": 2730
    },
    {
      "epoch": 1.762057877813505,
      "grad_norm": 0.7058401703834534,
      "learning_rate": 0.00012650602409638553,
      "loss": 1.9012,
      "step": 2740
    },
    {
      "epoch": 1.7684887459807075,
      "grad_norm": 0.7222112417221069,
      "learning_rate": 0.00012584884994523547,
      "loss": 1.8442,
      "step": 2750
    },
    {
      "epoch": 1.77491961414791,
      "grad_norm": 0.7010639905929565,
      "learning_rate": 0.00012519167579408543,
      "loss": 1.9322,
      "step": 2760
    },
    {
      "epoch": 1.7813504823151125,
      "grad_norm": 0.6908234357833862,
      "learning_rate": 0.00012453450164293537,
      "loss": 1.9456,
      "step": 2770
    },
    {
      "epoch": 1.787781350482315,
      "grad_norm": 0.6615903973579407,
      "learning_rate": 0.0001238773274917853,
      "loss": 1.9052,
      "step": 2780
    },
    {
      "epoch": 1.7942122186495175,
      "grad_norm": 0.6688089370727539,
      "learning_rate": 0.00012322015334063528,
      "loss": 1.87,
      "step": 2790
    },
    {
      "epoch": 1.8006430868167203,
      "grad_norm": 0.7396994233131409,
      "learning_rate": 0.00012256297918948522,
      "loss": 1.9243,
      "step": 2800
    },
    {
      "epoch": 1.8006430868167203,
      "eval_loss": 1.974278450012207,
      "eval_runtime": 144.2243,
      "eval_samples_per_second": 13.867,
      "eval_steps_per_second": 1.733,
      "step": 2800
    },
    {
      "epoch": 1.807073954983923,
      "grad_norm": 0.6520466208457947,
      "learning_rate": 0.00012190580503833514,
      "loss": 1.902,
      "step": 2810
    },
    {
      "epoch": 1.8135048231511255,
      "grad_norm": 0.7591603398323059,
      "learning_rate": 0.00012124863088718509,
      "loss": 1.9079,
      "step": 2820
    },
    {
      "epoch": 1.819935691318328,
      "grad_norm": 0.6622514128684998,
      "learning_rate": 0.00012059145673603504,
      "loss": 1.9288,
      "step": 2830
    },
    {
      "epoch": 1.8263665594855305,
      "grad_norm": 0.7578607797622681,
      "learning_rate": 0.00011993428258488498,
      "loss": 1.8936,
      "step": 2840
    },
    {
      "epoch": 1.832797427652733,
      "grad_norm": 0.730093240737915,
      "learning_rate": 0.00011927710843373494,
      "loss": 1.8809,
      "step": 2850
    },
    {
      "epoch": 1.8392282958199357,
      "grad_norm": 0.6403250098228455,
      "learning_rate": 0.00011861993428258487,
      "loss": 1.8866,
      "step": 2860
    },
    {
      "epoch": 1.8456591639871383,
      "grad_norm": 0.7032350897789001,
      "learning_rate": 0.00011796276013143481,
      "loss": 1.938,
      "step": 2870
    },
    {
      "epoch": 1.852090032154341,
      "grad_norm": 0.7376342415809631,
      "learning_rate": 0.00011730558598028478,
      "loss": 1.8925,
      "step": 2880
    },
    {
      "epoch": 1.8585209003215435,
      "grad_norm": 0.7093110680580139,
      "learning_rate": 0.00011664841182913472,
      "loss": 1.9029,
      "step": 2890
    },
    {
      "epoch": 1.864951768488746,
      "grad_norm": 0.6826250553131104,
      "learning_rate": 0.00011599123767798466,
      "loss": 1.8956,
      "step": 2900
    },
    {
      "epoch": 1.8713826366559485,
      "grad_norm": 0.7709969282150269,
      "learning_rate": 0.0001153340635268346,
      "loss": 1.92,
      "step": 2910
    },
    {
      "epoch": 1.877813504823151,
      "grad_norm": 0.6641222238540649,
      "learning_rate": 0.00011467688937568453,
      "loss": 1.8998,
      "step": 2920
    },
    {
      "epoch": 1.8842443729903537,
      "grad_norm": 0.7321887612342834,
      "learning_rate": 0.0001140197152245345,
      "loss": 1.9257,
      "step": 2930
    },
    {
      "epoch": 1.8906752411575563,
      "grad_norm": 0.7000001668930054,
      "learning_rate": 0.00011336254107338444,
      "loss": 1.8944,
      "step": 2940
    },
    {
      "epoch": 1.897106109324759,
      "grad_norm": 0.7347818613052368,
      "learning_rate": 0.00011270536692223438,
      "loss": 1.9256,
      "step": 2950
    },
    {
      "epoch": 1.9035369774919615,
      "grad_norm": 0.708888590335846,
      "learning_rate": 0.00011204819277108433,
      "loss": 1.9307,
      "step": 2960
    },
    {
      "epoch": 1.909967845659164,
      "grad_norm": 0.6980915665626526,
      "learning_rate": 0.00011139101861993428,
      "loss": 1.883,
      "step": 2970
    },
    {
      "epoch": 1.9163987138263665,
      "grad_norm": 0.8052535653114319,
      "learning_rate": 0.00011073384446878422,
      "loss": 1.899,
      "step": 2980
    },
    {
      "epoch": 1.922829581993569,
      "grad_norm": 0.707011878490448,
      "learning_rate": 0.00011007667031763416,
      "loss": 1.9263,
      "step": 2990
    },
    {
      "epoch": 1.9292604501607717,
      "grad_norm": 0.7086938619613647,
      "learning_rate": 0.00010941949616648411,
      "loss": 1.883,
      "step": 3000
    },
    {
      "epoch": 1.9292604501607717,
      "eval_loss": 1.9664931297302246,
      "eval_runtime": 133.023,
      "eval_samples_per_second": 15.035,
      "eval_steps_per_second": 1.879,
      "step": 3000
    }
  ],
  "logging_steps": 10,
  "max_steps": 4665,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 200,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 3.0137669676957696e+17,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}