|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 10.0, |
|
"eval_steps": 500, |
|
"global_step": 2290, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.004366812227074236, |
|
"grad_norm": 1.8752956704333947, |
|
"learning_rate": 8.733624454148472e-07, |
|
"loss": 3.7085, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.021834061135371178, |
|
"grad_norm": 1.8028899921940393, |
|
"learning_rate": 4.3668122270742355e-06, |
|
"loss": 3.6291, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.043668122270742356, |
|
"grad_norm": 1.75783914939411, |
|
"learning_rate": 8.733624454148471e-06, |
|
"loss": 3.547, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.06550218340611354, |
|
"grad_norm": 2.1823195400139026, |
|
"learning_rate": 1.3100436681222708e-05, |
|
"loss": 3.6303, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.08733624454148471, |
|
"grad_norm": 2.7703299276693256, |
|
"learning_rate": 1.7467248908296942e-05, |
|
"loss": 3.5805, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.1091703056768559, |
|
"grad_norm": 3.0501154969163244, |
|
"learning_rate": 2.183406113537118e-05, |
|
"loss": 3.277, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.13100436681222707, |
|
"grad_norm": 2.0575132099752795, |
|
"learning_rate": 2.6200873362445416e-05, |
|
"loss": 2.8289, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.15283842794759825, |
|
"grad_norm": 1.2985876224099822, |
|
"learning_rate": 3.056768558951965e-05, |
|
"loss": 2.6017, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.17467248908296942, |
|
"grad_norm": 0.7330786544465717, |
|
"learning_rate": 3.4934497816593884e-05, |
|
"loss": 2.4112, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.1965065502183406, |
|
"grad_norm": 0.7933262369374595, |
|
"learning_rate": 3.930131004366812e-05, |
|
"loss": 2.2821, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.2183406113537118, |
|
"grad_norm": 0.7790118551391272, |
|
"learning_rate": 4.366812227074236e-05, |
|
"loss": 2.0793, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.24017467248908297, |
|
"grad_norm": 0.7082020251604142, |
|
"learning_rate": 4.8034934497816594e-05, |
|
"loss": 2.0505, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.26200873362445415, |
|
"grad_norm": 0.5696342576311191, |
|
"learning_rate": 5.240174672489083e-05, |
|
"loss": 1.8173, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.2838427947598253, |
|
"grad_norm": 0.5023621326635465, |
|
"learning_rate": 5.6768558951965065e-05, |
|
"loss": 1.8912, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.3056768558951965, |
|
"grad_norm": 0.4787415611645182, |
|
"learning_rate": 6.11353711790393e-05, |
|
"loss": 1.7972, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.32751091703056767, |
|
"grad_norm": 0.48422783401495123, |
|
"learning_rate": 6.550218340611354e-05, |
|
"loss": 1.7183, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.34934497816593885, |
|
"grad_norm": 0.3910949318663936, |
|
"learning_rate": 6.986899563318777e-05, |
|
"loss": 1.6366, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.37117903930131, |
|
"grad_norm": 0.3902520624627953, |
|
"learning_rate": 7.423580786026201e-05, |
|
"loss": 1.7019, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.3930131004366812, |
|
"grad_norm": 0.3853385163155022, |
|
"learning_rate": 7.860262008733625e-05, |
|
"loss": 1.7606, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.4148471615720524, |
|
"grad_norm": 0.3911758987224519, |
|
"learning_rate": 8.296943231441049e-05, |
|
"loss": 1.6395, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.4366812227074236, |
|
"grad_norm": 0.42385759736776996, |
|
"learning_rate": 8.733624454148472e-05, |
|
"loss": 1.6328, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.4585152838427948, |
|
"grad_norm": 0.3975926298253002, |
|
"learning_rate": 9.170305676855896e-05, |
|
"loss": 1.6775, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.48034934497816595, |
|
"grad_norm": 0.4355288111027398, |
|
"learning_rate": 9.606986899563319e-05, |
|
"loss": 1.6008, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.5021834061135371, |
|
"grad_norm": 0.44774118787630435, |
|
"learning_rate": 0.00010043668122270742, |
|
"loss": 1.6046, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.5240174672489083, |
|
"grad_norm": 0.45365568335085893, |
|
"learning_rate": 0.00010480349344978167, |
|
"loss": 1.6348, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.5458515283842795, |
|
"grad_norm": 0.3985940875605887, |
|
"learning_rate": 0.00010917030567685591, |
|
"loss": 1.616, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.5676855895196506, |
|
"grad_norm": 0.4103041259632963, |
|
"learning_rate": 0.00011353711790393013, |
|
"loss": 1.6063, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.5895196506550219, |
|
"grad_norm": 0.45594857125060284, |
|
"learning_rate": 0.00011790393013100438, |
|
"loss": 1.5782, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.611353711790393, |
|
"grad_norm": 0.406753522288533, |
|
"learning_rate": 0.0001222707423580786, |
|
"loss": 1.5361, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.6331877729257642, |
|
"grad_norm": 0.45489448779672886, |
|
"learning_rate": 0.00012663755458515284, |
|
"loss": 1.6416, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.6550218340611353, |
|
"grad_norm": 0.4232268425851412, |
|
"learning_rate": 0.00013100436681222707, |
|
"loss": 1.5449, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.6768558951965066, |
|
"grad_norm": 0.4008200720858846, |
|
"learning_rate": 0.00013537117903930133, |
|
"loss": 1.6322, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.6986899563318777, |
|
"grad_norm": 0.41073046435729793, |
|
"learning_rate": 0.00013973799126637554, |
|
"loss": 1.6144, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.7205240174672489, |
|
"grad_norm": 0.43150859357010535, |
|
"learning_rate": 0.0001441048034934498, |
|
"loss": 1.6816, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.74235807860262, |
|
"grad_norm": 0.4209643739546475, |
|
"learning_rate": 0.00014847161572052403, |
|
"loss": 1.6049, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.7641921397379913, |
|
"grad_norm": 0.4646259107508816, |
|
"learning_rate": 0.00015283842794759826, |
|
"loss": 1.6193, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.7860262008733624, |
|
"grad_norm": 0.42132133209440126, |
|
"learning_rate": 0.0001572052401746725, |
|
"loss": 1.5542, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.8078602620087336, |
|
"grad_norm": 0.4068655673248684, |
|
"learning_rate": 0.00016157205240174672, |
|
"loss": 1.5172, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.8296943231441049, |
|
"grad_norm": 0.45022442420363395, |
|
"learning_rate": 0.00016593886462882098, |
|
"loss": 1.65, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.851528384279476, |
|
"grad_norm": 0.4218769408186785, |
|
"learning_rate": 0.00017030567685589521, |
|
"loss": 1.7073, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.8733624454148472, |
|
"grad_norm": 0.44363856749896563, |
|
"learning_rate": 0.00017467248908296945, |
|
"loss": 1.6647, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.8951965065502183, |
|
"grad_norm": 0.39452894148369905, |
|
"learning_rate": 0.00017903930131004368, |
|
"loss": 1.4932, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.9170305676855895, |
|
"grad_norm": 0.407234590774645, |
|
"learning_rate": 0.0001834061135371179, |
|
"loss": 1.5987, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.9388646288209607, |
|
"grad_norm": 0.4299787387863718, |
|
"learning_rate": 0.00018777292576419214, |
|
"loss": 1.606, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.9606986899563319, |
|
"grad_norm": 0.4459993359246055, |
|
"learning_rate": 0.00019213973799126638, |
|
"loss": 1.6248, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.982532751091703, |
|
"grad_norm": 0.42477289910814814, |
|
"learning_rate": 0.0001965065502183406, |
|
"loss": 1.5145, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.0043668122270741, |
|
"grad_norm": 0.4524366274873438, |
|
"learning_rate": 0.00019999988382473225, |
|
"loss": 1.6031, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.0262008733624455, |
|
"grad_norm": 0.4084729303647835, |
|
"learning_rate": 0.00019999581771870396, |
|
"loss": 1.5467, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 1.0480349344978166, |
|
"grad_norm": 0.5172909354442642, |
|
"learning_rate": 0.0001999859431192192, |
|
"loss": 1.4636, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.0698689956331877, |
|
"grad_norm": 0.4686179138565006, |
|
"learning_rate": 0.00019997026059986742, |
|
"loss": 1.5244, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 1.091703056768559, |
|
"grad_norm": 0.47437945549478044, |
|
"learning_rate": 0.00019994877107160482, |
|
"loss": 1.4414, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.1135371179039302, |
|
"grad_norm": 0.45613574001819357, |
|
"learning_rate": 0.00019992147578270142, |
|
"loss": 1.4545, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 1.1353711790393013, |
|
"grad_norm": 0.4440788329210085, |
|
"learning_rate": 0.00019988837631866864, |
|
"loss": 1.4727, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.1572052401746724, |
|
"grad_norm": 0.43804218551099794, |
|
"learning_rate": 0.00019984947460216707, |
|
"loss": 1.5721, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 1.1790393013100438, |
|
"grad_norm": 0.4712591099454086, |
|
"learning_rate": 0.0001998047728928949, |
|
"loss": 1.45, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.2008733624454149, |
|
"grad_norm": 0.46682322093255346, |
|
"learning_rate": 0.00019975427378745659, |
|
"loss": 1.5364, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 1.222707423580786, |
|
"grad_norm": 0.44776219185560584, |
|
"learning_rate": 0.00019969798021921201, |
|
"loss": 1.4799, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.244541484716157, |
|
"grad_norm": 0.43694406427786026, |
|
"learning_rate": 0.0001996358954581062, |
|
"loss": 1.3916, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 1.2663755458515285, |
|
"grad_norm": 0.4369949578213358, |
|
"learning_rate": 0.00019956802311047925, |
|
"loss": 1.5629, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.2882096069868996, |
|
"grad_norm": 0.4507447731242532, |
|
"learning_rate": 0.00019949436711885686, |
|
"loss": 1.5553, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 1.3100436681222707, |
|
"grad_norm": 0.4238763083224714, |
|
"learning_rate": 0.00019941493176172154, |
|
"loss": 1.555, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.3318777292576418, |
|
"grad_norm": 0.44296845533811896, |
|
"learning_rate": 0.0001993297216532637, |
|
"loss": 1.5952, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 1.3537117903930131, |
|
"grad_norm": 0.44395910006889616, |
|
"learning_rate": 0.00019923874174311394, |
|
"loss": 1.4769, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.3755458515283843, |
|
"grad_norm": 0.44391981727033414, |
|
"learning_rate": 0.00019914199731605546, |
|
"loss": 1.5458, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 1.3973799126637554, |
|
"grad_norm": 0.46314929089631696, |
|
"learning_rate": 0.00019903949399171692, |
|
"loss": 1.5994, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.4192139737991267, |
|
"grad_norm": 0.4902612617938975, |
|
"learning_rate": 0.0001989312377242463, |
|
"loss": 1.5253, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 1.4410480349344978, |
|
"grad_norm": 0.4264196408790441, |
|
"learning_rate": 0.0001988172348019648, |
|
"loss": 1.5378, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.462882096069869, |
|
"grad_norm": 0.4125915654790707, |
|
"learning_rate": 0.00019869749184700156, |
|
"loss": 1.4231, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 1.48471615720524, |
|
"grad_norm": 0.4121352169920894, |
|
"learning_rate": 0.00019857201581490933, |
|
"loss": 1.4937, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.5065502183406112, |
|
"grad_norm": 0.46111501316021164, |
|
"learning_rate": 0.00019844081399425997, |
|
"loss": 1.6366, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 1.5283842794759825, |
|
"grad_norm": 0.41020501591518393, |
|
"learning_rate": 0.0001983038940062214, |
|
"loss": 1.5345, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.5502183406113537, |
|
"grad_norm": 0.42368595569558, |
|
"learning_rate": 0.00019816126380411476, |
|
"loss": 1.5478, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 1.572052401746725, |
|
"grad_norm": 0.41418005567795424, |
|
"learning_rate": 0.0001980129316729526, |
|
"loss": 1.5202, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.5938864628820961, |
|
"grad_norm": 0.4293992491929074, |
|
"learning_rate": 0.0001978589062289573, |
|
"loss": 1.4605, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 1.6157205240174672, |
|
"grad_norm": 0.4361713850313702, |
|
"learning_rate": 0.00019769919641906097, |
|
"loss": 1.5154, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.6375545851528384, |
|
"grad_norm": 0.3915518977683478, |
|
"learning_rate": 0.0001975338115203854, |
|
"loss": 1.3845, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 1.6593886462882095, |
|
"grad_norm": 0.40952561447487074, |
|
"learning_rate": 0.0001973627611397034, |
|
"loss": 1.5663, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.6812227074235808, |
|
"grad_norm": 0.41401676412381255, |
|
"learning_rate": 0.00019718605521288073, |
|
"loss": 1.5892, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 1.703056768558952, |
|
"grad_norm": 0.40948687994041144, |
|
"learning_rate": 0.00019700370400429885, |
|
"loss": 1.5853, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.7248908296943233, |
|
"grad_norm": 0.39912451760472517, |
|
"learning_rate": 0.00019681571810625873, |
|
"loss": 1.5086, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 1.7467248908296944, |
|
"grad_norm": 0.4274906140041681, |
|
"learning_rate": 0.00019662210843836574, |
|
"loss": 1.5361, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.7685589519650655, |
|
"grad_norm": 0.45496243462203195, |
|
"learning_rate": 0.00019642288624689501, |
|
"loss": 1.5281, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 1.7903930131004366, |
|
"grad_norm": 0.40754588577831136, |
|
"learning_rate": 0.00019621806310413857, |
|
"loss": 1.4146, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.8122270742358078, |
|
"grad_norm": 0.46020764826270205, |
|
"learning_rate": 0.00019600765090773282, |
|
"loss": 1.509, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 1.8340611353711789, |
|
"grad_norm": 0.41433203928546175, |
|
"learning_rate": 0.0001957916618799676, |
|
"loss": 1.4521, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.8558951965065502, |
|
"grad_norm": 0.4076208140451916, |
|
"learning_rate": 0.00019557010856707617, |
|
"loss": 1.5177, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 1.8777292576419216, |
|
"grad_norm": 0.3861676767334766, |
|
"learning_rate": 0.00019534300383850642, |
|
"loss": 1.5334, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.8995633187772927, |
|
"grad_norm": 0.38833060515579254, |
|
"learning_rate": 0.00019511036088617342, |
|
"loss": 1.5405, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 1.9213973799126638, |
|
"grad_norm": 0.42695215319286783, |
|
"learning_rate": 0.000194872193223693, |
|
"loss": 1.5458, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.943231441048035, |
|
"grad_norm": 0.4089286925168508, |
|
"learning_rate": 0.0001946285146855968, |
|
"loss": 1.5466, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 1.965065502183406, |
|
"grad_norm": 0.42155330336215135, |
|
"learning_rate": 0.00019437933942652885, |
|
"loss": 1.566, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.9868995633187772, |
|
"grad_norm": 0.3751384149186164, |
|
"learning_rate": 0.000194124681920423, |
|
"loss": 1.4511, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 2.0087336244541483, |
|
"grad_norm": 0.4161005311145859, |
|
"learning_rate": 0.00019386455695966253, |
|
"loss": 1.4751, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.03056768558952, |
|
"grad_norm": 0.4391122093349958, |
|
"learning_rate": 0.0001935989796542207, |
|
"loss": 1.4673, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 2.052401746724891, |
|
"grad_norm": 0.4822422647436377, |
|
"learning_rate": 0.00019332796543078314, |
|
"loss": 1.4212, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 2.074235807860262, |
|
"grad_norm": 0.47440157836581254, |
|
"learning_rate": 0.00019305153003185165, |
|
"loss": 1.4117, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 2.096069868995633, |
|
"grad_norm": 0.5171235330183517, |
|
"learning_rate": 0.00019276968951482986, |
|
"loss": 1.377, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 2.1179039301310043, |
|
"grad_norm": 0.5213453701595562, |
|
"learning_rate": 0.00019248246025109045, |
|
"loss": 1.3892, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 2.1397379912663754, |
|
"grad_norm": 0.5031391500788147, |
|
"learning_rate": 0.0001921898589250242, |
|
"loss": 1.3859, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 2.1615720524017465, |
|
"grad_norm": 0.48354639648956227, |
|
"learning_rate": 0.00019189190253307082, |
|
"loss": 1.3916, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 2.183406113537118, |
|
"grad_norm": 0.5173445212952421, |
|
"learning_rate": 0.00019158860838273172, |
|
"loss": 1.3977, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.2052401746724892, |
|
"grad_norm": 0.513094289634623, |
|
"learning_rate": 0.00019127999409156453, |
|
"loss": 1.3707, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 2.2270742358078603, |
|
"grad_norm": 0.5024827997518101, |
|
"learning_rate": 0.00019096607758615998, |
|
"loss": 1.3482, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 2.2489082969432315, |
|
"grad_norm": 0.5798563945761437, |
|
"learning_rate": 0.0001906468771011003, |
|
"loss": 1.4178, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 2.2707423580786026, |
|
"grad_norm": 0.5150755731275006, |
|
"learning_rate": 0.00019032241117790028, |
|
"loss": 1.4191, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 2.2925764192139737, |
|
"grad_norm": 0.5437625450565541, |
|
"learning_rate": 0.00018999269866393006, |
|
"loss": 1.3817, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 2.314410480349345, |
|
"grad_norm": 0.4924027196915137, |
|
"learning_rate": 0.00018965775871132044, |
|
"loss": 1.3745, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 2.3362445414847164, |
|
"grad_norm": 0.5287496980456949, |
|
"learning_rate": 0.00018931761077585035, |
|
"loss": 1.3749, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 2.3580786026200875, |
|
"grad_norm": 0.5123509887446156, |
|
"learning_rate": 0.00018897227461581672, |
|
"loss": 1.4476, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 2.3799126637554586, |
|
"grad_norm": 0.5054089069691902, |
|
"learning_rate": 0.00018862177029088675, |
|
"loss": 1.4103, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 2.4017467248908297, |
|
"grad_norm": 0.5339659464397467, |
|
"learning_rate": 0.00018826611816093273, |
|
"loss": 1.421, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 2.423580786026201, |
|
"grad_norm": 0.5367901761635062, |
|
"learning_rate": 0.00018790533888484937, |
|
"loss": 1.4725, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 2.445414847161572, |
|
"grad_norm": 0.5453912898301467, |
|
"learning_rate": 0.00018753945341935376, |
|
"loss": 1.4671, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 2.467248908296943, |
|
"grad_norm": 0.5114254543022997, |
|
"learning_rate": 0.0001871684830177681, |
|
"loss": 1.5483, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 2.489082969432314, |
|
"grad_norm": 0.5183339258600979, |
|
"learning_rate": 0.00018679244922878516, |
|
"loss": 1.4277, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 2.5109170305676853, |
|
"grad_norm": 0.5202383438539178, |
|
"learning_rate": 0.00018641137389521645, |
|
"loss": 1.4767, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 2.532751091703057, |
|
"grad_norm": 0.5293048970285786, |
|
"learning_rate": 0.0001860252791527236, |
|
"loss": 1.4691, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 2.554585152838428, |
|
"grad_norm": 0.5307661790603934, |
|
"learning_rate": 0.0001856341874285324, |
|
"loss": 1.484, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 2.576419213973799, |
|
"grad_norm": 0.5298444162459437, |
|
"learning_rate": 0.0001852381214401302, |
|
"loss": 1.3704, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 2.5982532751091703, |
|
"grad_norm": 0.5045622495753448, |
|
"learning_rate": 0.00018483710419394615, |
|
"loss": 1.4273, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 2.6200873362445414, |
|
"grad_norm": 0.516020284616684, |
|
"learning_rate": 0.00018443115898401504, |
|
"loss": 1.5253, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 2.641921397379913, |
|
"grad_norm": 0.5212259321852013, |
|
"learning_rate": 0.000184020309390624, |
|
"loss": 1.4966, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 2.6637554585152836, |
|
"grad_norm": 0.571739861550278, |
|
"learning_rate": 0.00018360457927894287, |
|
"loss": 1.489, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 2.685589519650655, |
|
"grad_norm": 0.5165260257002361, |
|
"learning_rate": 0.00018318399279763797, |
|
"loss": 1.419, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 2.7074235807860263, |
|
"grad_norm": 0.5022734014528262, |
|
"learning_rate": 0.00018275857437746932, |
|
"loss": 1.5218, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 2.7292576419213974, |
|
"grad_norm": 0.5065667081884927, |
|
"learning_rate": 0.00018232834872987147, |
|
"loss": 1.3765, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 2.7510917030567685, |
|
"grad_norm": 0.5208990486632071, |
|
"learning_rate": 0.00018189334084551826, |
|
"loss": 1.4514, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 2.7729257641921397, |
|
"grad_norm": 0.4881264484974513, |
|
"learning_rate": 0.00018145357599287095, |
|
"loss": 1.4477, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 2.7947598253275108, |
|
"grad_norm": 0.513600593429205, |
|
"learning_rate": 0.00018100907971671054, |
|
"loss": 1.4449, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 2.816593886462882, |
|
"grad_norm": 0.6046158817500052, |
|
"learning_rate": 0.00018055987783665404, |
|
"loss": 1.3161, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 2.8384279475982535, |
|
"grad_norm": 0.5208894858087225, |
|
"learning_rate": 0.00018010599644565457, |
|
"loss": 1.4693, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 2.8602620087336246, |
|
"grad_norm": 0.5746066357275653, |
|
"learning_rate": 0.0001796474619084856, |
|
"loss": 1.4347, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 2.8820960698689957, |
|
"grad_norm": 0.5569260278390025, |
|
"learning_rate": 0.00017918430086020975, |
|
"loss": 1.4628, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 2.903930131004367, |
|
"grad_norm": 0.5015533731915597, |
|
"learning_rate": 0.0001787165402046313, |
|
"loss": 1.4082, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 2.925764192139738, |
|
"grad_norm": 0.4930090755602876, |
|
"learning_rate": 0.0001782442071127338, |
|
"loss": 1.4412, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 2.947598253275109, |
|
"grad_norm": 0.5351113450288878, |
|
"learning_rate": 0.0001777673290211014, |
|
"loss": 1.3765, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 2.96943231441048, |
|
"grad_norm": 0.5333783396073188, |
|
"learning_rate": 0.00017728593363032532, |
|
"loss": 1.4074, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 2.9912663755458517, |
|
"grad_norm": 0.5148622195538735, |
|
"learning_rate": 0.0001768000489033949, |
|
"loss": 1.355, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 3.013100436681223, |
|
"grad_norm": 0.5136549354887358, |
|
"learning_rate": 0.00017630970306407311, |
|
"loss": 1.33, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 3.034934497816594, |
|
"grad_norm": 0.587330448131839, |
|
"learning_rate": 0.00017581492459525712, |
|
"loss": 1.267, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 3.056768558951965, |
|
"grad_norm": 0.5781502863776671, |
|
"learning_rate": 0.00017531574223732396, |
|
"loss": 1.3391, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 3.078602620087336, |
|
"grad_norm": 0.5801593817600947, |
|
"learning_rate": 0.0001748121849864609, |
|
"loss": 1.3398, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 3.1004366812227073, |
|
"grad_norm": 0.6358481598657785, |
|
"learning_rate": 0.00017430428209298126, |
|
"loss": 1.3191, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 3.1222707423580784, |
|
"grad_norm": 0.635414065898168, |
|
"learning_rate": 0.00017379206305962526, |
|
"loss": 1.3233, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 3.14410480349345, |
|
"grad_norm": 0.6721891418870005, |
|
"learning_rate": 0.0001732755576398463, |
|
"loss": 1.2795, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 3.165938864628821, |
|
"grad_norm": 0.6262467411308055, |
|
"learning_rate": 0.00017275479583608261, |
|
"loss": 1.3117, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 3.1877729257641922, |
|
"grad_norm": 0.7147271334112754, |
|
"learning_rate": 0.00017222980789801477, |
|
"loss": 1.3604, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 3.2096069868995634, |
|
"grad_norm": 0.677887647709634, |
|
"learning_rate": 0.00017170062432080805, |
|
"loss": 1.3356, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 3.2314410480349345, |
|
"grad_norm": 0.6529188195262589, |
|
"learning_rate": 0.00017116727584334159, |
|
"loss": 1.3092, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 3.2532751091703056, |
|
"grad_norm": 0.6545432757758792, |
|
"learning_rate": 0.00017062979344642244, |
|
"loss": 1.3272, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 3.2751091703056767, |
|
"grad_norm": 0.6417623946150004, |
|
"learning_rate": 0.00017008820835098627, |
|
"loss": 1.3712, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 3.2969432314410483, |
|
"grad_norm": 0.6419119938295037, |
|
"learning_rate": 0.00016954255201628358, |
|
"loss": 1.372, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 3.3187772925764194, |
|
"grad_norm": 0.6643469655488602, |
|
"learning_rate": 0.00016899285613805246, |
|
"loss": 1.3883, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 3.3406113537117905, |
|
"grad_norm": 0.6617592247751748, |
|
"learning_rate": 0.00016843915264667746, |
|
"loss": 1.3131, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 3.3624454148471616, |
|
"grad_norm": 0.6721660291620549, |
|
"learning_rate": 0.00016788147370533482, |
|
"loss": 1.3677, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 3.3842794759825328, |
|
"grad_norm": 0.6696065641348963, |
|
"learning_rate": 0.00016731985170812414, |
|
"loss": 1.3612, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 3.406113537117904, |
|
"grad_norm": 0.6442936479861974, |
|
"learning_rate": 0.00016675431927818678, |
|
"loss": 1.3288, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 3.427947598253275, |
|
"grad_norm": 0.6665292623524364, |
|
"learning_rate": 0.00016618490926581086, |
|
"loss": 1.3302, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 3.449781659388646, |
|
"grad_norm": 0.6832138001277586, |
|
"learning_rate": 0.00016561165474652292, |
|
"loss": 1.296, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 3.4716157205240172, |
|
"grad_norm": 0.6676829472258946, |
|
"learning_rate": 0.0001650345890191669, |
|
"loss": 1.258, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 3.493449781659389, |
|
"grad_norm": 0.6462889175077688, |
|
"learning_rate": 0.00016445374560396974, |
|
"loss": 1.3108, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 3.51528384279476, |
|
"grad_norm": 0.6538188867916314, |
|
"learning_rate": 0.00016386915824059427, |
|
"loss": 1.2225, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 3.537117903930131, |
|
"grad_norm": 0.6573367032320154, |
|
"learning_rate": 0.0001632808608861794, |
|
"loss": 1.2692, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 3.558951965065502, |
|
"grad_norm": 0.6707468426806011, |
|
"learning_rate": 0.0001626888877133677, |
|
"loss": 1.2621, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 3.5807860262008733, |
|
"grad_norm": 0.6607806582929415, |
|
"learning_rate": 0.00016209327310832028, |
|
"loss": 1.3217, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 3.6026200873362444, |
|
"grad_norm": 0.6695542325826566, |
|
"learning_rate": 0.00016149405166871947, |
|
"loss": 1.2445, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 3.6244541484716155, |
|
"grad_norm": 0.6854945885270477, |
|
"learning_rate": 0.00016089125820175913, |
|
"loss": 1.2334, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 3.646288209606987, |
|
"grad_norm": 0.6998882406491346, |
|
"learning_rate": 0.00016028492772212277, |
|
"loss": 1.3228, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 3.668122270742358, |
|
"grad_norm": 0.6481594617699246, |
|
"learning_rate": 0.00015967509544994959, |
|
"loss": 1.3119, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 3.6899563318777293, |
|
"grad_norm": 0.6775238829866298, |
|
"learning_rate": 0.00015906179680878876, |
|
"loss": 1.2587, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 3.7117903930131004, |
|
"grad_norm": 0.667250254807951, |
|
"learning_rate": 0.00015844506742354164, |
|
"loss": 1.335, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 3.7336244541484715, |
|
"grad_norm": 0.6850864766225458, |
|
"learning_rate": 0.00015782494311839248, |
|
"loss": 1.3585, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 3.7554585152838427, |
|
"grad_norm": 0.7100136757699026, |
|
"learning_rate": 0.00015720145991472746, |
|
"loss": 1.3494, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 3.777292576419214, |
|
"grad_norm": 0.6455753266089419, |
|
"learning_rate": 0.00015657465402904239, |
|
"loss": 1.32, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 3.7991266375545854, |
|
"grad_norm": 0.6774008615272976, |
|
"learning_rate": 0.00015594456187083887, |
|
"loss": 1.3053, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 3.8209606986899565, |
|
"grad_norm": 0.6412117270718072, |
|
"learning_rate": 0.0001553112200405094, |
|
"loss": 1.3468, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 3.8427947598253276, |
|
"grad_norm": 0.6299987093163749, |
|
"learning_rate": 0.00015467466532721136, |
|
"loss": 1.2464, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 3.8646288209606987, |
|
"grad_norm": 0.6648608762625429, |
|
"learning_rate": 0.00015403493470673006, |
|
"loss": 1.4054, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 3.88646288209607, |
|
"grad_norm": 0.6973437539106749, |
|
"learning_rate": 0.00015339206533933087, |
|
"loss": 1.3005, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 3.908296943231441, |
|
"grad_norm": 0.6609891540439728, |
|
"learning_rate": 0.00015274609456760073, |
|
"loss": 1.3751, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 3.930131004366812, |
|
"grad_norm": 0.6243806427317503, |
|
"learning_rate": 0.0001520970599142789, |
|
"loss": 1.309, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 3.9519650655021836, |
|
"grad_norm": 0.6701604629698161, |
|
"learning_rate": 0.00015144499908007757, |
|
"loss": 1.3302, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 3.9737991266375547, |
|
"grad_norm": 0.6417002641455068, |
|
"learning_rate": 0.00015078994994149167, |
|
"loss": 1.3244, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 3.995633187772926, |
|
"grad_norm": 0.6227627176738441, |
|
"learning_rate": 0.00015013195054859894, |
|
"loss": 1.3739, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 4.0174672489082965, |
|
"grad_norm": 0.69565512329475, |
|
"learning_rate": 0.00014947103912284958, |
|
"loss": 1.1587, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 4.039301310043668, |
|
"grad_norm": 0.9168936710349829, |
|
"learning_rate": 0.0001488072540548461, |
|
"loss": 1.183, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 4.06113537117904, |
|
"grad_norm": 0.768969897857725, |
|
"learning_rate": 0.00014814063390211334, |
|
"loss": 1.1114, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 4.08296943231441, |
|
"grad_norm": 0.8549092972669946, |
|
"learning_rate": 0.00014747121738685874, |
|
"loss": 1.2111, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 4.104803493449782, |
|
"grad_norm": 0.8316566189643829, |
|
"learning_rate": 0.00014679904339372302, |
|
"loss": 1.1581, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 4.126637554585153, |
|
"grad_norm": 0.8226115993114171, |
|
"learning_rate": 0.00014612415096752155, |
|
"loss": 1.1881, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 4.148471615720524, |
|
"grad_norm": 0.8505226801184157, |
|
"learning_rate": 0.0001454465793109763, |
|
"loss": 1.135, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 4.170305676855895, |
|
"grad_norm": 0.8021211984624651, |
|
"learning_rate": 0.00014476636778243878, |
|
"loss": 1.1768, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 4.192139737991266, |
|
"grad_norm": 0.8578093150750806, |
|
"learning_rate": 0.00014408355589360348, |
|
"loss": 1.0631, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 4.213973799126638, |
|
"grad_norm": 0.8812497659618362, |
|
"learning_rate": 0.00014339818330721314, |
|
"loss": 1.1288, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 4.235807860262009, |
|
"grad_norm": 0.7816446218878502, |
|
"learning_rate": 0.0001427102898347546, |
|
"loss": 1.1777, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 4.25764192139738, |
|
"grad_norm": 0.8163412581216741, |
|
"learning_rate": 0.0001420199154341464, |
|
"loss": 1.1469, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 4.279475982532751, |
|
"grad_norm": 0.8410801009958802, |
|
"learning_rate": 0.0001413271002074176, |
|
"loss": 1.1547, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 4.301310043668122, |
|
"grad_norm": 0.8957226442397316, |
|
"learning_rate": 0.00014063188439837832, |
|
"loss": 1.1054, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 4.323144104803493, |
|
"grad_norm": 0.8533966074014762, |
|
"learning_rate": 0.0001399343083902824, |
|
"loss": 1.1468, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 4.344978165938865, |
|
"grad_norm": 0.7969709395883895, |
|
"learning_rate": 0.00013923441270348124, |
|
"loss": 1.1661, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 4.366812227074236, |
|
"grad_norm": 0.830675985424985, |
|
"learning_rate": 0.00013853223799307031, |
|
"loss": 1.1714, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 4.388646288209607, |
|
"grad_norm": 0.879665119495671, |
|
"learning_rate": 0.00013782782504652763, |
|
"loss": 1.2237, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 4.4104803493449785, |
|
"grad_norm": 0.8513132295064585, |
|
"learning_rate": 0.0001371212147813443, |
|
"loss": 1.2524, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 4.432314410480349, |
|
"grad_norm": 0.8345543594033096, |
|
"learning_rate": 0.00013641244824264803, |
|
"loss": 1.2055, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 4.454148471615721, |
|
"grad_norm": 0.8449282094486232, |
|
"learning_rate": 0.00013570156660081868, |
|
"loss": 1.1459, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 4.475982532751091, |
|
"grad_norm": 0.8491089050635324, |
|
"learning_rate": 0.00013498861114909685, |
|
"loss": 1.165, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 4.497816593886463, |
|
"grad_norm": 0.8675954453498238, |
|
"learning_rate": 0.00013427362330118543, |
|
"loss": 1.1048, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 4.5196506550218345, |
|
"grad_norm": 0.9120386243780424, |
|
"learning_rate": 0.0001335566445888437, |
|
"loss": 1.2427, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 4.541484716157205, |
|
"grad_norm": 0.8105081633081175, |
|
"learning_rate": 0.00013283771665947505, |
|
"loss": 1.278, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 4.563318777292577, |
|
"grad_norm": 0.8869239311496004, |
|
"learning_rate": 0.00013211688127370784, |
|
"loss": 1.1099, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 4.585152838427947, |
|
"grad_norm": 0.8873085989909458, |
|
"learning_rate": 0.00013139418030296937, |
|
"loss": 1.1783, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 4.606986899563319, |
|
"grad_norm": 0.80023844006387, |
|
"learning_rate": 0.00013066965572705401, |
|
"loss": 1.1504, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 4.62882096069869, |
|
"grad_norm": 0.8438162486126547, |
|
"learning_rate": 0.00012994334963168443, |
|
"loss": 1.2292, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 4.650655021834061, |
|
"grad_norm": 0.8687319952846376, |
|
"learning_rate": 0.00012921530420606714, |
|
"loss": 1.2132, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 4.672489082969433, |
|
"grad_norm": 0.8481724475183398, |
|
"learning_rate": 0.00012848556174044183, |
|
"loss": 1.2114, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 4.6943231441048034, |
|
"grad_norm": 0.8170589588250686, |
|
"learning_rate": 0.00012775416462362457, |
|
"loss": 1.2152, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 4.716157205240175, |
|
"grad_norm": 0.8800579649975868, |
|
"learning_rate": 0.00012702115534054593, |
|
"loss": 1.1693, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 4.737991266375546, |
|
"grad_norm": 0.8601610801550544, |
|
"learning_rate": 0.0001262865764697829, |
|
"loss": 1.1846, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 4.759825327510917, |
|
"grad_norm": 0.8386680065719362, |
|
"learning_rate": 0.00012555047068108568, |
|
"loss": 1.249, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 4.781659388646288, |
|
"grad_norm": 0.8517305343726155, |
|
"learning_rate": 0.00012481288073289912, |
|
"loss": 1.1364, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 4.8034934497816595, |
|
"grad_norm": 0.8088860139786535, |
|
"learning_rate": 0.00012407384946987898, |
|
"loss": 1.1527, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 4.825327510917031, |
|
"grad_norm": 0.8583326581924249, |
|
"learning_rate": 0.00012333341982040323, |
|
"loss": 1.1515, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 4.847161572052402, |
|
"grad_norm": 0.9360072468671379, |
|
"learning_rate": 0.00012259163479407832, |
|
"loss": 1.0865, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 4.868995633187773, |
|
"grad_norm": 0.8650317997007926, |
|
"learning_rate": 0.00012184853747924112, |
|
"loss": 1.131, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 4.890829694323144, |
|
"grad_norm": 0.8102946053666945, |
|
"learning_rate": 0.00012110417104045575, |
|
"loss": 1.111, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 4.9126637554585155, |
|
"grad_norm": 0.9358255576727259, |
|
"learning_rate": 0.00012035857871600649, |
|
"loss": 1.2429, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 4.934497816593886, |
|
"grad_norm": 0.8877109383416729, |
|
"learning_rate": 0.00011961180381538599, |
|
"loss": 1.1798, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 4.956331877729258, |
|
"grad_norm": 0.8523579756065384, |
|
"learning_rate": 0.0001188638897167797, |
|
"loss": 1.1524, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 4.978165938864628, |
|
"grad_norm": 0.8309844177132272, |
|
"learning_rate": 0.00011811487986454612, |
|
"loss": 1.2469, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.8482507314409842, |
|
"learning_rate": 0.00011736481776669306, |
|
"loss": 1.1823, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 5.021834061135372, |
|
"grad_norm": 1.0322243914151594, |
|
"learning_rate": 0.00011661374699235057, |
|
"loss": 1.0325, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 5.043668122270742, |
|
"grad_norm": 1.0324089865002601, |
|
"learning_rate": 0.00011586171116924014, |
|
"loss": 1.0234, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 5.065502183406114, |
|
"grad_norm": 0.9762094077290032, |
|
"learning_rate": 0.00011510875398114027, |
|
"loss": 1.0794, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 5.0873362445414845, |
|
"grad_norm": 1.1952296969892835, |
|
"learning_rate": 0.00011435491916534919, |
|
"loss": 1.0145, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 5.109170305676856, |
|
"grad_norm": 1.0895050768888697, |
|
"learning_rate": 0.0001136002505101442, |
|
"loss": 1.0151, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 5.131004366812227, |
|
"grad_norm": 1.0235853080588493, |
|
"learning_rate": 0.00011284479185223812, |
|
"loss": 1.0388, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 5.152838427947598, |
|
"grad_norm": 1.0490311233025102, |
|
"learning_rate": 0.00011208858707423299, |
|
"loss": 1.0072, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 5.17467248908297, |
|
"grad_norm": 1.1856197953264118, |
|
"learning_rate": 0.00011133168010207091, |
|
"loss": 1.0504, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 5.1965065502183405, |
|
"grad_norm": 1.0065811557379292, |
|
"learning_rate": 0.00011057411490248266, |
|
"loss": 0.9977, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 5.218340611353712, |
|
"grad_norm": 1.1043632872539175, |
|
"learning_rate": 0.00010981593548043374, |
|
"loss": 0.9932, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 5.240174672489083, |
|
"grad_norm": 1.0753914347833422, |
|
"learning_rate": 0.00010905718587656811, |
|
"loss": 1.092, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 5.262008733624454, |
|
"grad_norm": 1.0266820326577377, |
|
"learning_rate": 0.0001082979101646502, |
|
"loss": 1.0655, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 5.283842794759825, |
|
"grad_norm": 0.9941194317158725, |
|
"learning_rate": 0.00010753815244900458, |
|
"loss": 0.9828, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 5.3056768558951966, |
|
"grad_norm": 1.084324048580049, |
|
"learning_rate": 0.00010677795686195422, |
|
"loss": 1.0229, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 5.327510917030567, |
|
"grad_norm": 1.051439528201926, |
|
"learning_rate": 0.00010601736756125685, |
|
"loss": 1.0168, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 5.349344978165939, |
|
"grad_norm": 1.1580814102197374, |
|
"learning_rate": 0.00010525642872753996, |
|
"loss": 0.935, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 5.37117903930131, |
|
"grad_norm": 1.0710336894680983, |
|
"learning_rate": 0.00010449518456173456, |
|
"loss": 1.067, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 5.393013100436681, |
|
"grad_norm": 1.0463850478020345, |
|
"learning_rate": 0.00010373367928250749, |
|
"loss": 1.0489, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 5.414847161572053, |
|
"grad_norm": 1.1057562523337745, |
|
"learning_rate": 0.00010297195712369311, |
|
"loss": 0.954, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 5.436681222707423, |
|
"grad_norm": 1.0647508212261871, |
|
"learning_rate": 0.0001022100623317237, |
|
"loss": 0.9094, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 5.458515283842795, |
|
"grad_norm": 1.0854376270937827, |
|
"learning_rate": 0.00010144803916305925, |
|
"loss": 0.9996, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 5.4803493449781655, |
|
"grad_norm": 1.0768231071114585, |
|
"learning_rate": 0.00010068593188161697, |
|
"loss": 1.0098, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 5.502183406113537, |
|
"grad_norm": 1.0629470347878223, |
|
"learning_rate": 9.992378475619981e-05, |
|
"loss": 1.0252, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 5.524017467248909, |
|
"grad_norm": 1.051250411684801, |
|
"learning_rate": 9.916164205792527e-05, |
|
"loss": 0.9879, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 5.545851528384279, |
|
"grad_norm": 1.0100574406024927, |
|
"learning_rate": 9.839954805765364e-05, |
|
"loss": 1.0638, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 5.567685589519651, |
|
"grad_norm": 1.0416962539798005, |
|
"learning_rate": 9.763754702341646e-05, |
|
"loss": 0.9556, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 5.5895196506550215, |
|
"grad_norm": 1.041656249717949, |
|
"learning_rate": 9.687568321784509e-05, |
|
"loss": 1.0295, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 5.611353711790393, |
|
"grad_norm": 1.057095267929098, |
|
"learning_rate": 9.611400089559975e-05, |
|
"loss": 1.0233, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 5.633187772925764, |
|
"grad_norm": 1.0153521926215252, |
|
"learning_rate": 9.535254430079864e-05, |
|
"loss": 0.9867, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 5.655021834061135, |
|
"grad_norm": 1.1345238181227135, |
|
"learning_rate": 9.459135766444815e-05, |
|
"loss": 1.0027, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 5.676855895196507, |
|
"grad_norm": 1.1187108189925221, |
|
"learning_rate": 9.383048520187344e-05, |
|
"loss": 0.9987, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 5.698689956331878, |
|
"grad_norm": 1.0636989170846824, |
|
"learning_rate": 9.306997111015014e-05, |
|
"loss": 1.0486, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 5.720524017467249, |
|
"grad_norm": 1.0706396029944643, |
|
"learning_rate": 9.23098595655371e-05, |
|
"loss": 0.9931, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 5.74235807860262, |
|
"grad_norm": 1.0475360658888055, |
|
"learning_rate": 9.155019472091022e-05, |
|
"loss": 0.9749, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 5.764192139737991, |
|
"grad_norm": 1.0214139834738702, |
|
"learning_rate": 9.079102070319786e-05, |
|
"loss": 1.0693, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 5.786026200873362, |
|
"grad_norm": 1.0530973134949948, |
|
"learning_rate": 9.003238161081743e-05, |
|
"loss": 1.0228, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 5.807860262008734, |
|
"grad_norm": 1.103677474707846, |
|
"learning_rate": 8.9274321511114e-05, |
|
"loss": 0.9761, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 5.829694323144105, |
|
"grad_norm": 1.0644633976825475, |
|
"learning_rate": 8.851688443780043e-05, |
|
"loss": 1.0239, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 5.851528384279476, |
|
"grad_norm": 1.0555551246349646, |
|
"learning_rate": 8.776011438839977e-05, |
|
"loss": 1.0473, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 5.873362445414847, |
|
"grad_norm": 1.122056836899885, |
|
"learning_rate": 8.70040553216892e-05, |
|
"loss": 0.9723, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 5.895196506550218, |
|
"grad_norm": 0.9963873601564418, |
|
"learning_rate": 8.624875115514697e-05, |
|
"loss": 1.0268, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 5.91703056768559, |
|
"grad_norm": 1.0336940380478288, |
|
"learning_rate": 8.549424576240102e-05, |
|
"loss": 0.9574, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 5.93886462882096, |
|
"grad_norm": 1.0760501661341102, |
|
"learning_rate": 8.474058297068071e-05, |
|
"loss": 1.0979, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 5.960698689956332, |
|
"grad_norm": 1.0890942343064978, |
|
"learning_rate": 8.398780655827096e-05, |
|
"loss": 0.9427, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 5.9825327510917035, |
|
"grad_norm": 1.1415589599070428, |
|
"learning_rate": 8.323596025196911e-05, |
|
"loss": 1.0041, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 6.004366812227074, |
|
"grad_norm": 1.1509480354124522, |
|
"learning_rate": 8.248508772454529e-05, |
|
"loss": 0.9545, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 6.026200873362446, |
|
"grad_norm": 1.4252192004046074, |
|
"learning_rate": 8.173523259220521e-05, |
|
"loss": 0.8584, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 6.048034934497816, |
|
"grad_norm": 1.37449264500959, |
|
"learning_rate": 8.098643841205685e-05, |
|
"loss": 0.8417, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 6.069868995633188, |
|
"grad_norm": 1.1455224268481572, |
|
"learning_rate": 8.023874867958027e-05, |
|
"loss": 0.8365, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 6.091703056768559, |
|
"grad_norm": 1.4517829308762, |
|
"learning_rate": 7.949220682610109e-05, |
|
"loss": 0.8772, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 6.11353711790393, |
|
"grad_norm": 1.2198167511326543, |
|
"learning_rate": 7.874685621626767e-05, |
|
"loss": 0.7638, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 6.135371179039302, |
|
"grad_norm": 1.2285129578196883, |
|
"learning_rate": 7.80027401455321e-05, |
|
"loss": 0.8632, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 6.157205240174672, |
|
"grad_norm": 1.3128304307256697, |
|
"learning_rate": 7.725990183763541e-05, |
|
"loss": 0.7864, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 6.179039301310044, |
|
"grad_norm": 1.2223078968334138, |
|
"learning_rate": 7.651838444209678e-05, |
|
"loss": 0.8107, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 6.200873362445415, |
|
"grad_norm": 1.2116494464520935, |
|
"learning_rate": 7.577823103170695e-05, |
|
"loss": 0.7665, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 6.222707423580786, |
|
"grad_norm": 1.2710441404657735, |
|
"learning_rate": 7.503948460002651e-05, |
|
"loss": 0.8755, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 6.244541484716157, |
|
"grad_norm": 1.3090624013120402, |
|
"learning_rate": 7.430218805888831e-05, |
|
"loss": 0.8635, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 6.2663755458515285, |
|
"grad_norm": 1.3421074934086965, |
|
"learning_rate": 7.356638423590485e-05, |
|
"loss": 0.8408, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 6.2882096069869, |
|
"grad_norm": 1.2190845509165837, |
|
"learning_rate": 7.283211587198056e-05, |
|
"loss": 0.901, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 6.310043668122271, |
|
"grad_norm": 1.2848215636192764, |
|
"learning_rate": 7.209942561882914e-05, |
|
"loss": 0.8183, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 6.331877729257642, |
|
"grad_norm": 1.2782891875098124, |
|
"learning_rate": 7.136835603649599e-05, |
|
"loss": 0.8144, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 6.353711790393013, |
|
"grad_norm": 1.3643650683015465, |
|
"learning_rate": 7.0638949590886e-05, |
|
"loss": 0.815, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 6.3755458515283845, |
|
"grad_norm": 1.3747527932453862, |
|
"learning_rate": 6.991124865129683e-05, |
|
"loss": 0.8058, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 6.397379912663755, |
|
"grad_norm": 1.324272429862148, |
|
"learning_rate": 6.918529548795781e-05, |
|
"loss": 0.8359, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 6.419213973799127, |
|
"grad_norm": 1.329733488330745, |
|
"learning_rate": 6.846113226957456e-05, |
|
"loss": 0.8081, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 6.441048034934497, |
|
"grad_norm": 1.3973165024504683, |
|
"learning_rate": 6.773880106087945e-05, |
|
"loss": 0.9255, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 6.462882096069869, |
|
"grad_norm": 1.2579802684349493, |
|
"learning_rate": 6.701834382018832e-05, |
|
"loss": 0.8932, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 6.4847161572052405, |
|
"grad_norm": 1.3102021281865468, |
|
"learning_rate": 6.629980239696315e-05, |
|
"loss": 0.8651, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 6.506550218340611, |
|
"grad_norm": 1.3096833467828455, |
|
"learning_rate": 6.558321852938099e-05, |
|
"loss": 0.8145, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 6.528384279475983, |
|
"grad_norm": 1.3466234774293075, |
|
"learning_rate": 6.486863384190987e-05, |
|
"loss": 0.8885, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 6.550218340611353, |
|
"grad_norm": 1.296177045867574, |
|
"learning_rate": 6.415608984289052e-05, |
|
"loss": 0.8546, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 6.572052401746725, |
|
"grad_norm": 1.2633240958805778, |
|
"learning_rate": 6.344562792212554e-05, |
|
"loss": 0.8685, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 6.593886462882097, |
|
"grad_norm": 1.3534959976317207, |
|
"learning_rate": 6.273728934847516e-05, |
|
"loss": 0.7986, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 6.615720524017467, |
|
"grad_norm": 1.252619828877404, |
|
"learning_rate": 6.203111526745985e-05, |
|
"loss": 0.8332, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 6.637554585152839, |
|
"grad_norm": 1.3249258619095206, |
|
"learning_rate": 6.132714669887044e-05, |
|
"loss": 0.8308, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 6.6593886462882095, |
|
"grad_norm": 1.2054299066608396, |
|
"learning_rate": 6.0625424534385425e-05, |
|
"loss": 0.8697, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 6.681222707423581, |
|
"grad_norm": 1.1955192517041973, |
|
"learning_rate": 5.99259895351955e-05, |
|
"loss": 0.8591, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 6.703056768558952, |
|
"grad_norm": 1.2570222378899258, |
|
"learning_rate": 5.9228882329636094e-05, |
|
"loss": 0.7953, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 6.724890829694323, |
|
"grad_norm": 1.3956159571082007, |
|
"learning_rate": 5.8534143410827104e-05, |
|
"loss": 0.8367, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 6.746724890829694, |
|
"grad_norm": 1.3649251909287845, |
|
"learning_rate": 5.7841813134320975e-05, |
|
"loss": 0.8553, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 6.7685589519650655, |
|
"grad_norm": 1.3726673140188062, |
|
"learning_rate": 5.715193171575842e-05, |
|
"loss": 0.8649, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 6.790393013100436, |
|
"grad_norm": 1.2184325480564675, |
|
"learning_rate": 5.64645392285325e-05, |
|
"loss": 0.8222, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 6.812227074235808, |
|
"grad_norm": 1.3383861408747, |
|
"learning_rate": 5.577967560146077e-05, |
|
"loss": 0.851, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 6.834061135371179, |
|
"grad_norm": 1.3171908370457348, |
|
"learning_rate": 5.5097380616466057e-05, |
|
"loss": 0.8662, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 6.85589519650655, |
|
"grad_norm": 1.3054511507141975, |
|
"learning_rate": 5.4417693906265365e-05, |
|
"loss": 0.8979, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 6.877729257641922, |
|
"grad_norm": 1.2387745464762083, |
|
"learning_rate": 5.374065495206805e-05, |
|
"loss": 0.8119, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 6.899563318777292, |
|
"grad_norm": 1.3948067425415336, |
|
"learning_rate": 5.306630308128229e-05, |
|
"loss": 0.8409, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 6.921397379912664, |
|
"grad_norm": 1.3502073303444249, |
|
"learning_rate": 5.239467746523048e-05, |
|
"loss": 0.8391, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 6.9432314410480345, |
|
"grad_norm": 1.3334535387387385, |
|
"learning_rate": 5.172581711687438e-05, |
|
"loss": 0.8577, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 6.965065502183406, |
|
"grad_norm": 1.3381286547460995, |
|
"learning_rate": 5.105976088854842e-05, |
|
"loss": 0.8925, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 6.986899563318778, |
|
"grad_norm": 1.2107404270853181, |
|
"learning_rate": 5.0396547469703106e-05, |
|
"loss": 0.8894, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 7.008733624454148, |
|
"grad_norm": 1.2856577830397042, |
|
"learning_rate": 4.973621538465768e-05, |
|
"loss": 0.8269, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 7.03056768558952, |
|
"grad_norm": 1.5720811121732976, |
|
"learning_rate": 4.907880299036234e-05, |
|
"loss": 0.6532, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 7.0524017467248905, |
|
"grad_norm": 1.513790910492325, |
|
"learning_rate": 4.8424348474170014e-05, |
|
"loss": 0.6398, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 7.074235807860262, |
|
"grad_norm": 1.3731923874867311, |
|
"learning_rate": 4.7772889851618405e-05, |
|
"loss": 0.7323, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 7.096069868995633, |
|
"grad_norm": 1.3538833010462115, |
|
"learning_rate": 4.712446496422165e-05, |
|
"loss": 0.6906, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 7.117903930131004, |
|
"grad_norm": 1.4454754507310241, |
|
"learning_rate": 4.647911147727209e-05, |
|
"loss": 0.7328, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 7.139737991266376, |
|
"grad_norm": 1.449171514767732, |
|
"learning_rate": 4.583686687765264e-05, |
|
"loss": 0.6782, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 7.1615720524017465, |
|
"grad_norm": 1.6050731544814476, |
|
"learning_rate": 4.5197768471659104e-05, |
|
"loss": 0.7385, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 7.183406113537118, |
|
"grad_norm": 1.3388820740639182, |
|
"learning_rate": 4.4561853382833206e-05, |
|
"loss": 0.6937, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 7.205240174672489, |
|
"grad_norm": 1.3724590934014314, |
|
"learning_rate": 4.3929158549806096e-05, |
|
"loss": 0.6899, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 7.22707423580786, |
|
"grad_norm": 1.5165378554181088, |
|
"learning_rate": 4.32997207241528e-05, |
|
"loss": 0.7044, |
|
"step": 1655 |
|
}, |
|
{ |
|
"epoch": 7.248908296943231, |
|
"grad_norm": 1.6117091341741596, |
|
"learning_rate": 4.267357646825746e-05, |
|
"loss": 0.7093, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 7.270742358078603, |
|
"grad_norm": 1.4490430208656846, |
|
"learning_rate": 4.205076215318925e-05, |
|
"loss": 0.6967, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 7.292576419213974, |
|
"grad_norm": 1.469380632694007, |
|
"learning_rate": 4.143131395658996e-05, |
|
"loss": 0.7164, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 7.314410480349345, |
|
"grad_norm": 1.519813749480573, |
|
"learning_rate": 4.081526786057254e-05, |
|
"loss": 0.6724, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 7.336244541484716, |
|
"grad_norm": 1.4588612796193572, |
|
"learning_rate": 4.020265964963066e-05, |
|
"loss": 0.731, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 7.358078602620087, |
|
"grad_norm": 1.429975103899558, |
|
"learning_rate": 3.9593524908560464e-05, |
|
"loss": 0.7327, |
|
"step": 1685 |
|
}, |
|
{ |
|
"epoch": 7.379912663755459, |
|
"grad_norm": 1.6113371519439155, |
|
"learning_rate": 3.898789902039338e-05, |
|
"loss": 0.709, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 7.401746724890829, |
|
"grad_norm": 1.5463250020435173, |
|
"learning_rate": 3.8385817164340723e-05, |
|
"loss": 0.7246, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 7.423580786026201, |
|
"grad_norm": 1.463616658449462, |
|
"learning_rate": 3.778731431375041e-05, |
|
"loss": 0.7013, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 7.445414847161572, |
|
"grad_norm": 1.5566283347360372, |
|
"learning_rate": 3.719242523407539e-05, |
|
"loss": 0.7344, |
|
"step": 1705 |
|
}, |
|
{ |
|
"epoch": 7.467248908296943, |
|
"grad_norm": 1.4398876621434327, |
|
"learning_rate": 3.6601184480854066e-05, |
|
"loss": 0.7323, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 7.489082969432315, |
|
"grad_norm": 1.5252195085267406, |
|
"learning_rate": 3.601362639770328e-05, |
|
"loss": 0.7091, |
|
"step": 1715 |
|
}, |
|
{ |
|
"epoch": 7.510917030567685, |
|
"grad_norm": 1.6881923973765582, |
|
"learning_rate": 3.542978511432325e-05, |
|
"loss": 0.7585, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 7.532751091703057, |
|
"grad_norm": 1.450411669170985, |
|
"learning_rate": 3.484969454451511e-05, |
|
"loss": 0.7258, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 7.554585152838428, |
|
"grad_norm": 1.523075629927926, |
|
"learning_rate": 3.4273388384210855e-05, |
|
"loss": 0.6716, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 7.576419213973799, |
|
"grad_norm": 1.4565661875305633, |
|
"learning_rate": 3.3700900109516184e-05, |
|
"loss": 0.6586, |
|
"step": 1735 |
|
}, |
|
{ |
|
"epoch": 7.598253275109171, |
|
"grad_norm": 1.6164061159781367, |
|
"learning_rate": 3.3132262974765906e-05, |
|
"loss": 0.7123, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 7.620087336244541, |
|
"grad_norm": 1.5157276652461606, |
|
"learning_rate": 3.256751001059214e-05, |
|
"loss": 0.723, |
|
"step": 1745 |
|
}, |
|
{ |
|
"epoch": 7.641921397379913, |
|
"grad_norm": 1.5600828466807661, |
|
"learning_rate": 3.200667402200586e-05, |
|
"loss": 0.7477, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 7.663755458515284, |
|
"grad_norm": 1.5728766811598356, |
|
"learning_rate": 3.144978758649133e-05, |
|
"loss": 0.7001, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 7.685589519650655, |
|
"grad_norm": 1.458458820326098, |
|
"learning_rate": 3.0896883052113525e-05, |
|
"loss": 0.7066, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 7.707423580786026, |
|
"grad_norm": 1.5980960571332508, |
|
"learning_rate": 3.034799253563939e-05, |
|
"loss": 0.6878, |
|
"step": 1765 |
|
}, |
|
{ |
|
"epoch": 7.729257641921397, |
|
"grad_norm": 1.544990160417705, |
|
"learning_rate": 2.9803147920672146e-05, |
|
"loss": 0.6894, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 7.751091703056769, |
|
"grad_norm": 1.6353637199811462, |
|
"learning_rate": 2.9262380855799164e-05, |
|
"loss": 0.7297, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 7.77292576419214, |
|
"grad_norm": 1.5830327061313785, |
|
"learning_rate": 2.872572275275379e-05, |
|
"loss": 0.6983, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 7.794759825327511, |
|
"grad_norm": 1.4630236788620592, |
|
"learning_rate": 2.8193204784590597e-05, |
|
"loss": 0.7176, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 7.816593886462882, |
|
"grad_norm": 1.3928429612080049, |
|
"learning_rate": 2.766485788387455e-05, |
|
"loss": 0.7269, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 7.8384279475982535, |
|
"grad_norm": 1.5318894677152983, |
|
"learning_rate": 2.7140712740884376e-05, |
|
"loss": 0.7094, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 7.860262008733624, |
|
"grad_norm": 1.5006081477924398, |
|
"learning_rate": 2.6620799801829765e-05, |
|
"loss": 0.7356, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 7.882096069868996, |
|
"grad_norm": 1.4913103972222401, |
|
"learning_rate": 2.610514926708285e-05, |
|
"loss": 0.7563, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 7.903930131004367, |
|
"grad_norm": 1.5640971425352013, |
|
"learning_rate": 2.5593791089423858e-05, |
|
"loss": 0.6974, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 7.925764192139738, |
|
"grad_norm": 1.5288946416263425, |
|
"learning_rate": 2.5086754972301384e-05, |
|
"loss": 0.7597, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 7.9475982532751095, |
|
"grad_norm": 1.4850580236939783, |
|
"learning_rate": 2.4584070368106928e-05, |
|
"loss": 0.731, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 7.96943231441048, |
|
"grad_norm": 1.5394321618591782, |
|
"learning_rate": 2.4085766476463967e-05, |
|
"loss": 0.712, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 7.991266375545852, |
|
"grad_norm": 1.6849009773020913, |
|
"learning_rate": 2.3591872242532066e-05, |
|
"loss": 0.7327, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 8.013100436681222, |
|
"grad_norm": 1.4739518847631212, |
|
"learning_rate": 2.310241635532531e-05, |
|
"loss": 0.6777, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 8.034934497816593, |
|
"grad_norm": 1.4710344726002955, |
|
"learning_rate": 2.2617427246045973e-05, |
|
"loss": 0.5886, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 8.056768558951966, |
|
"grad_norm": 1.8620287227429924, |
|
"learning_rate": 2.2136933086432955e-05, |
|
"loss": 0.6258, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 8.078602620087336, |
|
"grad_norm": 1.4930799825826104, |
|
"learning_rate": 2.1660961787125388e-05, |
|
"loss": 0.6041, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 8.100436681222707, |
|
"grad_norm": 1.469008850941141, |
|
"learning_rate": 2.1189540996041313e-05, |
|
"loss": 0.647, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 8.12227074235808, |
|
"grad_norm": 1.6154095310822976, |
|
"learning_rate": 2.0722698096771832e-05, |
|
"loss": 0.5866, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 8.14410480349345, |
|
"grad_norm": 1.7217187185906804, |
|
"learning_rate": 2.026046020699035e-05, |
|
"loss": 0.6718, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 8.16593886462882, |
|
"grad_norm": 1.5816769270478201, |
|
"learning_rate": 1.980285417687735e-05, |
|
"loss": 0.6303, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 8.187772925764191, |
|
"grad_norm": 1.5330680888020691, |
|
"learning_rate": 1.9349906587560862e-05, |
|
"loss": 0.6166, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 8.209606986899564, |
|
"grad_norm": 1.4811035124378678, |
|
"learning_rate": 1.8901643749572374e-05, |
|
"loss": 0.6245, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 8.231441048034934, |
|
"grad_norm": 1.6432943675024339, |
|
"learning_rate": 1.8458091701318504e-05, |
|
"loss": 0.6261, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 8.253275109170305, |
|
"grad_norm": 1.6763677385421005, |
|
"learning_rate": 1.801927620756847e-05, |
|
"loss": 0.6468, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 8.275109170305678, |
|
"grad_norm": 1.5425807262553166, |
|
"learning_rate": 1.7585222757957576e-05, |
|
"loss": 0.6059, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 8.296943231441048, |
|
"grad_norm": 1.7341819757802275, |
|
"learning_rate": 1.7155956565506547e-05, |
|
"loss": 0.6728, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 8.318777292576419, |
|
"grad_norm": 1.4483276727621572, |
|
"learning_rate": 1.6731502565156875e-05, |
|
"loss": 0.6033, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 8.34061135371179, |
|
"grad_norm": 1.5596252995782947, |
|
"learning_rate": 1.6311885412322602e-05, |
|
"loss": 0.63, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 8.362445414847162, |
|
"grad_norm": 1.747814876036389, |
|
"learning_rate": 1.5897129481457996e-05, |
|
"loss": 0.5621, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 8.384279475982533, |
|
"grad_norm": 1.5550499771354138, |
|
"learning_rate": 1.5487258864641717e-05, |
|
"loss": 0.6306, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 8.406113537117903, |
|
"grad_norm": 1.5708977251660243, |
|
"learning_rate": 1.50822973701775e-05, |
|
"loss": 0.6281, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 8.427947598253276, |
|
"grad_norm": 1.5052322890768695, |
|
"learning_rate": 1.4682268521211073e-05, |
|
"loss": 0.5805, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 8.449781659388647, |
|
"grad_norm": 1.5682376306714874, |
|
"learning_rate": 1.4287195554363718e-05, |
|
"loss": 0.6103, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 8.471615720524017, |
|
"grad_norm": 1.5926672273219815, |
|
"learning_rate": 1.3897101418382663e-05, |
|
"loss": 0.6086, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 8.493449781659388, |
|
"grad_norm": 1.6920842457267133, |
|
"learning_rate": 1.3512008772807993e-05, |
|
"loss": 0.6075, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 8.51528384279476, |
|
"grad_norm": 1.7066301478594386, |
|
"learning_rate": 1.3131939986656305e-05, |
|
"loss": 0.6037, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 8.537117903930131, |
|
"grad_norm": 1.6263309583877439, |
|
"learning_rate": 1.2756917137121527e-05, |
|
"loss": 0.6137, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 8.558951965065502, |
|
"grad_norm": 1.63476708848148, |
|
"learning_rate": 1.2386962008292413e-05, |
|
"loss": 0.5858, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 8.580786026200874, |
|
"grad_norm": 1.558434256481925, |
|
"learning_rate": 1.2022096089887191e-05, |
|
"loss": 0.6426, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 8.602620087336245, |
|
"grad_norm": 1.551619834712178, |
|
"learning_rate": 1.1662340576005216e-05, |
|
"loss": 0.6084, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 8.624454148471616, |
|
"grad_norm": 1.555097964766268, |
|
"learning_rate": 1.130771636389596e-05, |
|
"loss": 0.6687, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 8.646288209606986, |
|
"grad_norm": 1.6643047208497839, |
|
"learning_rate": 1.0958244052745126e-05, |
|
"loss": 0.6155, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 8.668122270742359, |
|
"grad_norm": 1.611005114630636, |
|
"learning_rate": 1.0613943942478e-05, |
|
"loss": 0.6089, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 8.68995633187773, |
|
"grad_norm": 1.5244955674562928, |
|
"learning_rate": 1.0274836032580415e-05, |
|
"loss": 0.6487, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 8.7117903930131, |
|
"grad_norm": 1.6030907080208872, |
|
"learning_rate": 9.940940020936951e-06, |
|
"loss": 0.6293, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 8.733624454148472, |
|
"grad_norm": 1.5700446980317904, |
|
"learning_rate": 9.612275302686713e-06, |
|
"loss": 0.6326, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 8.755458515283843, |
|
"grad_norm": 1.5394752226532917, |
|
"learning_rate": 9.288860969096857e-06, |
|
"loss": 0.6107, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 8.777292576419214, |
|
"grad_norm": 1.667355715027956, |
|
"learning_rate": 8.970715806453489e-06, |
|
"loss": 0.636, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 8.799126637554584, |
|
"grad_norm": 1.7421757272877927, |
|
"learning_rate": 8.657858294970412e-06, |
|
"loss": 0.6358, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 8.820960698689957, |
|
"grad_norm": 1.48526713960321, |
|
"learning_rate": 8.350306607715774e-06, |
|
"loss": 0.6456, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 8.842794759825328, |
|
"grad_norm": 1.7150442937256956, |
|
"learning_rate": 8.048078609556386e-06, |
|
"loss": 0.6443, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 8.864628820960698, |
|
"grad_norm": 1.6511037454437418, |
|
"learning_rate": 7.751191856119932e-06, |
|
"loss": 0.671, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 8.886462882096069, |
|
"grad_norm": 1.6987081532253472, |
|
"learning_rate": 7.459663592775334e-06, |
|
"loss": 0.6577, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 8.908296943231441, |
|
"grad_norm": 1.534952723839, |
|
"learning_rate": 7.173510753630919e-06, |
|
"loss": 0.6233, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 8.930131004366812, |
|
"grad_norm": 1.641736331583932, |
|
"learning_rate": 6.892749960550815e-06, |
|
"loss": 0.6289, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 8.951965065502183, |
|
"grad_norm": 1.5438647483260877, |
|
"learning_rate": 6.6173975221893615e-06, |
|
"loss": 0.5888, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 8.973799126637555, |
|
"grad_norm": 1.5243321966646095, |
|
"learning_rate": 6.347469433043851e-06, |
|
"loss": 0.6707, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 8.995633187772926, |
|
"grad_norm": 1.6062799594767991, |
|
"learning_rate": 6.082981372525487e-06, |
|
"loss": 0.5971, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 9.017467248908297, |
|
"grad_norm": 1.462091614630793, |
|
"learning_rate": 5.823948704048443e-06, |
|
"loss": 0.5631, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 9.039301310043669, |
|
"grad_norm": 1.4923902769131498, |
|
"learning_rate": 5.570386474137623e-06, |
|
"loss": 0.5617, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 9.06113537117904, |
|
"grad_norm": 1.6248158642620525, |
|
"learning_rate": 5.322309411554582e-06, |
|
"loss": 0.6111, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 9.08296943231441, |
|
"grad_norm": 1.6615679454812948, |
|
"learning_rate": 5.0797319264419105e-06, |
|
"loss": 0.563, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 9.104803493449781, |
|
"grad_norm": 1.6126706246375568, |
|
"learning_rate": 4.84266810948627e-06, |
|
"loss": 0.5686, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 9.126637554585153, |
|
"grad_norm": 1.5809643369161743, |
|
"learning_rate": 4.611131731099905e-06, |
|
"loss": 0.5533, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 9.148471615720524, |
|
"grad_norm": 1.6161716990207524, |
|
"learning_rate": 4.385136240620657e-06, |
|
"loss": 0.5962, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 9.170305676855895, |
|
"grad_norm": 1.5154717234389132, |
|
"learning_rate": 4.164694765530841e-06, |
|
"loss": 0.5946, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 9.192139737991265, |
|
"grad_norm": 1.5768496797034472, |
|
"learning_rate": 3.94982011069468e-06, |
|
"loss": 0.5383, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 9.213973799126638, |
|
"grad_norm": 1.6583634682510404, |
|
"learning_rate": 3.7405247576144054e-06, |
|
"loss": 0.6018, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 9.235807860262009, |
|
"grad_norm": 1.5801691223578864, |
|
"learning_rate": 3.5368208637053702e-06, |
|
"loss": 0.5564, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 9.25764192139738, |
|
"grad_norm": 1.702349073192532, |
|
"learning_rate": 3.338720261589823e-06, |
|
"loss": 0.578, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 9.279475982532752, |
|
"grad_norm": 1.5184620025021562, |
|
"learning_rate": 3.146234458409525e-06, |
|
"loss": 0.5649, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 9.301310043668122, |
|
"grad_norm": 1.4782870440067606, |
|
"learning_rate": 2.959374635157364e-06, |
|
"loss": 0.5708, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 9.323144104803493, |
|
"grad_norm": 1.5908300327141753, |
|
"learning_rate": 2.7781516460279157e-06, |
|
"loss": 0.5719, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 9.344978165938866, |
|
"grad_norm": 1.653067982991365, |
|
"learning_rate": 2.6025760177869063e-06, |
|
"loss": 0.5914, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 9.366812227074236, |
|
"grad_norm": 1.5475103396862675, |
|
"learning_rate": 2.4326579491597333e-06, |
|
"loss": 0.5931, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 9.388646288209607, |
|
"grad_norm": 1.6475600779910462, |
|
"learning_rate": 2.2684073102391066e-06, |
|
"loss": 0.5978, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 9.410480349344978, |
|
"grad_norm": 1.5891384582888524, |
|
"learning_rate": 2.1098336419116625e-06, |
|
"loss": 0.5656, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 9.43231441048035, |
|
"grad_norm": 1.5178759944544706, |
|
"learning_rate": 1.956946155303785e-06, |
|
"loss": 0.6198, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 9.45414847161572, |
|
"grad_norm": 1.641246543950594, |
|
"learning_rate": 1.809753731246544e-06, |
|
"loss": 0.5829, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 9.475982532751091, |
|
"grad_norm": 1.665331469610374, |
|
"learning_rate": 1.6682649197598433e-06, |
|
"loss": 0.5871, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 9.497816593886462, |
|
"grad_norm": 1.823654408381571, |
|
"learning_rate": 1.5324879395557933e-06, |
|
"loss": 0.5906, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 9.519650655021834, |
|
"grad_norm": 1.564912693530831, |
|
"learning_rate": 1.4024306775612283e-06, |
|
"loss": 0.6207, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 9.541484716157205, |
|
"grad_norm": 1.5863861199272236, |
|
"learning_rate": 1.2781006884596825e-06, |
|
"loss": 0.6267, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 9.563318777292576, |
|
"grad_norm": 1.6063472205433305, |
|
"learning_rate": 1.1595051942524637e-06, |
|
"loss": 0.5755, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 9.585152838427948, |
|
"grad_norm": 1.5366202264694289, |
|
"learning_rate": 1.0466510838392229e-06, |
|
"loss": 0.5384, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 9.606986899563319, |
|
"grad_norm": 1.61763818006577, |
|
"learning_rate": 9.395449126177291e-07, |
|
"loss": 0.6435, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 9.62882096069869, |
|
"grad_norm": 1.5959225303743334, |
|
"learning_rate": 8.381929021031409e-07, |
|
"loss": 0.5587, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 9.65065502183406, |
|
"grad_norm": 1.5317193865637022, |
|
"learning_rate": 7.426009395665734e-07, |
|
"loss": 0.6166, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 9.672489082969433, |
|
"grad_norm": 1.7050859904255076, |
|
"learning_rate": 6.527745776931382e-07, |
|
"loss": 0.6078, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 9.694323144104803, |
|
"grad_norm": 1.626127532872475, |
|
"learning_rate": 5.687190342594239e-07, |
|
"loss": 0.6022, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 9.716157205240174, |
|
"grad_norm": 1.6752520068014993, |
|
"learning_rate": 4.904391918303608e-07, |
|
"loss": 0.6126, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 9.737991266375547, |
|
"grad_norm": 1.6226696728292287, |
|
"learning_rate": 4.1793959747565836e-07, |
|
"loss": 0.5671, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 9.759825327510917, |
|
"grad_norm": 1.6182868533567345, |
|
"learning_rate": 3.5122446250562825e-07, |
|
"loss": 0.5859, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 9.781659388646288, |
|
"grad_norm": 1.686162675556006, |
|
"learning_rate": 2.902976622265907e-07, |
|
"loss": 0.5634, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 9.803493449781659, |
|
"grad_norm": 1.5169542450593143, |
|
"learning_rate": 2.3516273571577708e-07, |
|
"loss": 0.5578, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 9.825327510917031, |
|
"grad_norm": 1.6287222229266196, |
|
"learning_rate": 1.8582288561573847e-07, |
|
"loss": 0.5543, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 9.847161572052402, |
|
"grad_norm": 1.4791363351235698, |
|
"learning_rate": 1.4228097794828366e-07, |
|
"loss": 0.5705, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 9.868995633187772, |
|
"grad_norm": 1.5782969298269662, |
|
"learning_rate": 1.045395419480677e-07, |
|
"loss": 0.5742, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 9.890829694323145, |
|
"grad_norm": 1.5934772169541918, |
|
"learning_rate": 7.260076991560949e-08, |
|
"loss": 0.6509, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 9.912663755458516, |
|
"grad_norm": 1.5072353913111483, |
|
"learning_rate": 4.646651708998251e-08, |
|
"loss": 0.5509, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 9.934497816593886, |
|
"grad_norm": 1.5448684833958992, |
|
"learning_rate": 2.6138301541056564e-08, |
|
"loss": 0.5666, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 9.956331877729257, |
|
"grad_norm": 1.504682951530182, |
|
"learning_rate": 1.1617304081268376e-08, |
|
"loss": 0.5865, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 9.97816593886463, |
|
"grad_norm": 1.5933513163764192, |
|
"learning_rate": 2.9043681970875035e-09, |
|
"loss": 0.5653, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 1.5918112804762306, |
|
"learning_rate": 0.0, |
|
"loss": 0.5502, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"step": 2290, |
|
"total_flos": 5211452815179776.0, |
|
"train_loss": 1.1147898718779785, |
|
"train_runtime": 4502.6314, |
|
"train_samples_per_second": 32.519, |
|
"train_steps_per_second": 0.509 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 2290, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5211452815179776.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|