{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 3248,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.0, "grad_norm": 0.2709156318785633, "learning_rate": 6.153846153846154e-07, "loss": 0.9934, "step": 1 },
    { "epoch": 0.0, "grad_norm": 0.4419044510206839, "learning_rate": 3.0769230769230774e-06, "loss": 1.0584, "step": 5 },
    { "epoch": 0.0, "grad_norm": 0.5233721395581817, "learning_rate": 6.153846153846155e-06, "loss": 1.1615, "step": 10 },
    { "epoch": 0.0, "grad_norm": 0.6649670185877594, "learning_rate": 9.230769230769232e-06, "loss": 1.1631, "step": 15 },
    { "epoch": 0.01, "grad_norm": 0.4305302514113907, "learning_rate": 1.230769230769231e-05, "loss": 1.2207, "step": 20 },
    { "epoch": 0.01, "grad_norm": 0.5755996114609079, "learning_rate": 1.5384615384615387e-05, "loss": 0.9927, "step": 25 },
    { "epoch": 0.01, "grad_norm": 0.20368238004024766, "learning_rate": 1.8461538461538465e-05, "loss": 0.9844, "step": 30 },
    { "epoch": 0.01, "grad_norm": 0.24375178341672513, "learning_rate": 2.1538461538461542e-05, "loss": 0.9985, "step": 35 },
    { "epoch": 0.01, "grad_norm": 0.38432351721666425, "learning_rate": 2.461538461538462e-05, "loss": 1.0881, "step": 40 },
    { "epoch": 0.01, "grad_norm": 0.8927024097300557, "learning_rate": 2.7692307692307694e-05, "loss": 1.0362, "step": 45 },
    { "epoch": 0.02, "grad_norm": 0.24949485533837065, "learning_rate": 3.0769230769230774e-05, "loss": 0.9051, "step": 50 },
    { "epoch": 0.02, "grad_norm": 0.3003048201124618, "learning_rate": 3.384615384615385e-05, "loss": 0.9026, "step": 55 },
    { "epoch": 0.02, "grad_norm": 0.3884135265695224, "learning_rate": 3.692307692307693e-05, "loss": 0.8955, "step": 60 },
    { "epoch": 0.02, "grad_norm": 0.5578919802311338, "learning_rate": 4e-05, "loss": 0.7901, "step": 65 },
    { "epoch": 0.02, "grad_norm": 0.2289752603512456, "learning_rate": 4.3076923076923084e-05, "loss": 0.8754, "step": 70 },
    { "epoch": 0.02, "grad_norm": 0.459950526314893, "learning_rate": 4.615384615384616e-05, "loss": 0.7449, "step": 75 },
    { "epoch": 0.02, "grad_norm": 0.3061061622913128, "learning_rate": 4.923076923076924e-05, "loss": 0.8067, "step": 80 },
    { "epoch": 0.03, "grad_norm": 0.26485910183284767, "learning_rate": 5.230769230769231e-05, "loss": 0.8737, "step": 85 },
    { "epoch": 0.03, "grad_norm": 0.21605949900797208, "learning_rate": 5.538461538461539e-05, "loss": 0.8169, "step": 90 },
    { "epoch": 0.03, "grad_norm": 0.2286110237629433, "learning_rate": 5.846153846153847e-05, "loss": 0.8132, "step": 95 },
    { "epoch": 0.03, "grad_norm": 0.24556746259970036, "learning_rate": 6.153846153846155e-05, "loss": 0.8198, "step": 100 },
    { "epoch": 0.03, "grad_norm": 0.2948151232563899, "learning_rate": 6.461538461538462e-05, "loss": 0.8163, "step": 105 },
    { "epoch": 0.03, "grad_norm": 0.2659100532190652, "learning_rate": 6.76923076923077e-05, "loss": 0.8979, "step": 110 },
    { "epoch": 0.04, "grad_norm": 0.24575597154935636, "learning_rate": 7.076923076923078e-05, "loss": 0.8273, "step": 115 },
    { "epoch": 0.04, "grad_norm": 0.3093299661054237, "learning_rate": 7.384615384615386e-05, "loss": 0.8091, "step": 120 },
    { "epoch": 0.04, "grad_norm": 0.2908225328310054, "learning_rate": 7.692307692307693e-05, "loss": 0.8313, "step": 125 },
    { "epoch": 0.04, "grad_norm": 0.3020091724093868, "learning_rate": 8e-05, "loss": 0.8358, "step": 130 },
    { "epoch": 0.04, "grad_norm": 0.2694408106299054, "learning_rate": 8.307692307692309e-05, "loss": 0.8349, "step": 135 },
    { "epoch": 0.04, "grad_norm": 0.3000515761359836, "learning_rate": 8.615384615384617e-05, "loss": 0.8707, "step": 140 },
    { "epoch": 0.04, "grad_norm": 0.3079582972721868, "learning_rate": 8.923076923076924e-05, "loss": 0.8617, "step": 145 },
    { "epoch": 0.05, "grad_norm": 0.3795146820147972, "learning_rate": 9.230769230769232e-05, "loss": 0.7768, "step": 150 },
    { "epoch": 0.05, "grad_norm": 0.4775337525844143, "learning_rate": 9.53846153846154e-05, "loss": 0.8528, "step": 155 },
    { "epoch": 0.05, "grad_norm": 0.27382955396233616, "learning_rate": 9.846153846153848e-05, "loss": 0.9321, "step": 160 },
    { "epoch": 0.05, "grad_norm": 0.3493866683153211, "learning_rate": 0.00010153846153846153, "loss": 0.8745, "step": 165 },
    { "epoch": 0.05, "grad_norm": 0.392311201345868, "learning_rate": 0.00010461538461538463, "loss": 0.8214, "step": 170 },
    { "epoch": 0.05, "grad_norm": 0.2542347730845665, "learning_rate": 0.0001076923076923077, "loss": 0.8266, "step": 175 },
    { "epoch": 0.06, "grad_norm": 0.30567410806640644, "learning_rate": 0.00011076923076923077, "loss": 0.8421, "step": 180 },
    { "epoch": 0.06, "grad_norm": 0.3347043226775438, "learning_rate": 0.00011384615384615384, "loss": 0.8482, "step": 185 },
    { "epoch": 0.06, "grad_norm": 0.39125501413574576, "learning_rate": 0.00011692307692307694, "loss": 0.7707, "step": 190 },
    { "epoch": 0.06, "grad_norm": 0.27082032316598875, "learning_rate": 0.00012, "loss": 0.8802, "step": 195 },
    { "epoch": 0.06, "grad_norm": 0.2655311149315157, "learning_rate": 0.0001230769230769231, "loss": 0.8666, "step": 200 },
    { "epoch": 0.06, "grad_norm": 0.3211818059226096, "learning_rate": 0.00012615384615384615, "loss": 0.8397, "step": 205 },
    { "epoch": 0.06, "grad_norm": 0.2924195950677733, "learning_rate": 0.00012923076923076923, "loss": 0.8067, "step": 210 },
    { "epoch": 0.07, "grad_norm": 0.27236793564577827, "learning_rate": 0.0001323076923076923, "loss": 0.8266, "step": 215 },
    { "epoch": 0.07, "grad_norm": 0.31068056017205964, "learning_rate": 0.0001353846153846154, "loss": 0.727, "step": 220 },
    { "epoch": 0.07, "grad_norm": 0.24520172222389908, "learning_rate": 0.00013846153846153847, "loss": 0.7937, "step": 225 },
    { "epoch": 0.07, "grad_norm": 0.28996280021429405, "learning_rate": 0.00014153846153846156, "loss": 0.8405, "step": 230 },
    { "epoch": 0.07, "grad_norm": 0.26492992094167794, "learning_rate": 0.0001446153846153846, "loss": 0.8417, "step": 235 },
    { "epoch": 0.07, "grad_norm": 0.28305833332691566, "learning_rate": 0.00014769230769230772, "loss": 0.9148, "step": 240 },
    { "epoch": 0.08, "grad_norm": 0.31032712702750226, "learning_rate": 0.00015076923076923077, "loss": 0.8458, "step": 245 },
    { "epoch": 0.08, "grad_norm": 0.32447015298077714, "learning_rate": 0.00015384615384615385, "loss": 0.7821, "step": 250 },
    { "epoch": 0.08, "grad_norm": 0.2547532649722294, "learning_rate": 0.00015692307692307693, "loss": 0.8207, "step": 255 },
    { "epoch": 0.08, "grad_norm": 0.2735833452730825, "learning_rate": 0.00016, "loss": 0.8249, "step": 260 },
    { "epoch": 0.08, "grad_norm": 0.22729853067013947, "learning_rate": 0.0001630769230769231, "loss": 0.8387, "step": 265 },
    { "epoch": 0.08, "grad_norm": 0.3343853232959839, "learning_rate": 0.00016615384615384617, "loss": 0.8648, "step": 270 },
    { "epoch": 0.08, "grad_norm": 0.3108160988317094, "learning_rate": 0.00016923076923076923, "loss": 0.804, "step": 275 },
    { "epoch": 0.09, "grad_norm": 0.2609065872820603, "learning_rate": 0.00017230769230769234, "loss": 0.8216, "step": 280 },
    { "epoch": 0.09, "grad_norm": 0.24454880681449043, "learning_rate": 0.0001753846153846154, "loss": 0.8174, "step": 285 },
    { "epoch": 0.09, "grad_norm": 0.22180414129702308, "learning_rate": 0.00017846153846153847, "loss": 0.8579, "step": 290 },
    { "epoch": 0.09, "grad_norm": 0.26081939474045385, "learning_rate": 0.00018153846153846155, "loss": 0.8432, "step": 295 },
    { "epoch": 0.09, "grad_norm": 0.2751512686500224, "learning_rate": 0.00018461538461538463, "loss": 0.6994, "step": 300 },
    { "epoch": 0.09, "grad_norm": 0.24286008540174067, "learning_rate": 0.0001876923076923077, "loss": 0.8409, "step": 305 },
    { "epoch": 0.1, "grad_norm": 0.2306911443540912, "learning_rate": 0.0001907692307692308, "loss": 0.8241, "step": 310 },
    { "epoch": 0.1, "grad_norm": 0.3568984623630479, "learning_rate": 0.00019384615384615385, "loss": 0.7153, "step": 315 },
    { "epoch": 0.1, "grad_norm": 0.36681138166946803, "learning_rate": 0.00019692307692307696, "loss": 0.8065, "step": 320 },
    { "epoch": 0.1, "grad_norm": 0.22369081814221262, "learning_rate": 0.0002, "loss": 0.757, "step": 325 },
    { "epoch": 0.1, "grad_norm": 0.32740968759147726, "learning_rate": 0.00019999855605356607, "loss": 0.785, "step": 330 },
    { "epoch": 0.1, "grad_norm": 0.38663458318983307, "learning_rate": 0.0001999942242559639, "loss": 0.7893, "step": 335 },
    { "epoch": 0.1, "grad_norm": 0.3744195353028169, "learning_rate": 0.00019998700473229113, "loss": 0.8817, "step": 340 },
    { "epoch": 0.11, "grad_norm": 0.2937020154962458, "learning_rate": 0.00019997689769103992, "loss": 0.8068, "step": 345 },
    { "epoch": 0.11, "grad_norm": 0.31130184115081005, "learning_rate": 0.00019996390342409071, "loss": 0.8888, "step": 350 },
    { "epoch": 0.11, "grad_norm": 0.27717910910942206, "learning_rate": 0.00019994802230670415, "loss": 0.8296, "step": 355 },
    { "epoch": 0.11, "grad_norm": 0.28929048963159804, "learning_rate": 0.00019992925479750978, "loss": 0.8375, "step": 360 },
    { "epoch": 0.11, "grad_norm": 0.30278441435173964, "learning_rate": 0.00019990760143849317, "loss": 0.7978, "step": 365 },
    { "epoch": 0.11, "grad_norm": 0.3926711605744842, "learning_rate": 0.00019988306285498018, "loss": 0.8156, "step": 370 },
    { "epoch": 0.12, "grad_norm": 0.2056371002966051, "learning_rate": 0.0001998556397556188, "loss": 0.8492, "step": 375 },
    { "epoch": 0.12, "grad_norm": 0.3386258840898327, "learning_rate": 0.00019982533293235873, "loss": 0.7553, "step": 380 },
    { "epoch": 0.12, "grad_norm": 0.24704095019032765, "learning_rate": 0.00019979214326042857, "loss": 0.8032, "step": 385 },
    { "epoch": 0.12, "grad_norm": 0.3027504102198928, "learning_rate": 0.0001997560716983105, "loss": 0.8777, "step": 390 },
    { "epoch": 0.12, "grad_norm": 0.25850106416138335, "learning_rate": 0.00019971711928771257, "loss": 0.8353, "step": 395 },
    { "epoch": 0.12, "grad_norm": 0.42457411611908963, "learning_rate": 0.0001996752871535387, "loss": 0.7962, "step": 400 },
    { "epoch": 0.12, "grad_norm": 0.32389294595176554, "learning_rate": 0.00019963057650385606, "loss": 0.8473, "step": 405 },
    { "epoch": 0.13, "grad_norm": 0.2232805240763939, "learning_rate": 0.0001995829886298604, "loss": 0.7768, "step": 410 },
    { "epoch": 0.13, "grad_norm": 0.26355868762668144, "learning_rate": 0.00019953252490583843, "loss": 0.8432, "step": 415 },
    { "epoch": 0.13, "grad_norm": 0.2479995279505114, "learning_rate": 0.00019947918678912848, "loss": 0.8742, "step": 420 },
    { "epoch": 0.13, "grad_norm": 0.26547854409221383, "learning_rate": 0.0001994229758200783, "loss": 0.8072, "step": 425 },
    { "epoch": 0.13, "grad_norm": 0.23642626786417162, "learning_rate": 0.00019936389362200033, "loss": 0.7956, "step": 430 },
    { "epoch": 0.13, "grad_norm": 0.25913092229555307, "learning_rate": 0.00019930194190112522, "loss": 0.7345, "step": 435 },
    { "epoch": 0.14, "grad_norm": 0.2966162150235158, "learning_rate": 0.00019923712244655225, "loss": 0.8089, "step": 440 },
    { "epoch": 0.14, "grad_norm": 0.2187595004800295, "learning_rate": 0.00019916943713019794, "loss": 0.7427, "step": 445 },
    { "epoch": 0.14, "grad_norm": 0.29051575591401246, "learning_rate": 0.00019909888790674155, "loss": 0.8768, "step": 450 },
    { "epoch": 0.14, "grad_norm": 0.3296601997445316, "learning_rate": 0.00019902547681356923, "loss": 0.8616, "step": 455 },
    { "epoch": 0.14, "grad_norm": 0.29185622420361307, "learning_rate": 0.0001989492059707146, "loss": 0.7993, "step": 460 },
    { "epoch": 0.14, "grad_norm": 0.285867084295898, "learning_rate": 0.00019887007758079793, "loss": 0.8207, "step": 465 },
    { "epoch": 0.14, "grad_norm": 0.30952870458662307, "learning_rate": 0.00019878809392896235, "loss": 0.8668, "step": 470 },
    { "epoch": 0.15, "grad_norm": 0.3381740373711063, "learning_rate": 0.00019870325738280785, "loss": 0.8842, "step": 475 },
    { "epoch": 0.15, "grad_norm": 0.21684296837932523, "learning_rate": 0.0001986155703923231, "loss": 0.7966, "step": 480 },
    { "epoch": 0.15, "grad_norm": 0.3040871521339894, "learning_rate": 0.0001985250354898143, "loss": 0.8622, "step": 485 },
    { "epoch": 0.15, "grad_norm": 0.26978651830594724, "learning_rate": 0.0001984316552898326, "loss": 0.8748, "step": 490 },
    { "epoch": 0.15, "grad_norm": 0.29082578689683647, "learning_rate": 0.00019833543248909798, "loss": 0.8407, "step": 495 },
    { "epoch": 0.15, "grad_norm": 0.301663442193365, "learning_rate": 0.00019823636986642199, "loss": 0.8568, "step": 500 },
    { "epoch": 0.16, "grad_norm": 0.2552544076755423, "learning_rate": 0.0001981344702826269, "loss": 0.8286, "step": 505 },
    { "epoch": 0.16, "grad_norm": 0.24913640355204184, "learning_rate": 0.00019802973668046363, "loss": 0.8022, "step": 510 },
    { "epoch": 0.16, "grad_norm": 0.2217941168846133, "learning_rate": 0.00019792217208452635, "loss": 0.8674, "step": 515 },
    { "epoch": 0.16, "grad_norm": 0.2891487359747499, "learning_rate": 0.00019781177960116538, "loss": 0.8123, "step": 520 },
    { "epoch": 0.16, "grad_norm": 0.34655206684809864, "learning_rate": 0.00019769856241839737, "loss": 0.8517, "step": 525 },
    { "epoch": 0.16, "grad_norm": 0.3053447288771597, "learning_rate": 0.00019758252380581328, "loss": 0.8821, "step": 530 },
    { "epoch": 0.16, "grad_norm": 0.3307139329014054, "learning_rate": 0.00019746366711448387, "loss": 0.8677, "step": 535 },
    { "epoch": 0.17, "grad_norm": 0.306144694096585, "learning_rate": 0.00019734199577686314, "loss": 0.7189, "step": 540 },
    { "epoch": 0.17, "grad_norm": 0.2774735539484507, "learning_rate": 0.0001972175133066889, "loss": 0.7494, "step": 545 },
    { "epoch": 0.17, "grad_norm": 0.3140012878663545, "learning_rate": 0.00019709022329888155, "loss": 0.7943, "step": 550 },
    { "epoch": 0.17, "grad_norm": 0.2646845744217625, "learning_rate": 0.00019696012942944013, "loss": 0.836, "step": 555 },
    { "epoch": 0.17, "grad_norm": 0.2308386439333217, "learning_rate": 0.00019682723545533628, "loss": 0.8478, "step": 560 },
    { "epoch": 0.17, "grad_norm": 0.262138689846067, "learning_rate": 0.00019669154521440553, "loss": 0.7914, "step": 565 },
    { "epoch": 0.18, "grad_norm": 0.6748339885003066, "learning_rate": 0.0001965530626252367, "loss": 0.8494, "step": 570 },
    { "epoch": 0.18, "grad_norm": 0.33850537974316935, "learning_rate": 0.00019641179168705862, "loss": 0.6988, "step": 575 },
    { "epoch": 0.18, "grad_norm": 0.2655667830205273, "learning_rate": 0.00019626773647962457, "loss": 0.8944, "step": 580 },
    { "epoch": 0.18, "grad_norm": 0.266738555121118, "learning_rate": 0.0001961209011630947, "loss": 0.8797, "step": 585 },
    { "epoch": 0.18, "grad_norm": 0.2867657573604784, "learning_rate": 0.0001959712899779156, "loss": 0.8718, "step": 590 },
    { "epoch": 0.18, "grad_norm": 0.3370395857653061, "learning_rate": 0.00019581890724469802, "loss": 0.8289, "step": 595 },
    { "epoch": 0.18, "grad_norm": 0.29934897150076484, "learning_rate": 0.00019566375736409204, "loss": 0.822, "step": 600 },
    { "epoch": 0.19, "grad_norm": 0.28585720261383735, "learning_rate": 0.00019550584481666002, "loss": 0.8579, "step": 605 },
    { "epoch": 0.19, "grad_norm": 0.3285211504524654, "learning_rate": 0.0001953451741627471, "loss": 0.8795, "step": 610 },
    { "epoch": 0.19, "grad_norm": 0.24511909912168928, "learning_rate": 0.0001951817500423497, "loss": 0.7862, "step": 615 },
    { "epoch": 0.19, "grad_norm": 0.253601728108672, "learning_rate": 0.0001950155771749813, "loss": 0.8076, "step": 620 },
    { "epoch": 0.19, "grad_norm": 0.3092833252431494, "learning_rate": 0.00019484666035953632, "loss": 0.7513, "step": 625 },
    { "epoch": 0.19, "grad_norm": 0.26385573322113004, "learning_rate": 0.00019467500447415138, "loss": 0.8263, "step": 630 },
    { "epoch": 0.2, "grad_norm": 0.28938768331562686, "learning_rate": 0.00019450061447606455, "loss": 0.7777, "step": 635 },
    { "epoch": 0.2, "grad_norm": 0.2529231145412997, "learning_rate": 0.00019432349540147222, "loss": 0.8287, "step": 640 },
    { "epoch": 0.2, "grad_norm": 0.21789719255683243, "learning_rate": 0.00019414365236538342, "loss": 0.789, "step": 645 },
    { "epoch": 0.2, "grad_norm": 0.2926240350717249, "learning_rate": 0.00019396109056147242, "loss": 0.8396, "step": 650 },
    { "epoch": 0.2, "grad_norm": 0.21569069093149024, "learning_rate": 0.00019377581526192853, "loss": 0.7599, "step": 655 },
    { "epoch": 0.2, "grad_norm": 0.26361195643008684, "learning_rate": 0.00019358783181730387, "loss": 0.8687, "step": 660 },
    { "epoch": 0.2, "grad_norm": 0.285068572985004, "learning_rate": 0.00019339714565635898, "loss": 0.7735, "step": 665 },
    { "epoch": 0.21, "grad_norm": 0.32266517654300897, "learning_rate": 0.0001932037622859059, "loss": 0.754, "step": 670 },
    { "epoch": 0.21, "grad_norm": 0.2678959128715121, "learning_rate": 0.00019300768729064912, "loss": 0.8024, "step": 675 },
    { "epoch": 0.21, "grad_norm": 0.2808314964971424, "learning_rate": 0.00019280892633302454, "loss": 0.767, "step": 680 },
    { "epoch": 0.21, "grad_norm": 0.25044630314755334, "learning_rate": 0.00019260748515303563, "loss": 0.8454, "step": 685 },
    { "epoch": 0.21, "grad_norm": 0.30242650123792125, "learning_rate": 0.00019240336956808786, "loss": 0.8812, "step": 690 },
    { "epoch": 0.21, "grad_norm": 0.2701055207795336, "learning_rate": 0.00019219658547282067, "loss": 0.7791, "step": 695 },
    { "epoch": 0.22, "grad_norm": 0.2232333034817994, "learning_rate": 0.0001919871388389372, "loss": 0.7782, "step": 700 },
    { "epoch": 0.22, "grad_norm": 0.2578136050088398, "learning_rate": 0.0001917750357150318, "loss": 0.7164, "step": 705 },
    { "epoch": 0.22, "grad_norm": 0.2974468917976116, "learning_rate": 0.00019156028222641554, "loss": 0.8559, "step": 710 },
    { "epoch": 0.22, "grad_norm": 0.2811089697192464, "learning_rate": 0.00019134288457493904, "loss": 0.7352, "step": 715 },
    { "epoch": 0.22, "grad_norm": 0.3892901045304661, "learning_rate": 0.0001911228490388136, "loss": 0.7775, "step": 720 },
    { "epoch": 0.22, "grad_norm": 0.32835288605201257, "learning_rate": 0.00019090018197242972, "loss": 0.8125, "step": 725 },
    { "epoch": 0.22, "grad_norm": 0.3396081375822814, "learning_rate": 0.00019067488980617384, "loss": 0.8498, "step": 730 },
    { "epoch": 0.23, "grad_norm": 0.2725077722420475, "learning_rate": 0.00019044697904624226, "loss": 0.8652, "step": 735 },
    { "epoch": 0.23, "grad_norm": 0.26882238969800315, "learning_rate": 0.0001902164562744536, "loss": 0.8316, "step": 740 },
    { "epoch": 0.23, "grad_norm": 0.2942779783614407, "learning_rate": 0.00018998332814805852, "loss": 0.8937, "step": 745 },
    { "epoch": 0.23, "grad_norm": 0.23318828090848456, "learning_rate": 0.0001897476013995476, "loss": 0.7247, "step": 750 },
    { "epoch": 0.23, "grad_norm": 0.3207657739253934, "learning_rate": 0.00018950928283645676, "loss": 0.8168, "step": 755 },
    { "epoch": 0.23, "grad_norm": 0.27343106758276103, "learning_rate": 0.00018926837934117084, "loss": 0.7436, "step": 760 },
    { "epoch": 0.24, "grad_norm": 0.2380153711122644, "learning_rate": 0.0001890248978707246, "loss": 0.845, "step": 765 },
    { "epoch": 0.24, "grad_norm": 0.28205742315414456, "learning_rate": 0.00018877884545660215, "loss": 0.8329, "step": 770 },
    { "epoch": 0.24, "grad_norm": 0.248001179467568, "learning_rate": 0.0001885302292045336, "loss": 0.8322, "step": 775 },
    { "epoch": 0.24, "grad_norm": 0.512811292371171, "learning_rate": 0.0001882790562942899, "loss": 0.7778, "step": 780 },
    { "epoch": 0.24, "grad_norm": 0.3027018813313023, "learning_rate": 0.00018802533397947567, "loss": 0.8338, "step": 785 },
    { "epoch": 0.24, "grad_norm": 0.2940436466186521, "learning_rate": 0.00018776906958731953, "loss": 0.6823, "step": 790 },
    { "epoch": 0.24, "grad_norm": 0.33833283166239086, "learning_rate": 0.00018751027051846258, "loss": 0.7669, "step": 795 },
    { "epoch": 0.25, "grad_norm": 0.31248954654812844, "learning_rate": 0.00018724894424674467, "loss": 0.7851, "step": 800 },
    { "epoch": 0.25, "grad_norm": 0.3080103407030162, "learning_rate": 0.00018698509831898853, "loss": 0.8465, "step": 805 },
    { "epoch": 0.25, "grad_norm": 0.27928229688289624, "learning_rate": 0.00018671874035478195, "loss": 0.7708, "step": 810 },
    { "epoch": 0.25, "grad_norm": 0.2926934897262978, "learning_rate": 0.00018644987804625757, "loss": 0.8816, "step": 815 },
    { "epoch": 0.25, "grad_norm": 0.2564445050641534, "learning_rate": 0.00018617851915787078, "loss": 0.8748, "step": 820 },
    { "epoch": 0.25, "grad_norm": 0.3167275363170148, "learning_rate": 0.0001859046715261756, "loss": 0.7955, "step": 825 },
    { "epoch": 0.26, "grad_norm": 0.4082416585797153, "learning_rate": 0.00018562834305959824, "loss": 0.7464, "step": 830 },
    { "epoch": 0.26, "grad_norm": 0.3030422995435233, "learning_rate": 0.0001853495417382088, "loss": 0.9046, "step": 835 },
    { "epoch": 0.26, "grad_norm": 0.2536332149798187, "learning_rate": 0.00018506827561349073, "loss": 0.7143, "step": 840 },
    { "epoch": 0.26, "grad_norm": 0.3272764072322209, "learning_rate": 0.00018478455280810838, "loss": 0.8358, "step": 845 },
    { "epoch": 0.26, "grad_norm": 0.28310232183891465, "learning_rate": 0.00018449838151567244, "loss": 0.842, "step": 850 },
    { "epoch": 0.26, "grad_norm": 0.26576658481713733, "learning_rate": 0.00018420977000050323, "loss": 0.7563, "step": 855 },
    { "epoch": 0.26, "grad_norm": 0.2612196906596331, "learning_rate": 0.00018391872659739215, "loss": 0.7631, "step": 860 },
    { "epoch": 0.27, "grad_norm": 0.43199033496139155, "learning_rate": 0.00018362525971136082, "loss": 0.8585, "step": 865 },
    { "epoch": 0.27, "grad_norm": 0.3011184188491384, "learning_rate": 0.00018332937781741858, "loss": 0.807, "step": 870 },
    { "epoch": 0.27, "grad_norm": 0.3432230727339861, "learning_rate": 0.00018303108946031747, "loss": 0.806, "step": 875 },
    { "epoch": 0.27, "grad_norm": 0.28699539333378254, "learning_rate": 0.00018273040325430574, "loss": 0.8063, "step": 880 },
    { "epoch": 0.27, "grad_norm": 0.2895901648006327, "learning_rate": 0.00018242732788287884, "loss": 0.7773, "step": 885 },
    { "epoch": 0.27, "grad_norm": 0.30393045217103676, "learning_rate": 0.00018212187209852888, "loss": 0.7721, "step": 890 },
    { "epoch": 0.28, "grad_norm": 0.4409757200332159, "learning_rate": 0.00018181404472249158, "loss": 0.805, "step": 895 },
    { "epoch": 0.28, "grad_norm": 0.36679860235251033, "learning_rate": 0.00018150385464449183, "loss": 0.7759, "step": 900 },
    { "epoch": 0.28, "grad_norm": 0.276840442597116, "learning_rate": 0.00018119131082248676, "loss": 0.8182, "step": 905 },
    { "epoch": 0.28, "grad_norm": 0.2365689665357522, "learning_rate": 0.00018087642228240713, "loss": 0.7851, "step": 910 },
    { "epoch": 0.28, "grad_norm": 0.30377473821055756, "learning_rate": 0.00018055919811789658, "loss": 0.7467, "step": 915 },
    { "epoch": 0.28, "grad_norm": 0.40215196146679155, "learning_rate": 0.00018023964749004921, "loss": 0.7436, "step": 920 },
    { "epoch": 0.28, "grad_norm": 0.24424508927481137, "learning_rate": 0.00017991777962714472, "loss": 0.8502, "step": 925 },
    { "epoch": 0.29, "grad_norm": 0.3549699023868391, "learning_rate": 0.00017959360382438226, "loss": 0.8607, "step": 930 },
    { "epoch": 0.29, "grad_norm": 0.29288526726309294, "learning_rate": 0.00017926712944361164, "loss": 0.7812, "step": 935 },
    { "epoch": 0.29, "grad_norm": 0.38300023494160845, "learning_rate": 0.00017893836591306326, "loss": 0.965, "step": 940 },
    { "epoch": 0.29, "grad_norm": 0.3400552848154392, "learning_rate": 0.00017860732272707565, "loss": 0.9296, "step": 945 },
    { "epoch": 0.29, "grad_norm": 0.33946589539162436, "learning_rate": 0.0001782740094458214, "loss": 0.7948, "step": 950 },
    { "epoch": 0.29, "grad_norm": 0.3106409161075979, "learning_rate": 0.00017793843569503096, "loss": 0.9234, "step": 955 },
    { "epoch": 0.3, "grad_norm": 0.3048504659492406, "learning_rate": 0.00017760061116571472, "loss": 0.735, "step": 960 },
    { "epoch": 0.3, "grad_norm": 0.3699006267760609, "learning_rate": 0.00017726054561388325, "loss": 0.8097, "step": 965 },
    { "epoch": 0.3, "grad_norm": 0.3812470781886498, "learning_rate": 0.0001769182488602653, "loss": 0.7924, "step": 970 },
    { "epoch": 0.3, "grad_norm": 0.31391420913653745, "learning_rate": 0.0001765737307900244, "loss": 0.8468, "step": 975 },
    { "epoch": 0.3, "grad_norm": 0.2798144915599161, "learning_rate": 0.00017622700135247336, "loss": 0.7466, "step": 980 },
    { "epoch": 0.3, "grad_norm": 0.4373736539748376, "learning_rate": 0.0001758780705607869, "loss": 0.7782, "step": 985 },
    { "epoch": 0.3, "grad_norm": 0.31499380584195635, "learning_rate": 0.00017552694849171238, "loss": 0.7623, "step": 990 },
    { "epoch": 0.31, "grad_norm": 0.34084198516003655, "learning_rate": 0.00017517364528527905, "loss": 0.7643, "step": 995 },
    { "epoch": 0.31, "grad_norm": 0.220259377380433, "learning_rate": 0.00017481817114450504, "loss": 0.7041, "step": 1000 },
    { "epoch": 0.31, "grad_norm": 0.3741361155303008, "learning_rate": 0.00017446053633510267, "loss": 0.8331, "step": 1005 },
    { "epoch": 0.31, "grad_norm": 0.25186403738162744, "learning_rate": 0.00017410075118518207, "loss": 0.7746, "step": 1010 },
    { "epoch": 0.31, "grad_norm": 0.3381680931423155, "learning_rate": 0.000173738826084953, "loss": 0.8091, "step": 1015 },
    { "epoch": 0.31, "grad_norm": 0.30332385851807925, "learning_rate": 0.00017337477148642453, "loss": 0.8123, "step": 1020 },
    { "epoch": 0.32, "grad_norm": 0.4189781620141866, "learning_rate": 0.0001730085979031035, "loss": 0.7662, "step": 1025 },
    { "epoch": 0.32, "grad_norm": 0.20916757459764715, "learning_rate": 0.0001726403159096907, "loss": 0.7658, "step": 1030 },
    { "epoch": 0.32, "grad_norm": 0.25400278359501394, "learning_rate": 0.0001722699361417755, "loss": 0.7761, "step": 1035 },
    { "epoch": 0.32, "grad_norm": 0.29634266788228114, "learning_rate": 0.00017189746929552885, "loss": 0.7712, "step": 1040 },
    { "epoch": 0.32, "grad_norm": 0.34416521475533296, "learning_rate": 0.00017152292612739427, "loss": 0.8657, "step": 1045 },
    { "epoch": 0.32, "grad_norm": 0.26477218894974247, "learning_rate": 0.00017114631745377716, "loss": 0.7979, "step": 1050 },
    { "epoch": 0.32, "grad_norm": 0.39575314169996223, "learning_rate": 0.00017076765415073252, "loss": 0.7657, "step": 1055 },
    { "epoch": 0.33, "grad_norm": 0.3060703163684423, "learning_rate": 0.0001703869471536509, "loss": 0.7758, "step": 1060 },
    { "epoch": 0.33, "grad_norm": 0.3041979747156484, "learning_rate": 0.00017000420745694254, "loss": 0.8641, "step": 1065 },
    { "epoch": 0.33, "grad_norm": 0.2891225024964161, "learning_rate": 0.0001696194461137198, "loss": 0.8824, "step": 1070 },
    { "epoch": 0.33, "grad_norm": 0.2718444108580119, "learning_rate": 0.0001692326742354781, "loss": 0.7924, "step": 1075 },
    { "epoch": 0.33, "grad_norm": 0.259262158014512, "learning_rate": 0.00016884390299177492, "loss": 0.8369, "step": 1080 },
    { "epoch": 0.33, "grad_norm": 0.3499348899825314, "learning_rate": 0.00016845314360990727, "loss": 0.8346, "step": 1085 },
    { "epoch": 0.34, "grad_norm": 0.2715739588374178, "learning_rate": 0.00016806040737458745, "loss": 0.8032, "step": 1090 },
    { "epoch": 0.34, "grad_norm": 0.31767202890667356, "learning_rate": 0.00016766570562761726, "loss": 0.7771, "step": 1095 },
    { "epoch": 0.34, "grad_norm": 0.2775818452285115, "learning_rate": 0.00016726904976756024, "loss": 0.7571, "step": 1100 },
    { "epoch": 0.34, "grad_norm": 0.40533408177768837, "learning_rate": 0.00016687045124941268, "loss": 0.7487, "step": 1105 },
    { "epoch": 0.34, "grad_norm": 0.23186261109536282, "learning_rate": 0.0001664699215842728, "loss": 0.7442, "step": 1110 },
    { "epoch": 0.34, "grad_norm": 0.39092343811338354, "learning_rate": 0.00016606747233900815, "loss": 0.8009, "step": 1115 },
    { "epoch": 0.34, "grad_norm": 0.3086065524408362, "learning_rate": 0.00016566311513592188, "loss": 0.8045, "step": 1120 },
    { "epoch": 0.35, "grad_norm": 0.3367565478870619, "learning_rate": 0.00016525686165241673, "loss": 0.767, "step": 1125 },
    { "epoch": 0.35, "grad_norm": 0.3086208201423001, "learning_rate": 0.00016484872362065818, "loss": 0.8297, "step": 1130 },
    { "epoch": 0.35, "grad_norm": 0.29118078471035397, "learning_rate": 0.0001644387128272353, "loss": 0.73, "step": 1135 },
    { "epoch": 0.35, "grad_norm": 0.2276647259111584, "learning_rate": 0.00016402684111282048, "loss": 0.7594, "step": 1140 },
    { "epoch": 0.35, "grad_norm": 0.31848819925406663, "learning_rate": 0.00016361312037182764, "loss": 0.7175, "step": 1145 },
    { "epoch": 0.35, "grad_norm": 0.3783361351946572, "learning_rate": 0.00016319756255206856, "loss": 0.8027, "step": 1150 },
    { "epoch": 0.36, "grad_norm": 0.266088499578184, "learning_rate": 0.00016278017965440787, "loss": 0.7452, "step": 1155 },
    { "epoch": 0.36, "grad_norm": 0.29706113106114784, "learning_rate": 0.0001623609837324165, "loss": 0.8534, "step": 1160 },
    { "epoch": 0.36, "grad_norm": 0.3339471167119851, "learning_rate": 0.00016193998689202358, "loss": 0.8144, "step": 1165 },
    { "epoch": 0.36, "grad_norm": 0.28540623867963755, "learning_rate": 0.00016151720129116686, "loss": 0.7651, "step": 1170 },
    { "epoch": 0.36, "grad_norm": 0.2639670831879774, "learning_rate": 0.00016109263913944154, "loss": 0.7034, "step": 1175 },
    { "epoch": 0.36, "grad_norm": 0.33170223538556726, "learning_rate": 0.00016066631269774767, "loss": 0.7217, "step": 1180 },
    { "epoch": 0.36, "grad_norm": 0.3941911797947879, "learning_rate": 0.00016023823427793626, "loss": 0.7772, "step": 1185 },
    { "epoch": 0.37, "grad_norm": 0.3440363876277395, "learning_rate": 0.00015980841624245335, "loss": 0.727, "step": 1190 },
    { "epoch": 0.37, "grad_norm": 0.27303644578696656, "learning_rate": 0.00015937687100398343, "loss": 0.7976, "step": 1195 },
    { "epoch": 0.37, "grad_norm": 0.19363446071730062, "learning_rate": 0.0001589436110250906, "loss": 0.7601, "step": 1200 },
    { "epoch": 0.37, "grad_norm": 0.3220033440998593, "learning_rate": 0.00015850864881785892, "loss": 0.8059, "step": 1205 },
    { "epoch": 0.37, "grad_norm": 0.24875841441874524, "learning_rate": 0.00015807199694353093, "loss": 0.7766, "step": 1210 },
    { "epoch": 0.37, "grad_norm": 0.27144712486585415, "learning_rate": 0.000157633668012145, "loss": 0.9517, "step": 1215 },
    { "epoch": 0.38, "grad_norm": 0.29984291637849303, "learning_rate": 0.00015719367468217102, "loss": 0.7078, "step": 1220 },
    { "epoch": 0.38, "grad_norm": 0.42717086324378223, "learning_rate": 0.00015675202966014502, "loss": 0.6811, "step": 1225 },
    { "epoch": 0.38, "grad_norm": 0.34151608330461036, "learning_rate": 0.0001563087457003021, "loss": 0.7748, "step": 1230 },
    { "epoch": 0.38, "grad_norm": 0.3709398696526256, "learning_rate": 0.0001558638356042081, "loss": 0.7182, "step": 1235 },
    { "epoch": 0.38, "grad_norm": 0.34922444682335496, "learning_rate": 0.00015541731222038998, "loss": 0.8094, "step": 1240 },
    { "epoch": 0.38, "grad_norm": 0.3980009403514347, "learning_rate": 0.00015496918844396467, "loss": 0.8039, "step": 1245 },
    { "epoch": 0.38, "grad_norm": 0.3512099804126243, "learning_rate": 0.00015451947721626676, "loss": 0.79, "step": 1250 },
    { "epoch": 0.39, "grad_norm": 0.40719330570424017, "learning_rate": 0.00015406819152447474, "loss": 0.6692, "step": 1255 },
    { "epoch": 0.39, "grad_norm": 0.32754689412178806, "learning_rate": 0.0001536153444012359, "loss": 0.7442, "step": 1260 },
    { "epoch": 0.39, "grad_norm": 0.3468017225536143, "learning_rate": 0.00015316094892428995, "loss": 0.7848, "step": 1265 },
    { "epoch": 0.39, "grad_norm": 0.3434551451114442, "learning_rate": 0.00015270501821609158, "loss": 0.7438, "step": 1270 },
    { "epoch": 0.39, "grad_norm": 0.34114725338586066, "learning_rate": 0.00015224756544343114, "loss": 0.6742, "step": 1275 },
    { "epoch": 0.39, "grad_norm": 0.35582931502840126, "learning_rate": 0.00015178860381705457, "loss": 0.6642, "step": 1280 },
    { "epoch": 0.4, "grad_norm": 0.3951492370139673, "learning_rate": 0.00015132814659128205, "loss": 0.7963, "step": 1285 },
    { "epoch": 0.4, "grad_norm": 0.3017429849876104, "learning_rate": 0.00015086620706362486, "loss": 0.7752, "step": 1290 },
    { "epoch": 0.4, "grad_norm": 0.2626021589785786, "learning_rate": 0.00015040279857440176, "loss": 0.7782, "step": 1295 },
    { "epoch": 0.4, "grad_norm": 0.33978693007893296, "learning_rate": 0.0001499379345063534, "loss": 0.7799, "step": 1300 },
    { "epoch": 0.4, "grad_norm": 0.3251454729934429, "learning_rate": 0.00014947162828425606, "loss": 0.7907, "step": 1305 },
    { "epoch": 0.4, "grad_norm": 0.46699196417323946, "learning_rate": 0.00014900389337453392, "loss": 0.8757, "step": 1310 },
    { "epoch": 0.4, "grad_norm": 0.30759216383899046, "learning_rate": 0.00014853474328487, "loss": 0.8248, "step": 1315 },
    { "epoch": 0.41, "grad_norm": 0.5240595026063802, "learning_rate": 0.00014806419156381632, "loss": 0.8153, "step": 1320 },
    { "epoch": 0.41, "grad_norm": 0.2984711481085597, "learning_rate": 0.0001475922518004025, "loss": 0.8307, "step": 1325 },
    { "epoch": 0.41, "grad_norm": 0.34139646796135426, "learning_rate": 0.00014711893762374322, "loss": 0.7983, "step": 1330 },
    { "epoch": 0.41, "grad_norm": 0.2571926484335572, "learning_rate": 0.00014664426270264493, "loss": 0.6837, "step": 1335 },
    { "epoch": 0.41, "grad_norm": 0.3257702661543834, "learning_rate": 0.00014616824074521075, "loss": 0.7656, "step": 1340 },
    { "epoch": 0.41, "grad_norm": 0.5681687224975429, "learning_rate": 0.00014569088549844488, "loss": 0.8412, "step": 1345 },
    { "epoch": 0.42, "grad_norm": 0.3442468618645148, "learning_rate": 0.00014521221074785542, "loss": 0.7408, "step": 1350 },
    { "epoch": 0.42, "grad_norm": 0.3889043102333772, "learning_rate": 0.00014473223031705637, "loss": 0.7891, "step": 1355 },
    { "epoch": 0.42, "grad_norm": 0.3512289539889666, "learning_rate": 0.0001442509580673684, "loss": 0.7438, "step": 1360 },
    { "epoch": 0.42, "grad_norm": 0.3124271122113035, "learning_rate": 0.00014376840789741838, "loss": 0.7047, "step": 1365 },
    { "epoch": 0.42, "grad_norm": 0.2200391690908901, "learning_rate": 0.00014328459374273833, "loss": 0.7432, "step": 1370 },
    { "epoch": 0.42, "grad_norm": 0.32400034100164815, "learning_rate": 0.00014279952957536266, "loss": 0.8155, "step": 1375 },
    { "epoch": 0.42, "grad_norm": 0.3003484274407438, "learning_rate": 0.00014231322940342492, "loss": 0.7521, "step": 1380 },
    { "epoch": 0.43, "grad_norm": 0.4116598695778175, "learning_rate": 0.00014182570727075308, "loss": 0.8548, "step": 1385 },
    { "epoch": 0.43, "grad_norm": 0.42125576864395314, "learning_rate": 0.00014133697725646403, "loss": 0.8552, "step": 1390 },
    { "epoch": 0.43, "grad_norm": 0.32506737333947255, "learning_rate": 0.000140847053474557, "loss": 0.7796, "step": 1395 },
    { "epoch": 0.43, "grad_norm": 0.3558852515623043, "learning_rate": 0.00014035595007350592, "loss": 0.782, "step": 1400 },
    { "epoch": 0.43, "grad_norm": 0.32892065566412354, "learning_rate": 0.00013986368123585093, "loss": 0.7912, "step": 1405 },
    { "epoch": 0.43, "grad_norm": 0.3309987342740096, "learning_rate": 0.00013937026117778867, "loss": 0.7852, "step": 1410 },
    { "epoch": 0.44, "grad_norm": 0.317076816745732, "learning_rate": 0.00013887570414876176, "loss": 0.8792, "step": 1415 },
    { "epoch": 0.44, "grad_norm": 0.3888229597038326, "learning_rate": 0.00013838002443104742, "loss": 0.7537, "step": 1420 },
    { "epoch": 0.44, "grad_norm": 0.3505522947043339, "learning_rate": 0.00013788323633934484, "loss": 0.7765, "step": 1425 },
    { "epoch": 0.44, "grad_norm": 0.30255809120744814, "learning_rate": 0.0001373853542203619, "loss": 0.7445, "step": 1430 },
    { "epoch": 0.44, "grad_norm": 0.38394599313950495, "learning_rate": 0.00013688639245240078, "loss": 0.717, "step": 1435 },
    { "epoch": 0.44, "grad_norm": 0.3546082273774911, "learning_rate": 0.00013638636544494287, "loss": 0.7088, "step": 1440 },
    { "epoch": 0.44, "grad_norm": 0.46456400202121617, "learning_rate": 0.00013588528763823233, "loss": 0.6481, "step": 1445 },
    { "epoch": 0.45, "grad_norm": 0.38142306418882993, "learning_rate": 0.0001353831735028595, "loss": 0.8121, "step": 1450 },
    { "epoch": 0.45, "grad_norm": 0.34062042874830745, "learning_rate": 0.00013488003753934263, "loss": 0.7098, "step": 1455 },
    { "epoch": 0.45, "grad_norm": 0.19799193048705183, "learning_rate": 0.0001343758942777094, "loss": 0.6883, "step": 1460 },
    { "epoch": 0.45, "grad_norm": 0.3696985192619358, "learning_rate": 0.000133870758277077, "loss": 0.8092, "step": 1465 },
    { "epoch": 0.45, "grad_norm": 0.2874954359019885, "learning_rate": 0.00013336464412523207, "loss": 0.8209, "step": 1470 },
    { "epoch": 0.45, "grad_norm": 0.3592024936010695, "learning_rate": 0.000132857566438209, "loss": 0.854, "step": 1475 },
    { "epoch": 0.46, "grad_norm": 0.29409773858597665, "learning_rate": 0.00013234953985986824, "loss": 0.798, "step": 1480 },
    { "epoch": 0.46, "grad_norm": 0.2415718204855592, "learning_rate": 0.0001318405790614731, "loss": 0.7382, "step": 1485 },
    { "epoch": 0.46, "grad_norm": 0.2584643780619029, "learning_rate": 0.0001313306987412661, "loss": 0.8092, "step": 1490 },
    { "epoch": 0.46, "grad_norm": 0.34126538154076436, "learning_rate": 0.00013081991362404475, "loss": 0.789, "step": 1495 },
    { "epoch": 0.46, "grad_norm": 0.32753475635130697, "learning_rate": 0.00013030823846073595, "loss": 0.8413, "step": 1500 },
    { "epoch": 0.46, "grad_norm": 0.3285555673315335, "learning_rate": 0.00012979568802797022, "loss": 0.7092, "step": 1505 },
    { "epoch": 0.46, "grad_norm": 0.2947608781251718, "learning_rate": 0.00012928227712765504, "loss": 0.645, "step": 1510 },
    { "epoch": 0.47, "grad_norm": 0.33949478474040173, "learning_rate": 0.00012876802058654714, "loss": 0.804, "step": 1515 },
    { "epoch": 0.47, "grad_norm": 0.43727181136357957, "learning_rate": 0.0001282529332558245, "loss": 0.8041, "step": 1520 },
    { "epoch": 0.47, "grad_norm": 0.3609023630640718, "learning_rate": 0.00012773703001065737, "loss": 0.8356, "step": 1525 },
    { "epoch": 0.47, "grad_norm": 0.3494948390700119, "learning_rate": 0.00012722032574977881, "loss": 0.7872, "step": 1530 },
    { "epoch": 0.47, "grad_norm": 0.3275549957683315, "learning_rate": 0.0001267028353950543, "loss": 0.7883, "step": 1535 },
    { "epoch": 0.47, "grad_norm": 0.2434171834573686, "learning_rate": 0.00012618457389105094, "loss": 0.7766, "step": 1540 },
    { "epoch": 0.48, "grad_norm": 0.35813509273993893, "learning_rate": 0.00012566555620460569, "loss": 0.7723, "step": 1545 },
    { "epoch": 0.48, "grad_norm": 0.3850234800177591, "learning_rate": 0.00012514579732439323, "loss": 0.7127, "step": 1550 },
    { "epoch": 0.48, "grad_norm": 0.2990175481928644, "learning_rate": 0.00012462531226049335, "loss": 0.8027, "step": 1555 },
    { "epoch": 0.48, "grad_norm": 0.26743125802211676, "learning_rate": 0.00012410411604395696, "loss": 0.7775, "step": 1560 },
    { "epoch": 0.48, "grad_norm": 0.3003015429775997, "learning_rate": 0.00012358222372637248, "loss": 0.8003, "step": 1565 },
    { "epoch": 0.48, "grad_norm": 0.25952231751732324, "learning_rate": 0.00012305965037943096, "loss": 0.7946, "step": 1570 },
    { "epoch": 0.48, "grad_norm": 0.3571723160585395, "learning_rate": 0.00012253641109449074, "loss": 0.7369, "step": 1575 },
    { "epoch": 0.49, "grad_norm": 0.3502660576927713, "learning_rate": 0.00012201252098214186, "loss": 0.8105, "step": 1580 },
    { "epoch": 0.49, "grad_norm": 0.3925450057088276, "learning_rate": 0.00012148799517176948, "loss": 0.7664, "step": 1585 },
    { "epoch": 0.49, "grad_norm": 0.2894085765012847, "learning_rate": 0.00012096284881111711, "loss": 0.8213, "step": 1590 },
    { "epoch": 0.49, "grad_norm": 0.29374369830200575, "learning_rate": 0.00012043709706584902, "loss": 0.7723, "step": 1595 },
    { "epoch": 0.49, "grad_norm": 0.2863311083269218, "learning_rate": 0.00011991075511911236, "loss": 0.696, "step": 1600 },
    { "epoch": 0.49, "grad_norm": 0.3036662438900221, "learning_rate": 0.00011938383817109868, "loss": 0.8753, "step": 1605 },
    { "epoch": 0.5, "grad_norm": 0.3020605197833583, "learning_rate": 0.00011885636143860492, "loss": 0.8759, "step": 1610 },
    { "epoch": 0.5, "grad_norm": 0.3639681427966891, "learning_rate": 0.00011832834015459404, "loss": 0.8606, "step": 1615 },
    { "epoch": 0.5, "grad_norm": 0.37953818216433793, "learning_rate": 0.00011779978956775506, "loss": 0.7051, "step": 1620 },
    { "epoch": 0.5, "grad_norm": 0.3184781493318525, "learning_rate": 0.00011727072494206262, "loss": 0.7916, "step": 1625 },
    { "epoch": 0.5, "grad_norm": 0.35142683733387886, "learning_rate": 0.00011674116155633637, "loss": 0.8831, "step": 1630 },
    { "epoch": 0.5, "grad_norm": 0.3117830556752173, "learning_rate": 0.00011621111470379951, "loss": 0.8306, "step": 1635 },
    { "epoch": 0.5, "grad_norm": 0.4495145775092123, "learning_rate": 0.00011568059969163734, "loss": 0.7767, "step": 1640 },
    { "epoch": 0.51, "grad_norm": 0.29751872220308234, "learning_rate": 0.00011514963184055503, "loss": 0.7627, "step": 1645 },
    { "epoch": 0.51, "grad_norm": 0.5069779219255514, "learning_rate": 0.00011461822648433527, "loss": 0.7007, "step": 1650 },
    { "epoch": 0.51, "grad_norm": 0.3685939765535684, "learning_rate": 0.00011408639896939548, "loss": 0.7903, "step": 1655 },
    { "epoch": 0.51, "grad_norm": 0.35043401596057283, "learning_rate": 0.0001135541646543445, "loss": 0.8195, "step": 1660 },
    { "epoch": 0.51, "grad_norm": 0.43437482478281425, "learning_rate": 0.00011302153890953917, "loss": 0.7474, "step": 1665 },
    { "epoch": 0.51, "grad_norm": 0.424740143766434, "learning_rate": 0.00011248853711664037, "loss": 0.7487, "step": 1670 },
    { "epoch": 0.52, "grad_norm": 0.4206812162224315, "learning_rate": 0.00011195517466816892, "loss": 0.7663, "step": 1675 },
    { "epoch": 0.52, "grad_norm": 0.3528935885168195, "learning_rate": 0.00011142146696706086, "loss": 0.7075, "step": 1680 },
    { "epoch": 0.52, "grad_norm": 0.3022231077132756, "learning_rate": 0.00011088742942622285, "loss": 0.7005, "step": 1685 },
    { "epoch": 0.52, "grad_norm": 0.24230122499008153, "learning_rate": 0.00011035307746808696, "loss": 0.7103, "step": 1690 },
    { "epoch": 0.52, "grad_norm": 0.3274240826179655, "learning_rate": 0.00010981842652416525, "loss": 0.7585, "step": 1695 },
    { "epoch": 0.52, "grad_norm": 0.3226818393613587, "learning_rate": 0.00010928349203460421, "loss": 0.6873, "step": 1700 },
    { "epoch": 0.52, "grad_norm": 0.42160428435071856, "learning_rate": 0.00010874828944773884, "loss": 0.7033, "step": 1705 },
    { "epoch": 0.53, "grad_norm": 0.3673664828653425, "learning_rate": 0.0001082128342196464, "loss": 0.7568, "step": 1710 },
    { "epoch": 0.53, "grad_norm": 0.35369832231150045, "learning_rate": 0.00010767714181370032, "loss": 0.7459, "step": 1715 },
    { "epoch": 0.53, "grad_norm": 0.3648184560113796, "learning_rate": 0.00010714122770012332, "loss": 0.7744, "step": 1720 },
    { "epoch": 0.53, "grad_norm": 0.4505619268522559, "learning_rate": 0.0001066051073555409, "loss": 0.7257, "step": 1725 },
    { "epoch": 0.53, "grad_norm": 0.43443202284742777, "learning_rate": 0.00010606879626253425, "loss": 0.7188, "step": 1730 },
    { "epoch": 0.53, "grad_norm": 0.3553258041770261, "learning_rate": 0.00010553230990919316, "loss": 0.7459, "step": 1735 },
    { "epoch": 0.54, "grad_norm": 0.4661654069610038, "learning_rate": 0.00010499566378866879, "loss": 0.7836, "step": 1740 },
    { "epoch": 0.54, "grad_norm": 0.37584682327967367, "learning_rate": 0.00010445887339872613, "loss": 0.7602, "step": 1745 },
    { "epoch": 0.54, "grad_norm": 0.39145966702225243, "learning_rate": 0.00010392195424129663, "loss": 0.7742, "step": 1750 },
    { "epoch": 0.54, "grad_norm": 0.3393184813627934, "learning_rate": 0.0001033849218220303, "loss": 0.7641, "step": 1755 },
    { "epoch": 0.54, "grad_norm": 0.3324768161048583, "learning_rate": 0.00010284779164984808, "loss": 0.7084, "step": 1760 },
    { "epoch": 0.54, "grad_norm": 0.4536643875844217, "learning_rate": 0.00010231057923649395, "loss": 0.7546, "step": 1765 },
    { "epoch": 0.54, "grad_norm": 0.3383053206020978, "learning_rate": 0.00010177330009608679, "loss": 0.7897, "step": 1770 },
    { "epoch": 0.55, "grad_norm": 0.3291950908164226, "learning_rate": 0.00010123596974467267, "loss": 0.837, "step": 1775 },
    { "epoch": 0.55, "grad_norm": 0.40591985948567333, "learning_rate": 0.00010069860369977644, "loss": 0.7881, "step": 1780 },
    { "epoch": 0.55, "grad_norm": 0.3947516646576018, "learning_rate": 0.0001001612174799538, "loss": 0.7554, "step": 1785 },
    { "epoch": 0.55, "grad_norm": 0.48999744278201957, "learning_rate": 9.962382660434302e-05, "loss": 0.7049, "step": 1790 },
    { "epoch": 0.55, "grad_norm": 0.27763093083945417, "learning_rate": 9.908644659221692e-05, "loss": 0.7906, "step": 1795 },
    { "epoch": 0.55, "grad_norm": 0.36597705216081855, "learning_rate": 9.854909296253454e-05, "loss": 0.7717, "step": 1800 },
    { "epoch": 0.56, "grad_norm": 0.361260421586406, "learning_rate": 9.801178123349298e-05, "loss": 0.8052, "step": 1805 },
    { "epoch": 0.56, "grad_norm": 0.40479237805543866, "learning_rate": 9.747452692207944e-05, "loss": 0.6528, "step": 1810 },
    { "epoch": 0.56, "grad_norm": 0.3337778576325595, "learning_rate": 9.693734554362274e-05, "loss": 0.7956, "step": 1815 },
    { "epoch": 0.56, "grad_norm": 0.352206821846608, "learning_rate": 9.640025261134566e-05, "loss": 0.8004, "step": 1820 },
    { "epoch": 0.56, "grad_norm": 0.3007022043507481, "learning_rate": 9.586326363591667e-05, "loss": 0.6586, "step": 1825 },
    { "epoch": 0.56, "grad_norm": 0.32806169397898344, "learning_rate": 9.532639412500214e-05, "loss": 0.6469, "step": 1830 },
    { "epoch": 0.56, "grad_norm": 0.2948353441185244, "learning_rate": 9.478965958281831e-05, "loss": 0.772, "step": 1835 },
    { "epoch": 0.57, "grad_norm": 0.29433563822493, "learning_rate": 9.425307550968379e-05, "loss": 0.7587, "step": 1840 },
    { "epoch": 0.57, "grad_norm": 0.2929390819806653, "learning_rate": 9.371665740157177e-05, "loss": 0.7641, "step": 1845 },
    { "epoch": 0.57, "grad_norm": 0.36587748924129493, "learning_rate": 9.318042074966249e-05, "loss": 0.7423, "step": 1850 },
    { "epoch": 0.57, "grad_norm": 0.3157914575950516, "learning_rate": 9.2644381039896e-05, "loss": 0.7802, "step": 1855 },
    { "epoch": 0.57, "grad_norm": 0.3083734823157643, "learning_rate": 9.210855375252488e-05, "loss": 0.6806, "step": 1860 },
    { "epoch": 0.57, "grad_norm": 0.37273540588458964, "learning_rate": 9.157295436166706e-05, "loss": 0.8018, "step": 1865 },
    { "epoch": 0.58, "grad_norm": 0.2891457780890995, "learning_rate": 9.103759833485921e-05, "loss": 0.7924, "step": 1870 },
    { "epoch": 0.58, "grad_norm": 0.31880678342943103, "learning_rate": 9.050250113260988e-05, "loss": 0.6784, "step": 1875 },
    { "epoch": 0.58, "grad_norm": 0.38652296771171907, "learning_rate": 8.996767820795295e-05, "loss": 0.8423, "step": 1880 },
    { "epoch": 0.58, "grad_norm": 0.36151176691802633, "learning_rate": 8.943314500600153e-05, "loss": 0.7657, "step": 1885 },
    { "epoch": 0.58, "grad_norm": 0.3630982909299649, "learning_rate": 8.889891696350182e-05, "loss": 0.7316, "step": 1890 },
    { "epoch": 0.58, "grad_norm": 0.346561432187551, "learning_rate": 8.836500950838743e-05, "loss": 0.7937, "step": 1895 },
    { "epoch": 0.58, "grad_norm": 0.3552138882564471, "learning_rate": 8.783143805933356e-05, "loss": 0.7688, "step": 1900 },
    { "epoch": 0.59, "grad_norm": 0.3883059056946058, "learning_rate": 8.729821802531212e-05, "loss": 0.8022, "step": 1905 },
    { "epoch": 0.59, "grad_norm": 0.3654198034463761, "learning_rate": 8.676536480514646e-05, "loss": 0.6797, "step": 1910 },
    { "epoch": 0.59, "grad_norm": 0.3437677388699394, "learning_rate": 8.623289378706665e-05, "loss": 0.8503, "step": 1915 },
    { "epoch": 0.59, "grad_norm": 0.23831382342326574, "learning_rate": 8.570082034826525e-05, "loss": 0.725, "step": 1920 },
    { "epoch": 0.59, "grad_norm": 0.4978109719850785, "learning_rate": 8.51691598544532e-05, "loss": 0.8173, "step": 1925 },
    { "epoch": 0.59, "grad_norm": 0.3849488236961706, "learning_rate": 8.463792765941598e-05, "loss": 0.7935, "step": 1930 },
    { "epoch": 0.6, "grad_norm": 0.2564830548422943, "learning_rate": 8.410713910457022e-05, "loss": 0.7616, "step": 1935 },
    { "epoch": 0.6, "grad_norm": 0.3883061081379379, "learning_rate": 8.357680951852074e-05, "loss": 0.7351, "step": 1940 },
    { "epoch": 0.6, "grad_norm": 0.3506058449194061, "learning_rate": 8.30469542166179e-05, "loss": 0.7693, "step": 1945 },
    { "epoch": 0.6, "grad_norm": 0.30929676711753123, "learning_rate": 8.25175885005151e-05, "loss": 0.7873, "step": 1950 },
    { "epoch": 0.6, "grad_norm": 0.40865576499509826, "learning_rate": 8.19887276577271e-05, "loss": 0.8042, "step": 1955 },
    { "epoch": 0.6, "grad_norm": 0.5195995711187212, "learning_rate": 8.146038696118855e-05, "loss": 0.7973, "step": 1960 },
    { "epoch": 0.6, "grad_norm": 0.42197209504725125, "learning_rate": 8.093258166881262e-05, "loss": 0.7533, "step": 1965 },
    { "epoch": 0.61, "grad_norm": 0.30422958535775585, "learning_rate": 8.04053270230508e-05, "loss": 0.779, "step": 1970 },
    { "epoch": 0.61, "grad_norm": 0.3455509672202836, "learning_rate": 7.987863825045234e-05, "loss": 0.8111, "step": 1975 },
    { "epoch": 0.61, "grad_norm": 0.23483119931888347, "learning_rate": 7.935253056122478e-05, "loss": 0.6691, "step": 1980 },
    { "epoch": 0.61, "grad_norm": 0.5096365631799179, "learning_rate": 7.882701914879454e-05, "loss": 0.8173, "step": 1985 },
    { "epoch": 0.61, "grad_norm": 0.27454232980225196, "learning_rate": 7.83021191893682e-05, "loss": 0.8318, "step": 1990 },
    { "epoch": 0.61, "grad_norm": 0.25340939505328935, "learning_rate": 7.777784584149431e-05, "loss": 0.7749, "step": 1995 },
    { "epoch": 0.62, "grad_norm": 0.41218592002469173, "learning_rate": 7.725421424562541e-05, "loss": 0.7486, "step": 2000 },
    { "epoch": 0.62, "grad_norm": 0.3166340386687328, "learning_rate": 7.673123952368105e-05,
|
"loss": 0.7371, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.37125596925100546, |
|
"learning_rate": 7.620893677861097e-05, |
|
"loss": 0.8205, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.389340046711289, |
|
"learning_rate": 7.568732109395882e-05, |
|
"loss": 0.8052, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.297511489273485, |
|
"learning_rate": 7.516640753342677e-05, |
|
"loss": 0.8116, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.3551932787364764, |
|
"learning_rate": 7.464621114044041e-05, |
|
"loss": 0.7256, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.32976411229944613, |
|
"learning_rate": 7.41267469377143e-05, |
|
"loss": 0.7779, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.36641305934003204, |
|
"learning_rate": 7.360802992681803e-05, |
|
"loss": 0.7769, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.32848405433392913, |
|
"learning_rate": 7.309007508774319e-05, |
|
"loss": 0.7449, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.3818192607183943, |
|
"learning_rate": 7.257289737847067e-05, |
|
"loss": 0.7298, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.3956889666509929, |
|
"learning_rate": 7.205651173453859e-05, |
|
"loss": 0.7438, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.3186630869883142, |
|
"learning_rate": 7.154093306861115e-05, |
|
"loss": 0.8091, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.33431044470129717, |
|
"learning_rate": 7.102617627004795e-05, |
|
"loss": 0.7518, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.32535315210688565, |
|
"learning_rate": 7.051225620447375e-05, |
|
"loss": 0.8321, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.4508357834061351, |
|
"learning_rate": 6.999918771334952e-05, |
|
"loss": 0.7282, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.3512613827114045, |
|
"learning_rate": 6.948698561354363e-05, |
|
"loss": 0.7826, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.49837144853088533, |
|
"learning_rate": 6.897566469690397e-05, |
|
"loss": 0.795, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.4003697684296247, |
|
"learning_rate": 6.846523972983085e-05, |
|
"loss": 0.7951, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.3815043269956921, |
|
"learning_rate": 6.795572545285044e-05, |
|
"loss": 0.826, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.3291683320960395, |
|
"learning_rate": 6.74471365801893e-05, |
|
"loss": 0.7708, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.41704151240520887, |
|
"learning_rate": 6.693948779934911e-05, |
|
"loss": 0.7386, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.463623793653466, |
|
"learning_rate": 6.643279377068283e-05, |
|
"loss": 0.7713, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.3658375594477012, |
|
"learning_rate": 6.592706912697124e-05, |
|
"loss": 0.7786, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.4059447230155753, |
|
"learning_rate": 6.542232847300015e-05, |
|
"loss": 0.798, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.3927246312306725, |
|
"learning_rate": 6.491858638513899e-05, |
|
"loss": 0.8166, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.35333239481209877, |
|
"learning_rate": 6.441585741091955e-05, |
|
"loss": 0.7539, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.3623671701689697, |
|
"learning_rate": 6.391415606861608e-05, |
|
"loss": 0.8162, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.430064026231262, |
|
"learning_rate": 6.341349684682576e-05, |
|
"loss": 0.7593, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.30707444492883157, |
|
"learning_rate": 6.291389420405062e-05, |
|
"loss": 0.7593, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.29281767006409765, |
|
"learning_rate": 6.241536256827978e-05, |
|
"loss": 0.7074, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.3397684880342664, |
|
"learning_rate": 6.191791633657268e-05, |
|
"loss": 0.7077, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.35070530863747645, |
|
"learning_rate": 6.142156987464367e-05, |
|
"loss": 0.7888, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.31884184127852916, |
|
"learning_rate": 6.0926337516446784e-05, |
|
"loss": 0.8045, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.34522310174070975, |
|
"learning_rate": 6.043223356376197e-05, |
|
"loss": 0.8115, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.35929303458552225, |
|
"learning_rate": 5.9939272285782066e-05, |
|
"loss": 0.8234, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.3835859771257563, |
|
"learning_rate": 5.9447467918700614e-05, |
|
"loss": 0.7295, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.33889717245375717, |
|
"learning_rate": 5.895683466530091e-05, |
|
"loss": 0.7491, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.34625485711737686, |
|
"learning_rate": 5.8467386694545635e-05, |
|
"loss": 0.7882, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.3834886156777842, |
|
"learning_rate": 5.797913814116781e-05, |
|
"loss": 0.7093, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.3892980195228429, |
|
"learning_rate": 5.7492103105262715e-05, |
|
"loss": 0.794, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.39210633693040825, |
|
"learning_rate": 5.7006295651880246e-05, |
|
"loss": 0.7566, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.3582797057469045, |
|
"learning_rate": 5.6521729810619317e-05, |
|
"loss": 0.8021, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.3542924342264584, |
|
"learning_rate": 5.603841957522227e-05, |
|
"loss": 0.756, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.36575349651181366, |
|
"learning_rate": 5.555637890317091e-05, |
|
"loss": 0.7921, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.38535314462569586, |
|
"learning_rate": 5.507562171528342e-05, |
|
"loss": 0.7781, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.39735016460723493, |
|
"learning_rate": 5.459616189531234e-05, |
|
"loss": 0.6632, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.4056677466996733, |
|
"learning_rate": 5.411801328954368e-05, |
|
"loss": 0.7334, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.42376106078500364, |
|
"learning_rate": 5.36411897063968e-05, |
|
"loss": 0.8772, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.35144323747646544, |
|
"learning_rate": 5.316570491602606e-05, |
|
"loss": 0.7793, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.3783769784963828, |
|
"learning_rate": 5.269157264992276e-05, |
|
"loss": 0.8655, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.44209683459363136, |
|
"learning_rate": 5.221880660051881e-05, |
|
"loss": 0.8032, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.4882374682401987, |
|
"learning_rate": 5.1747420420791196e-05, |
|
"loss": 0.7007, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.3237759848919934, |
|
"learning_rate": 5.127742772386786e-05, |
|
"loss": 0.7897, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.36606432111465076, |
|
"learning_rate": 5.0808842082634314e-05, |
|
"loss": 0.8064, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.40999182095921494, |
|
"learning_rate": 5.0341677029341895e-05, |
|
"loss": 0.7103, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.3272955637327382, |
|
"learning_rate": 4.987594605521682e-05, |
|
"loss": 0.6785, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.3490487483696679, |
|
"learning_rate": 4.941166261007077e-05, |
|
"loss": 0.7292, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.3433624374602265, |
|
"learning_rate": 4.894884010191211e-05, |
|
"loss": 0.6762, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.325285651430037, |
|
"learning_rate": 4.848749189655915e-05, |
|
"loss": 0.7659, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.31571712296306303, |
|
"learning_rate": 4.802763131725378e-05, |
|
"loss": 0.7736, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.35722394621197917, |
|
"learning_rate": 4.756927164427685e-05, |
|
"loss": 0.7155, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.36377115960758405, |
|
"learning_rate": 4.711242611456469e-05, |
|
"loss": 0.7326, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.4323956507240235, |
|
"learning_rate": 4.665710792132671e-05, |
|
"loss": 0.7775, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.4046174615365396, |
|
"learning_rate": 4.620333021366463e-05, |
|
"loss": 0.7643, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.3796515442594094, |
|
"learning_rate": 4.5751106096192476e-05, |
|
"loss": 0.6264, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.46275385650934453, |
|
"learning_rate": 4.5300448628658254e-05, |
|
"loss": 0.688, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.4343032727751153, |
|
"learning_rate": 4.485137082556685e-05, |
|
"loss": 0.7238, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.42658163696603996, |
|
"learning_rate": 4.4403885655804115e-05, |
|
"loss": 0.7691, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.5374930429188296, |
|
"learning_rate": 4.395800604226229e-05, |
|
"loss": 0.8293, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.5178849424936606, |
|
"learning_rate": 4.351374486146706e-05, |
|
"loss": 0.6683, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.2580914515654273, |
|
"learning_rate": 4.307111494320524e-05, |
|
"loss": 0.6295, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.38787306791139886, |
|
"learning_rate": 4.263012907015477e-05, |
|
"loss": 0.6748, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.4320242881816677, |
|
"learning_rate": 4.219079997751515e-05, |
|
"loss": 0.6848, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.39451448787145293, |
|
"learning_rate": 4.175314035264002e-05, |
|
"loss": 0.7691, |
|
"step": 2365 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.3024402074904783, |
|
"learning_rate": 4.131716283467034e-05, |
|
"loss": 0.7674, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.39413439687935803, |
|
"learning_rate": 4.0882880014169865e-05, |
|
"loss": 0.83, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.45210137336011785, |
|
"learning_rate": 4.045030443276115e-05, |
|
"loss": 0.7117, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.4767595879985179, |
|
"learning_rate": 4.001944858276356e-05, |
|
"loss": 0.7424, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.3405433186754331, |
|
"learning_rate": 3.9590324906832435e-05, |
|
"loss": 0.7944, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.38698863622073953, |
|
"learning_rate": 3.9162945797599895e-05, |
|
"loss": 0.7486, |
|
"step": 2395 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.33122873819033993, |
|
"learning_rate": 3.873732359731661e-05, |
|
"loss": 0.7339, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.3593189874663698, |
|
"learning_rate": 3.831347059749587e-05, |
|
"loss": 0.8308, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.32020688896625343, |
|
"learning_rate": 3.78913990385582e-05, |
|
"loss": 0.7932, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.35408477803883764, |
|
"learning_rate": 3.7471121109478004e-05, |
|
"loss": 0.6155, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.33845060971026897, |
|
"learning_rate": 3.705264894743167e-05, |
|
"loss": 0.7798, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.5583150637853672, |
|
"learning_rate": 3.6635994637446845e-05, |
|
"loss": 0.6673, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.4929136893319016, |
|
"learning_rate": 3.6221170212053766e-05, |
|
"loss": 0.8048, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.5429072066090833, |
|
"learning_rate": 3.5808187650937276e-05, |
|
"loss": 0.7507, |
|
"step": 2435 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.46201271372482866, |
|
"learning_rate": 3.53970588805914e-05, |
|
"loss": 0.7259, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.3129228737179152, |
|
"learning_rate": 3.498779577397453e-05, |
|
"loss": 0.7715, |
|
"step": 2445 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.32996488069999697, |
|
"learning_rate": 3.458041015016681e-05, |
|
"loss": 0.7797, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.3214733415000198, |
|
"learning_rate": 3.4174913774028485e-05, |
|
"loss": 0.7226, |
|
"step": 2455 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.42589629830207104, |
|
"learning_rate": 3.3771318355860593e-05, |
|
"loss": 0.7218, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.3047848353555366, |
|
"learning_rate": 3.336963555106638e-05, |
|
"loss": 0.7956, |
|
"step": 2465 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.34359240246923894, |
|
"learning_rate": 3.296987695981493e-05, |
|
"loss": 0.666, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.4144993432032501, |
|
"learning_rate": 3.257205412670605e-05, |
|
"loss": 0.7416, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.3731255551783685, |
|
"learning_rate": 3.217617854043707e-05, |
|
"loss": 0.8345, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.4050140213208934, |
|
"learning_rate": 3.178226163347067e-05, |
|
"loss": 0.7122, |
|
"step": 2485 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.4002439491991807, |
|
"learning_rate": 3.139031478170522e-05, |
|
"loss": 0.6805, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.42917304751384394, |
|
"learning_rate": 3.100034930414585e-05, |
|
"loss": 0.733, |
|
"step": 2495 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.4136643224459766, |
|
"learning_rate": 3.0612376462577784e-05, |
|
"loss": 0.7807, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.3890309362984174, |
|
"learning_rate": 3.0226407461241056e-05, |
|
"loss": 0.643, |
|
"step": 2505 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.4262911129142299, |
|
"learning_rate": 2.9842453446506868e-05, |
|
"loss": 0.823, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.30650720385697705, |
|
"learning_rate": 2.9460525506555947e-05, |
|
"loss": 0.7002, |
|
"step": 2515 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.3980581468888342, |
|
"learning_rate": 2.9080634671057892e-05, |
|
"loss": 0.7899, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.4056498788574052, |
|
"learning_rate": 2.8702791910853144e-05, |
|
"loss": 0.701, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.4548787143471859, |
|
"learning_rate": 2.832700813763579e-05, |
|
"loss": 0.8386, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.4404855029802744, |
|
"learning_rate": 2.7953294203638625e-05, |
|
"loss": 0.7813, |
|
"step": 2535 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.40369488587415225, |
|
"learning_rate": 2.7581660901319663e-05, |
|
"loss": 0.7886, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.44025189268752124, |
|
"learning_rate": 2.7212118963050592e-05, |
|
"loss": 0.6854, |
|
"step": 2545 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.38769461649930276, |
|
"learning_rate": 2.6844679060806666e-05, |
|
"loss": 0.7533, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.39043309802901266, |
|
"learning_rate": 2.647935180585861e-05, |
|
"loss": 0.7324, |
|
"step": 2555 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.4031278133263342, |
|
"learning_rate": 2.6116147748466136e-05, |
|
"loss": 0.8095, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.31989753369104523, |
|
"learning_rate": 2.575507737757341e-05, |
|
"loss": 0.7635, |
|
"step": 2565 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.3825808697754477, |
|
"learning_rate": 2.5396151120505797e-05, |
|
"loss": 0.7067, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.34014553791218866, |
|
"learning_rate": 2.5039379342669156e-05, |
|
"loss": 0.7454, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.3441742765574342, |
|
"learning_rate": 2.4684772347250194e-05, |
|
"loss": 0.7269, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.268563145640876, |
|
"learning_rate": 2.433234037491904e-05, |
|
"loss": 0.7188, |
|
"step": 2585 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.44327204267255527, |
|
"learning_rate": 2.3982093603533485e-05, |
|
"loss": 0.6476, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.47944546289888046, |
|
"learning_rate": 2.3634042147845036e-05, |
|
"loss": 0.7312, |
|
"step": 2595 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.3735226907786184, |
|
"learning_rate": 2.3288196059206936e-05, |
|
"loss": 0.8098, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.48173286401906895, |
|
"learning_rate": 2.2944565325283608e-05, |
|
"loss": 0.7692, |
|
"step": 2605 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.454018593107754, |
|
"learning_rate": 2.260315986976258e-05, |
|
"loss": 0.7258, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.4102026616293206, |
|
"learning_rate": 2.2263989552067644e-05, |
|
"loss": 0.8175, |
|
"step": 2615 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.36721285813725996, |
|
"learning_rate": 2.1927064167074197e-05, |
|
"loss": 0.7741, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.5058554441705722, |
|
"learning_rate": 2.1592393444826377e-05, |
|
"loss": 0.7664, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.41227200976610034, |
|
"learning_rate": 2.125998705025619e-05, |
|
"loss": 0.7922, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.3739806966013022, |
|
"learning_rate": 2.0929854582904095e-05, |
|
"loss": 0.6827, |
|
"step": 2635 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.3526698208142984, |
|
"learning_rate": 2.060200557664215e-05, |
|
"loss": 0.7712, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.355624340580361, |
|
"learning_rate": 2.0276449499398352e-05, |
|
"loss": 0.7217, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.35879322380857276, |
|
"learning_rate": 1.9953195752883535e-05, |
|
"loss": 0.8101, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.4381419678532357, |
|
"learning_rate": 1.9632253672319466e-05, |
|
"loss": 0.7784, |
|
"step": 2655 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.3338214481525901, |
|
"learning_rate": 1.9313632526169713e-05, |
|
"loss": 0.7633, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.3419374997650999, |
|
"learning_rate": 1.899734151587157e-05, |
|
"loss": 0.6726, |
|
"step": 2665 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.39283447932176424, |
|
"learning_rate": 1.868338977557058e-05, |
|
"loss": 0.7787, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.37003709150492736, |
|
"learning_rate": 1.837178637185666e-05, |
|
"loss": 0.7466, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.3961613001539733, |
|
"learning_rate": 1.8062540303502284e-05, |
|
"loss": 0.7097, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.35142355910690376, |
|
"learning_rate": 1.7755660501202565e-05, |
|
"loss": 0.6774, |
|
"step": 2685 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.41038393576069904, |
|
"learning_rate": 1.745115582731749e-05, |
|
"loss": 0.7496, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.39409588845344945, |
|
"learning_rate": 1.7149035075615794e-05, |
|
"loss": 0.7187, |
|
"step": 2695 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.44791745932431604, |
|
"learning_rate": 1.6849306971021116e-05, |
|
"loss": 0.7898, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.3525758016199936, |
|
"learning_rate": 1.6551980169360005e-05, |
|
"loss": 0.7511, |
|
"step": 2705 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.36563645526797983, |
|
"learning_rate": 1.6257063257111938e-05, |
|
"loss": 0.7397, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.3351064266182499, |
|
"learning_rate": 1.596456475116147e-05, |
|
"loss": 0.7379, |
|
"step": 2715 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.5068000232220052, |
|
"learning_rate": 1.567449309855199e-05, |
|
"loss": 0.751, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.43979138745697033, |
|
"learning_rate": 1.5386856676242146e-05, |
|
"loss": 0.8085, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.356641089597573, |
|
"learning_rate": 1.5101663790863596e-05, |
|
"loss": 0.6256, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.44817921454892296, |
|
"learning_rate": 1.4818922678481429e-05, |
|
"loss": 0.7675, |
|
"step": 2735 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.38078400449692273, |
|
"learning_rate": 1.4538641504355965e-05, |
|
"loss": 0.689, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.39634310148432367, |
|
"learning_rate": 1.4260828362707301e-05, |
|
"loss": 0.7727, |
|
"step": 2745 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.3227302452177864, |
|
"learning_rate": 1.3985491276481323e-05, |
|
"loss": 0.6711, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.35229071646321697, |
|
"learning_rate": 1.3712638197118111e-05, |
|
"loss": 0.7711, |
|
"step": 2755 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.3536271561860169, |
|
"learning_rate": 1.3442277004322257e-05, |
|
"loss": 0.8075, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.348867332644309, |
|
"learning_rate": 1.3174415505835436e-05, |
|
"loss": 0.7561, |
|
"step": 2765 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.3535699706794319, |
|
"learning_rate": 1.2909061437210669e-05, |
|
"loss": 0.6532, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.29622730647422324, |
|
"learning_rate": 1.264622246158924e-05, |
|
"loss": 0.7651, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.36090199758429575, |
|
"learning_rate": 1.2385906169479167e-05, |
|
"loss": 0.8015, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.402156004342123, |
|
"learning_rate": 1.2128120078536076e-05, |
|
"loss": 0.6387, |
|
"step": 2785 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.42634964362612304, |
|
"learning_rate": 1.1872871633346094e-05, |
|
"loss": 0.7452, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.4198409057077085, |
|
"learning_rate": 1.1620168205210869e-05, |
|
"loss": 0.7722, |
|
"step": 2795 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.32753834269003024, |
|
"learning_rate": 1.1370017091934714e-05, |
|
"loss": 0.6906, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.32874585989735094, |
|
"learning_rate": 1.1122425517613722e-05, |
|
"loss": 0.6583, |
|
"step": 2805 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.40652778318077754, |
|
"learning_rate": 1.0877400632427359e-05, |
|
"loss": 0.674, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.507341912219104, |
|
"learning_rate": 1.0634949512431814e-05, |
|
"loss": 0.7677, |
|
"step": 2815 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.3696792851429222, |
|
"learning_rate": 1.0395079159355658e-05, |
|
"loss": 0.7034, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.4493375882862214, |
|
"learning_rate": 1.0157796500397699e-05, |
|
"loss": 0.7487, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.41029746791384736, |
|
"learning_rate": 9.92310838802698e-06, |
|
"loss": 0.7405, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.2972360830137905, |
|
"learning_rate": 9.691021599784711e-06, |
|
"loss": 0.6979, |
|
"step": 2835 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.4032566104643188, |
|
"learning_rate": 9.461542838088722e-06, |
|
"loss": 0.7898, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.27918369114356745, |
|
"learning_rate": 9.23467873003977e-06, |
|
"loss": 0.8092, |
|
"step": 2845 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.36249726345459377, |
|
"learning_rate": 9.010435827230313e-06, |
|
"loss": 0.6445, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.3964708844545875, |
|
"learning_rate": 8.788820605555082e-06, |
|
"loss": 0.7462, |
|
"step": 2855 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.3502801511645061, |
|
"learning_rate": 8.569839465024299e-06, |
|
"loss": 0.7233, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.4033170990560745, |
|
"learning_rate": 8.35349872957869e-06, |
|
"loss": 0.8105, |
|
"step": 2865 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.3954070605648136, |
|
"learning_rate": 8.139804646906923e-06, |
|
"loss": 0.7059, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.37474529564255277, |
|
"learning_rate": 7.928763388265181e-06, |
|
"loss": 0.8582, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.32809524672734786, |
|
"learning_rate": 7.720381048298897e-06, |
|
"loss": 0.7581, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.4149259283243587, |
|
"learning_rate": 7.5146636448668485e-06, |
|
"loss": 0.7735, |
|
"step": 2885 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.4263926138100582, |
|
"learning_rate": 7.3116171188671865e-06, |
|
"loss": 0.8028, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.45104438156690363, |
|
"learning_rate": 7.111247334066129e-06, |
|
"loss": 0.752, |
|
"step": 2895 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.43860221599692545, |
|
"learning_rate": 6.913560076928361e-06, |
|
"loss": 0.7119, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.3597066697079075, |
|
"learning_rate": 6.71856105645009e-06, |
|
"loss": 0.7666, |
|
"step": 2905 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.5298228542098834, |
|
"learning_rate": 6.526255903994105e-06, |
|
"loss": 0.6903, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.348385379079852, |
|
"learning_rate": 6.336650173127223e-06, |
|
"loss": 0.7291, |
|
"step": 2915 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.40348728722999555, |
|
"learning_rate": 6.149749339459787e-06, |
|
"loss": 0.6929, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.42152434212067974, |
|
"learning_rate": 5.96555880048767e-06, |
|
"loss": 0.7092, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.3418766857878975, |
|
"learning_rate": 5.784083875436286e-06, |
|
"loss": 0.7017, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.4129577303513544, |
|
"learning_rate": 5.605329805107084e-06, |
|
"loss": 0.8389, |
|
"step": 2935 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.34322551067313023, |
|
"learning_rate": 5.429301751726068e-06, |
|
"loss": 0.8468, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.42129646163250906, |
|
"learning_rate": 5.256004798794889e-06, |
|
"loss": 0.7467, |
|
"step": 2945 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.38112811696400956, |
|
"learning_rate": 5.085443950943858e-06, |
|
"loss": 0.6878, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.38823071010954757, |
|
"learning_rate": 4.917624133787535e-06, |
|
"loss": 0.839, |
|
"step": 2955 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.41462150440141854, |
|
"learning_rate": 4.752550193782457e-06, |
|
"loss": 0.7937, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.47038565975041013, |
|
"learning_rate": 4.590226898087169e-06, |
|
"loss": 0.7394, |
|
"step": 2965 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.42313697531774797, |
|
"learning_rate": 4.430658934424536e-06, |
|
"loss": 0.7365, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.37056389807076956, |
|
"learning_rate": 4.2738509109464194e-06, |
|
"loss": 0.7771, |
|
"step": 2975 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.4282991333253571, |
|
"learning_rate": 4.119807356100536e-06, |
|
"loss": 0.8332, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.3399316066449957, |
|
"learning_rate": 3.968532718499718e-06, |
|
"loss": 0.719, |
|
"step": 2985 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.3045272007549396, |
|
"learning_rate": 3.8200313667934415e-06, |
|
"loss": 0.7398, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.3940052052861745, |
|
"learning_rate": 3.674307589541637e-06, |
|
"loss": 0.6926, |
|
"step": 2995 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.5430719327118287, |
|
"learning_rate": 3.5313655950908964e-06, |
|
"loss": 0.724, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.3409882746688945, |
|
"learning_rate": 3.391209511452853e-06, |
|
"loss": 0.7768, |
|
"step": 3005 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.32988045764813406, |
|
"learning_rate": 3.253843386185085e-06, |
|
"loss": 0.7503, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.30435984793513565, |
|
"learning_rate": 3.1192711862740865e-06, |
|
"loss": 0.7373, |
|
"step": 3015 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.4219409570685578, |
|
"learning_rate": 2.9874967980208724e-06, |
|
"loss": 0.7532, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.40198268281516975, |
|
"learning_rate": 2.858524026928555e-06, |
|
"loss": 0.746, |
|
"step": 3025 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.5529913137285583, |
|
"learning_rate": 2.7323565975926222e-06, |
|
"loss": 0.8412, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.5641310624198427, |
|
"learning_rate": 2.6089981535932453e-06, |
|
"loss": 0.7381, |
|
"step": 3035 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.4070763719787106, |
|
"learning_rate": 2.4884522573901505e-06, |
|
"loss": 0.7469, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.412932484046869, |
|
"learning_rate": 2.3707223902196595e-06, |
|
"loss": 0.8027, |
|
"step": 3045 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.3698602015181267, |
|
"learning_rate": 2.2558119519942357e-06, |
|
"loss": 0.7422, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.41987825527789996, |
|
"learning_rate": 2.143724261204194e-06, |
|
"loss": 0.7901, |
|
"step": 3055 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.4558561938965998, |
|
"learning_rate": 2.034462554821992e-06, |
|
"loss": 0.7254, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.31521663581005915, |
|
"learning_rate": 1.928029988208635e-06, |
|
"loss": 0.7234, |
|
"step": 3065 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.4536950604685854, |
|
"learning_rate": 1.8244296350226398e-06, |
|
"loss": 0.8439, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.3324327843215253, |
|
"learning_rate": 1.7236644871312047e-06, |
|
"loss": 0.7318, |
|
"step": 3075 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.3441401957085473, |
|
"learning_rate": 1.6257374545238457e-06, |
|
"loss": 0.7219, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.31548933651921574, |
|
"learning_rate": 1.530651365228375e-06, |
|
"loss": 0.6584, |
|
"step": 3085 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.43658195529889465, |
|
"learning_rate": 1.4384089652291543e-06, |
|
"loss": 0.8155, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.28926411213003883, |
|
"learning_rate": 1.349012918387904e-06, |
|
"loss": 0.7126, |
|
"step": 3095 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.5240863579149662, |
|
"learning_rate": 1.2624658063666639e-06, |
|
"loss": 0.8585, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.34319193062085684, |
|
"learning_rate": 1.1787701285533193e-06, |
|
"loss": 0.7302, |
|
"step": 3105 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.37091855665958207, |
|
"learning_rate": 1.0979283019893704e-06, |
|
"loss": 0.8102, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.4173368619893207, |
|
"learning_rate": 1.019942661300166e-06, |
|
"loss": 0.8052, |
|
"step": 3115 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.2740737663998489, |
|
"learning_rate": 9.448154586274794e-07, |
|
"loss": 0.6628, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.39165587634667004, |
|
"learning_rate": 8.725488635644152e-07, |
|
"loss": 0.8068, |
|
"step": 3125 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.3462783087024963, |
|
"learning_rate": 8.031449630928167e-07, |
|
"loss": 0.755, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.3501973539234669, |
|
"learning_rate": 7.366057615229904e-07, |
|
"loss": 0.8435, |
|
"step": 3135 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.3089654640513169, |
|
"learning_rate": 6.729331804357863e-07, |
|
"loss": 0.7804, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.3732310282262484, |
|
"learning_rate": 6.121290586271311e-07, |
|
"loss": 0.7352, |
|
"step": 3145 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.35885721051799435, |
|
"learning_rate": 5.54195152054926e-07, |
|
"loss": 0.719, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.4350143969198028, |
|
"learning_rate": 4.99133133788332e-07, |
|
"loss": 0.7083, |
|
"step": 3155 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.3878552023049681, |
|
"learning_rate": 4.4694459395943077e-07, |
|
"loss": 0.787, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.5569927297196836, |
|
"learning_rate": 3.9763103971734993e-07, |
|
"loss": 0.8012, |
|
"step": 3165 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.4146895070331553, |
|
"learning_rate": 3.5119389518470936e-07, |
|
"loss": 0.6918, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.3438997693511246, |
|
"learning_rate": 3.076345014164872e-07, |
|
"loss": 0.7113, |
|
"step": 3175 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.3948501011870127, |
|
"learning_rate": 2.669541163613176e-07, |
|
"loss": 0.7671, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.3840771316522947, |
|
"learning_rate": 2.2915391482514204e-07, |
|
"loss": 0.7372, |
|
"step": 3185 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.34038583545962225, |
|
"learning_rate": 1.9423498843726962e-07, |
|
"loss": 0.6481, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.3931737526809346, |
|
"learning_rate": 1.6219834561889136e-07, |
|
"loss": 0.7199, |
|
"step": 3195 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.38986041242847186, |
|
"learning_rate": 1.3304491155393674e-07, |
|
"loss": 0.7816, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.4355430101066633, |
|
"learning_rate": 1.0677552816233949e-07, |
|
"loss": 0.6702, |
|
"step": 3205 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.388098975061164, |
|
"learning_rate": 8.339095407575715e-08, |
|
"loss": 0.6688, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.3699381979359166, |
|
"learning_rate": 6.28918646156329e-08, |
|
"loss": 0.8179, |
|
"step": 3215 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.374170534262767, |
|
"learning_rate": 4.5278851773711274e-08, |
|
"loss": 0.7136, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.41255583678961755, |
|
"learning_rate": 3.055242419492954e-08, |
|
"loss": 0.7418, |
|
"step": 3225 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.45777314712192396, |
|
"learning_rate": 1.8713007162740605e-08, |
|
"loss": 0.7344, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.47083864745606346, |
|
"learning_rate": 9.760942586822808e-09, |
|
"loss": 0.8361, |
|
"step": 3235 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.4272718560438237, |
|
"learning_rate": 3.69648899322117e-09, |
|
"loss": 0.7523, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.4516813250246949, |
|
"learning_rate": 5.198215168533693e-10, |
|
"loss": 0.7346, |
|
"step": 3245 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 1.2166675329208374, |
|
"eval_runtime": 1667.535, |
|
"eval_samples_per_second": 1.385, |
|
"eval_steps_per_second": 0.347, |
|
"step": 3248 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 3248, |
|
"total_flos": 6658064344678400.0, |
|
"train_loss": 0.6257000189696627, |
|
"train_runtime": 38581.5438, |
|
"train_samples_per_second": 1.347, |
|
"train_steps_per_second": 0.084 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 3248, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"total_flos": 6658064344678400.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |