|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 10.0, |
|
"eval_steps": 100, |
|
"global_step": 3860, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.05181347150259067, |
|
"grad_norm": 0.1298828125, |
|
"learning_rate": 1.2953367875647668e-07, |
|
"loss": 1.0187, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.10362694300518134, |
|
"grad_norm": 0.11962890625, |
|
"learning_rate": 2.5906735751295336e-07, |
|
"loss": 1.016, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.15544041450777202, |
|
"grad_norm": 0.11962890625, |
|
"learning_rate": 3.886010362694301e-07, |
|
"loss": 1.0219, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.20725388601036268, |
|
"grad_norm": 0.11767578125, |
|
"learning_rate": 5.181347150259067e-07, |
|
"loss": 1.0164, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.25906735751295334, |
|
"grad_norm": 0.126953125, |
|
"learning_rate": 6.476683937823834e-07, |
|
"loss": 1.0119, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.25906735751295334, |
|
"eval_main_loss": 1.0200841426849365, |
|
"eval_main_runtime": 50.8504, |
|
"eval_main_samples_per_second": 30.206, |
|
"eval_main_steps_per_second": 3.776, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.25906735751295334, |
|
"eval_anatomy_loss": 2.9678244590759277, |
|
"eval_anatomy_runtime": 0.2681, |
|
"eval_anatomy_samples_per_second": 7.459, |
|
"eval_anatomy_steps_per_second": 3.729, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.25906735751295334, |
|
"eval_college_mathematics_loss": 2.1804275512695312, |
|
"eval_college_mathematics_runtime": 0.2684, |
|
"eval_college_mathematics_samples_per_second": 7.451, |
|
"eval_college_mathematics_steps_per_second": 3.726, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.25906735751295334, |
|
"eval_international_law_loss": 3.191988229751587, |
|
"eval_international_law_runtime": 0.2665, |
|
"eval_international_law_samples_per_second": 7.505, |
|
"eval_international_law_steps_per_second": 3.752, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.31088082901554404, |
|
"grad_norm": 0.12890625, |
|
"learning_rate": 7.772020725388602e-07, |
|
"loss": 1.0155, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.3626943005181347, |
|
"grad_norm": 0.134765625, |
|
"learning_rate": 9.067357512953369e-07, |
|
"loss": 1.022, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.41450777202072536, |
|
"grad_norm": 0.1328125, |
|
"learning_rate": 1.0362694300518134e-06, |
|
"loss": 1.0071, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.46632124352331605, |
|
"grad_norm": 0.13671875, |
|
"learning_rate": 1.1658031088082903e-06, |
|
"loss": 1.0226, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.5181347150259067, |
|
"grad_norm": 0.1494140625, |
|
"learning_rate": 1.2953367875647669e-06, |
|
"loss": 1.0175, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5181347150259067, |
|
"eval_main_loss": 1.017417311668396, |
|
"eval_main_runtime": 50.9406, |
|
"eval_main_samples_per_second": 30.153, |
|
"eval_main_steps_per_second": 3.769, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5181347150259067, |
|
"eval_anatomy_loss": 2.9614720344543457, |
|
"eval_anatomy_runtime": 0.2677, |
|
"eval_anatomy_samples_per_second": 7.471, |
|
"eval_anatomy_steps_per_second": 3.735, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5181347150259067, |
|
"eval_college_mathematics_loss": 2.1731653213500977, |
|
"eval_college_mathematics_runtime": 0.2679, |
|
"eval_college_mathematics_samples_per_second": 7.466, |
|
"eval_college_mathematics_steps_per_second": 3.733, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5181347150259067, |
|
"eval_international_law_loss": 3.1835579872131348, |
|
"eval_international_law_runtime": 0.2687, |
|
"eval_international_law_samples_per_second": 7.443, |
|
"eval_international_law_steps_per_second": 3.722, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5699481865284974, |
|
"grad_norm": 0.1591796875, |
|
"learning_rate": 1.4248704663212437e-06, |
|
"loss": 1.0125, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.6217616580310881, |
|
"grad_norm": 0.1767578125, |
|
"learning_rate": 1.5544041450777204e-06, |
|
"loss": 1.0142, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.6735751295336787, |
|
"grad_norm": 0.1728515625, |
|
"learning_rate": 1.683937823834197e-06, |
|
"loss": 1.0175, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.7253886010362695, |
|
"grad_norm": 0.205078125, |
|
"learning_rate": 1.8134715025906738e-06, |
|
"loss": 1.0147, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.7772020725388601, |
|
"grad_norm": 0.2294921875, |
|
"learning_rate": 1.9430051813471504e-06, |
|
"loss": 1.0072, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7772020725388601, |
|
"eval_main_loss": 1.0116238594055176, |
|
"eval_main_runtime": 50.9159, |
|
"eval_main_samples_per_second": 30.167, |
|
"eval_main_steps_per_second": 3.771, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7772020725388601, |
|
"eval_anatomy_loss": 2.944845676422119, |
|
"eval_anatomy_runtime": 0.2687, |
|
"eval_anatomy_samples_per_second": 7.443, |
|
"eval_anatomy_steps_per_second": 3.721, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7772020725388601, |
|
"eval_college_mathematics_loss": 2.1590933799743652, |
|
"eval_college_mathematics_runtime": 0.2683, |
|
"eval_college_mathematics_samples_per_second": 7.455, |
|
"eval_college_mathematics_steps_per_second": 3.727, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7772020725388601, |
|
"eval_international_law_loss": 3.1690821647644043, |
|
"eval_international_law_runtime": 0.2666, |
|
"eval_international_law_samples_per_second": 7.501, |
|
"eval_international_law_steps_per_second": 3.75, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.8290155440414507, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 2.072538860103627e-06, |
|
"loss": 1.017, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.8808290155440415, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 2.2020725388601037e-06, |
|
"loss": 1.0084, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.9326424870466321, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 2.3316062176165805e-06, |
|
"loss": 1.0008, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.9844559585492227, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 2.461139896373057e-06, |
|
"loss": 1.0035, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.0362694300518134, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 2.5906735751295338e-06, |
|
"loss": 1.0034, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.0362694300518134, |
|
"eval_main_loss": 1.0014457702636719, |
|
"eval_main_runtime": 50.699, |
|
"eval_main_samples_per_second": 30.296, |
|
"eval_main_steps_per_second": 3.787, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.0362694300518134, |
|
"eval_anatomy_loss": 2.91424298286438, |
|
"eval_anatomy_runtime": 0.2675, |
|
"eval_anatomy_samples_per_second": 7.475, |
|
"eval_anatomy_steps_per_second": 3.738, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.0362694300518134, |
|
"eval_college_mathematics_loss": 2.136711597442627, |
|
"eval_college_mathematics_runtime": 0.2681, |
|
"eval_college_mathematics_samples_per_second": 7.46, |
|
"eval_college_mathematics_steps_per_second": 3.73, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.0362694300518134, |
|
"eval_international_law_loss": 3.133229970932007, |
|
"eval_international_law_runtime": 0.2663, |
|
"eval_international_law_samples_per_second": 7.511, |
|
"eval_international_law_steps_per_second": 3.755, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.0880829015544042, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 2.7202072538860106e-06, |
|
"loss": 0.9984, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.1398963730569949, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 2.8497409326424875e-06, |
|
"loss": 0.9973, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.1917098445595855, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 2.979274611398964e-06, |
|
"loss": 0.9898, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.2435233160621761, |
|
"grad_norm": 0.462890625, |
|
"learning_rate": 3.1088082901554407e-06, |
|
"loss": 0.9841, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.2953367875647668, |
|
"grad_norm": 0.498046875, |
|
"learning_rate": 3.238341968911917e-06, |
|
"loss": 0.9908, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.2953367875647668, |
|
"eval_main_loss": 0.9807333946228027, |
|
"eval_main_runtime": 50.6947, |
|
"eval_main_samples_per_second": 30.299, |
|
"eval_main_steps_per_second": 3.787, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.2953367875647668, |
|
"eval_anatomy_loss": 2.860694408416748, |
|
"eval_anatomy_runtime": 0.2674, |
|
"eval_anatomy_samples_per_second": 7.479, |
|
"eval_anatomy_steps_per_second": 3.739, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.2953367875647668, |
|
"eval_college_mathematics_loss": 2.0933423042297363, |
|
"eval_college_mathematics_runtime": 0.2663, |
|
"eval_college_mathematics_samples_per_second": 7.51, |
|
"eval_college_mathematics_steps_per_second": 3.755, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.2953367875647668, |
|
"eval_international_law_loss": 3.0646896362304688, |
|
"eval_international_law_runtime": 0.2664, |
|
"eval_international_law_samples_per_second": 7.506, |
|
"eval_international_law_steps_per_second": 3.753, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.3471502590673574, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 3.367875647668394e-06, |
|
"loss": 0.9792, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.3989637305699483, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 3.497409326424871e-06, |
|
"loss": 0.9697, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.450777202072539, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 3.6269430051813476e-06, |
|
"loss": 0.9604, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.5025906735751295, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 3.756476683937824e-06, |
|
"loss": 0.9549, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.5544041450777202, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 3.886010362694301e-06, |
|
"loss": 0.9634, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.5544041450777202, |
|
"eval_main_loss": 0.9588530659675598, |
|
"eval_main_runtime": 50.8939, |
|
"eval_main_samples_per_second": 30.18, |
|
"eval_main_steps_per_second": 3.773, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.5544041450777202, |
|
"eval_anatomy_loss": 2.800969123840332, |
|
"eval_anatomy_runtime": 0.268, |
|
"eval_anatomy_samples_per_second": 7.464, |
|
"eval_anatomy_steps_per_second": 3.732, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.5544041450777202, |
|
"eval_college_mathematics_loss": 2.0425174236297607, |
|
"eval_college_mathematics_runtime": 0.2684, |
|
"eval_college_mathematics_samples_per_second": 7.452, |
|
"eval_college_mathematics_steps_per_second": 3.726, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.5544041450777202, |
|
"eval_international_law_loss": 2.9860198497772217, |
|
"eval_international_law_runtime": 0.2678, |
|
"eval_international_law_samples_per_second": 7.467, |
|
"eval_international_law_steps_per_second": 3.734, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.6062176165803108, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 4.015544041450777e-06, |
|
"loss": 0.9551, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.6580310880829017, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 4.145077720207254e-06, |
|
"loss": 0.9462, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.709844559585492, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 4.274611398963731e-06, |
|
"loss": 0.9398, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.761658031088083, |
|
"grad_norm": 0.490234375, |
|
"learning_rate": 4.404145077720207e-06, |
|
"loss": 0.9289, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.8134715025906736, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 4.533678756476685e-06, |
|
"loss": 0.9209, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.8134715025906736, |
|
"eval_main_loss": 0.9181744456291199, |
|
"eval_main_runtime": 50.9128, |
|
"eval_main_samples_per_second": 30.169, |
|
"eval_main_steps_per_second": 3.771, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.8134715025906736, |
|
"eval_anatomy_loss": 2.6760823726654053, |
|
"eval_anatomy_runtime": 0.2694, |
|
"eval_anatomy_samples_per_second": 7.423, |
|
"eval_anatomy_steps_per_second": 3.711, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.8134715025906736, |
|
"eval_college_mathematics_loss": 1.9530092477798462, |
|
"eval_college_mathematics_runtime": 0.2669, |
|
"eval_college_mathematics_samples_per_second": 7.494, |
|
"eval_college_mathematics_steps_per_second": 3.747, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.8134715025906736, |
|
"eval_international_law_loss": 2.8536064624786377, |
|
"eval_international_law_runtime": 0.268, |
|
"eval_international_law_samples_per_second": 7.462, |
|
"eval_international_law_steps_per_second": 3.731, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.8652849740932642, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 4.663212435233161e-06, |
|
"loss": 0.9132, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.917098445595855, |
|
"grad_norm": 0.45703125, |
|
"learning_rate": 4.7927461139896375e-06, |
|
"loss": 0.8982, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.9689119170984455, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 4.922279792746114e-06, |
|
"loss": 0.898, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 2.0207253886010363, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 4.9999171995395824e-06, |
|
"loss": 0.8827, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 2.0725388601036268, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 4.9989857573474595e-06, |
|
"loss": 0.8667, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.0725388601036268, |
|
"eval_main_loss": 0.8773276209831238, |
|
"eval_main_runtime": 50.6915, |
|
"eval_main_samples_per_second": 30.301, |
|
"eval_main_steps_per_second": 3.788, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.0725388601036268, |
|
"eval_anatomy_loss": 2.5175046920776367, |
|
"eval_anatomy_runtime": 0.2667, |
|
"eval_anatomy_samples_per_second": 7.498, |
|
"eval_anatomy_steps_per_second": 3.749, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.0725388601036268, |
|
"eval_college_mathematics_loss": 1.8391788005828857, |
|
"eval_college_mathematics_runtime": 0.2681, |
|
"eval_college_mathematics_samples_per_second": 7.459, |
|
"eval_college_mathematics_steps_per_second": 3.73, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.0725388601036268, |
|
"eval_international_law_loss": 2.7238972187042236, |
|
"eval_international_law_runtime": 0.2657, |
|
"eval_international_law_samples_per_second": 7.529, |
|
"eval_international_law_steps_per_second": 3.764, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.1243523316062176, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 4.997019759281217e-06, |
|
"loss": 0.8706, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 2.1761658031088085, |
|
"grad_norm": 0.38671875, |
|
"learning_rate": 4.9940200192449906e-06, |
|
"loss": 0.8744, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 2.227979274611399, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 4.989987779102074e-06, |
|
"loss": 0.8583, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 2.2797927461139897, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 4.984924708160789e-06, |
|
"loss": 0.8637, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 2.33160621761658, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 4.978832902483415e-06, |
|
"loss": 0.8592, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.33160621761658, |
|
"eval_main_loss": 0.8600960373878479, |
|
"eval_main_runtime": 50.6644, |
|
"eval_main_samples_per_second": 30.317, |
|
"eval_main_steps_per_second": 3.79, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.33160621761658, |
|
"eval_anatomy_loss": 2.4422662258148193, |
|
"eval_anatomy_runtime": 0.2671, |
|
"eval_anatomy_samples_per_second": 7.488, |
|
"eval_anatomy_steps_per_second": 3.744, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.33160621761658, |
|
"eval_college_mathematics_loss": 1.7872785329818726, |
|
"eval_college_mathematics_runtime": 0.267, |
|
"eval_college_mathematics_samples_per_second": 7.492, |
|
"eval_college_mathematics_steps_per_second": 3.746, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.33160621761658, |
|
"eval_international_law_loss": 2.6731436252593994, |
|
"eval_international_law_runtime": 0.2664, |
|
"eval_international_law_samples_per_second": 7.508, |
|
"eval_international_law_steps_per_second": 3.754, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.383419689119171, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 4.971714884018439e-06, |
|
"loss": 0.8623, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 2.4352331606217614, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 4.96357359955649e-06, |
|
"loss": 0.8496, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 2.4870466321243523, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 4.9544124195104015e-06, |
|
"loss": 0.8595, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 2.538860103626943, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 4.944235136519888e-06, |
|
"loss": 0.8512, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 2.5906735751295336, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 4.933045963881431e-06, |
|
"loss": 0.8522, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.5906735751295336, |
|
"eval_main_loss": 0.8523173928260803, |
|
"eval_main_runtime": 50.9143, |
|
"eval_main_samples_per_second": 30.168, |
|
"eval_main_steps_per_second": 3.771, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.5906735751295336, |
|
"eval_anatomy_loss": 2.420971632003784, |
|
"eval_anatomy_runtime": 0.2681, |
|
"eval_anatomy_samples_per_second": 7.459, |
|
"eval_anatomy_steps_per_second": 3.729, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.5906735751295336, |
|
"eval_college_mathematics_loss": 1.7655620574951172, |
|
"eval_college_mathematics_runtime": 0.2683, |
|
"eval_college_mathematics_samples_per_second": 7.454, |
|
"eval_college_mathematics_steps_per_second": 3.727, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.5906735751295336, |
|
"eval_international_law_loss": 2.6557209491729736, |
|
"eval_international_law_runtime": 0.2675, |
|
"eval_international_law_samples_per_second": 7.475, |
|
"eval_international_law_steps_per_second": 3.738, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.6424870466321244, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 4.920849533804017e-06, |
|
"loss": 0.8525, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 2.694300518134715, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 4.907650895491443e-06, |
|
"loss": 0.8554, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 2.7461139896373057, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 4.893455513052003e-06, |
|
"loss": 0.8489, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 2.7979274611398965, |
|
"grad_norm": 0.25, |
|
"learning_rate": 4.878269263236391e-06, |
|
"loss": 0.844, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 2.849740932642487, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 4.86209843300479e-06, |
|
"loss": 0.8456, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.849740932642487, |
|
"eval_main_loss": 0.8472149968147278, |
|
"eval_main_runtime": 50.8752, |
|
"eval_main_samples_per_second": 30.192, |
|
"eval_main_steps_per_second": 3.774, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.849740932642487, |
|
"eval_anatomy_loss": 2.407443046569824, |
|
"eval_anatomy_runtime": 0.2673, |
|
"eval_anatomy_samples_per_second": 7.481, |
|
"eval_anatomy_steps_per_second": 3.741, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.849740932642487, |
|
"eval_college_mathematics_loss": 1.7581956386566162, |
|
"eval_college_mathematics_runtime": 0.2664, |
|
"eval_college_mathematics_samples_per_second": 7.507, |
|
"eval_college_mathematics_steps_per_second": 3.753, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.849740932642487, |
|
"eval_international_law_loss": 2.643361806869507, |
|
"eval_international_law_runtime": 0.266, |
|
"eval_international_law_samples_per_second": 7.518, |
|
"eval_international_law_steps_per_second": 3.759, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.901554404145078, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 4.8449497169241285e-06, |
|
"loss": 0.847, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 2.9533678756476682, |
|
"grad_norm": 0.248046875, |
|
"learning_rate": 4.826830214396594e-06, |
|
"loss": 0.8444, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 3.005181347150259, |
|
"grad_norm": 0.2490234375, |
|
"learning_rate": 4.807747426720553e-06, |
|
"loss": 0.8454, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 3.05699481865285, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 4.78770925398508e-06, |
|
"loss": 0.8409, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 3.1088082901554404, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 4.766723991799407e-06, |
|
"loss": 0.8392, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 3.1088082901554404, |
|
"eval_main_loss": 0.8436682820320129, |
|
"eval_main_runtime": 50.6014, |
|
"eval_main_samples_per_second": 30.355, |
|
"eval_main_steps_per_second": 3.794, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 3.1088082901554404, |
|
"eval_anatomy_loss": 2.3988683223724365, |
|
"eval_anatomy_runtime": 0.2661, |
|
"eval_anatomy_samples_per_second": 7.517, |
|
"eval_anatomy_steps_per_second": 3.758, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 3.1088082901554404, |
|
"eval_college_mathematics_loss": 1.7500412464141846, |
|
"eval_college_mathematics_runtime": 0.2666, |
|
"eval_college_mathematics_samples_per_second": 7.502, |
|
"eval_college_mathematics_steps_per_second": 3.751, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 3.1088082901554404, |
|
"eval_international_law_loss": 2.639014482498169, |
|
"eval_international_law_runtime": 0.2661, |
|
"eval_international_law_samples_per_second": 7.517, |
|
"eval_international_law_steps_per_second": 3.759, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 3.160621761658031, |
|
"grad_norm": 0.2109375, |
|
"learning_rate": 4.744800327858608e-06, |
|
"loss": 0.8409, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 3.2124352331606216, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 4.721947338346993e-06, |
|
"loss": 0.8413, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 3.2642487046632125, |
|
"grad_norm": 0.2197265625, |
|
"learning_rate": 4.698174484180641e-06, |
|
"loss": 0.8364, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 3.3160621761658033, |
|
"grad_norm": 0.236328125, |
|
"learning_rate": 4.673491607090684e-06, |
|
"loss": 0.8351, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 3.3678756476683938, |
|
"grad_norm": 0.2158203125, |
|
"learning_rate": 4.647908925548918e-06, |
|
"loss": 0.8354, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 3.3678756476683938, |
|
"eval_main_loss": 0.8413074016571045, |
|
"eval_main_runtime": 50.8355, |
|
"eval_main_samples_per_second": 30.215, |
|
"eval_main_steps_per_second": 3.777, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 3.3678756476683938, |
|
"eval_anatomy_loss": 2.3927080631256104, |
|
"eval_anatomy_runtime": 0.2684, |
|
"eval_anatomy_samples_per_second": 7.451, |
|
"eval_anatomy_steps_per_second": 3.725, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 3.3678756476683938, |
|
"eval_college_mathematics_loss": 1.7482649087905884, |
|
"eval_college_mathematics_runtime": 0.2673, |
|
"eval_college_mathematics_samples_per_second": 7.482, |
|
"eval_college_mathematics_steps_per_second": 3.741, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 3.3678756476683938, |
|
"eval_international_law_loss": 2.6322338581085205, |
|
"eval_international_law_runtime": 0.2677, |
|
"eval_international_law_samples_per_second": 7.47, |
|
"eval_international_law_steps_per_second": 3.735, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 3.4196891191709846, |
|
"grad_norm": 0.236328125, |
|
"learning_rate": 4.621437030537461e-06, |
|
"loss": 0.8428, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 3.471502590673575, |
|
"grad_norm": 0.232421875, |
|
"learning_rate": 4.594086881164184e-06, |
|
"loss": 0.8495, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 3.523316062176166, |
|
"grad_norm": 0.244140625, |
|
"learning_rate": 4.565869800125747e-06, |
|
"loss": 0.8445, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 3.5751295336787567, |
|
"grad_norm": 0.2314453125, |
|
"learning_rate": 4.536797469020116e-06, |
|
"loss": 0.8441, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 3.626943005181347, |
|
"grad_norm": 0.220703125, |
|
"learning_rate": 4.506881923510493e-06, |
|
"loss": 0.8388, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 3.626943005181347, |
|
"eval_main_loss": 0.8397356867790222, |
|
"eval_main_runtime": 50.8597, |
|
"eval_main_samples_per_second": 30.201, |
|
"eval_main_steps_per_second": 3.775, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 3.626943005181347, |
|
"eval_anatomy_loss": 2.392620325088501, |
|
"eval_anatomy_runtime": 0.268, |
|
"eval_anatomy_samples_per_second": 7.464, |
|
"eval_anatomy_steps_per_second": 3.732, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 3.626943005181347, |
|
"eval_college_mathematics_loss": 1.746407151222229, |
|
"eval_college_mathematics_runtime": 0.2684, |
|
"eval_college_mathematics_samples_per_second": 7.451, |
|
"eval_college_mathematics_steps_per_second": 3.725, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 3.626943005181347, |
|
"eval_international_law_loss": 2.632657527923584, |
|
"eval_international_law_runtime": 0.2666, |
|
"eval_international_law_samples_per_second": 7.503, |
|
"eval_international_law_steps_per_second": 3.751, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 3.6787564766839376, |
|
"grad_norm": 0.2197265625, |
|
"learning_rate": 4.476135548342666e-06, |
|
"loss": 0.8405, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 3.7305699481865284, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 4.444571072217848e-06, |
|
"loss": 0.8353, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 3.7823834196891193, |
|
"grad_norm": 0.2353515625, |
|
"learning_rate": 4.4122015625231125e-06, |
|
"loss": 0.838, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 3.8341968911917097, |
|
"grad_norm": 0.208984375, |
|
"learning_rate": 4.37904041992163e-06, |
|
"loss": 0.8339, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 3.8860103626943006, |
|
"grad_norm": 0.2099609375, |
|
"learning_rate": 4.345101372804917e-06, |
|
"loss": 0.8352, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 3.8860103626943006, |
|
"eval_main_loss": 0.8387607932090759, |
|
"eval_main_runtime": 50.9241, |
|
"eval_main_samples_per_second": 30.163, |
|
"eval_main_steps_per_second": 3.77, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 3.8860103626943006, |
|
"eval_anatomy_loss": 2.3933463096618652, |
|
"eval_anatomy_runtime": 0.2694, |
|
"eval_anatomy_samples_per_second": 7.424, |
|
"eval_anatomy_steps_per_second": 3.712, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 3.8860103626943006, |
|
"eval_college_mathematics_loss": 1.7452200651168823, |
|
"eval_college_mathematics_runtime": 0.2686, |
|
"eval_college_mathematics_samples_per_second": 7.447, |
|
"eval_college_mathematics_steps_per_second": 3.724, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 3.8860103626943006, |
|
"eval_international_law_loss": 2.631213903427124, |
|
"eval_international_law_runtime": 0.2675, |
|
"eval_international_law_samples_per_second": 7.475, |
|
"eval_international_law_steps_per_second": 3.738, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 3.937823834196891, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 4.310398471609416e-06, |
|
"loss": 0.8379, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 3.989637305699482, |
|
"grad_norm": 0.228515625, |
|
"learning_rate": 4.274946082999753e-06, |
|
"loss": 0.8366, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 4.041450777202073, |
|
"grad_norm": 0.240234375, |
|
"learning_rate": 4.238758883921077e-06, |
|
"loss": 0.8351, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 4.0932642487046635, |
|
"grad_norm": 0.244140625, |
|
"learning_rate": 4.201851855522946e-06, |
|
"loss": 0.8427, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 4.1450777202072535, |
|
"grad_norm": 0.2197265625, |
|
"learning_rate": 4.1642402769572775e-06, |
|
"loss": 0.8375, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 4.1450777202072535, |
|
"eval_main_loss": 0.8381660580635071, |
|
"eval_main_runtime": 50.8823, |
|
"eval_main_samples_per_second": 30.187, |
|
"eval_main_steps_per_second": 3.773, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 4.1450777202072535, |
|
"eval_anatomy_loss": 2.3937363624572754, |
|
"eval_anatomy_runtime": 0.2711, |
|
"eval_anatomy_samples_per_second": 7.376, |
|
"eval_anatomy_steps_per_second": 3.688, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 4.1450777202072535, |
|
"eval_college_mathematics_loss": 1.7459180355072021, |
|
"eval_college_mathematics_runtime": 0.2669, |
|
"eval_college_mathematics_samples_per_second": 7.493, |
|
"eval_college_mathematics_steps_per_second": 3.747, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 4.1450777202072535, |
|
"eval_international_law_loss": 2.6354634761810303, |
|
"eval_international_law_runtime": 0.2663, |
|
"eval_international_law_samples_per_second": 7.509, |
|
"eval_international_law_steps_per_second": 3.755, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 4.196891191709844, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 4.125939719052927e-06, |
|
"loss": 0.8405, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 4.248704663212435, |
|
"grad_norm": 0.193359375, |
|
"learning_rate": 4.086966037869515e-06, |
|
"loss": 0.8346, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 4.300518134715026, |
|
"grad_norm": 0.244140625, |
|
"learning_rate": 4.047335368133176e-06, |
|
"loss": 0.8388, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 4.352331606217617, |
|
"grad_norm": 0.21484375, |
|
"learning_rate": 4.0070641165569335e-06, |
|
"loss": 0.8301, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 4.404145077720207, |
|
"grad_norm": 0.2275390625, |
|
"learning_rate": 3.96616895504848e-06, |
|
"loss": 0.8366, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 4.404145077720207, |
|
"eval_main_loss": 0.8377915024757385, |
|
"eval_main_runtime": 50.9147, |
|
"eval_main_samples_per_second": 30.168, |
|
"eval_main_steps_per_second": 3.771, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 4.404145077720207, |
|
"eval_anatomy_loss": 2.394949197769165, |
|
"eval_anatomy_runtime": 0.2684, |
|
"eval_anatomy_samples_per_second": 7.451, |
|
"eval_anatomy_steps_per_second": 3.726, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 4.404145077720207, |
|
"eval_college_mathematics_loss": 1.7438111305236816, |
|
"eval_college_mathematics_runtime": 0.2676, |
|
"eval_college_mathematics_samples_per_second": 7.475, |
|
"eval_college_mathematics_steps_per_second": 3.737, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 4.404145077720207, |
|
"eval_international_law_loss": 2.6339313983917236, |
|
"eval_international_law_runtime": 0.2665, |
|
"eval_international_law_samples_per_second": 7.506, |
|
"eval_international_law_steps_per_second": 3.753, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 4.455958549222798, |
|
"grad_norm": 0.2197265625, |
|
"learning_rate": 3.924666813808176e-06, |
|
"loss": 0.833, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 4.507772020725389, |
|
"grad_norm": 0.234375, |
|
"learning_rate": 3.882574874320099e-06, |
|
"loss": 0.8381, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 4.5595854922279795, |
|
"grad_norm": 0.2177734375, |
|
"learning_rate": 3.839910562239088e-06, |
|
"loss": 0.8438, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 4.61139896373057, |
|
"grad_norm": 0.2138671875, |
|
"learning_rate": 3.7966915401766845e-06, |
|
"loss": 0.832, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 4.66321243523316, |
|
"grad_norm": 0.2177734375, |
|
"learning_rate": 3.752935700388982e-06, |
|
"loss": 0.843, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 4.66321243523316, |
|
"eval_main_loss": 0.8375835418701172, |
|
"eval_main_runtime": 50.8717, |
|
"eval_main_samples_per_second": 30.194, |
|
"eval_main_steps_per_second": 3.774, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 4.66321243523316, |
|
"eval_anatomy_loss": 2.3957273960113525, |
|
"eval_anatomy_runtime": 0.2674, |
|
"eval_anatomy_samples_per_second": 7.479, |
|
"eval_anatomy_steps_per_second": 3.74, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 4.66321243523316, |
|
"eval_college_mathematics_loss": 1.745951771736145, |
|
"eval_college_mathematics_runtime": 0.2678, |
|
"eval_college_mathematics_samples_per_second": 7.468, |
|
"eval_college_mathematics_steps_per_second": 3.734, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 4.66321243523316, |
|
"eval_international_law_loss": 2.635204315185547, |
|
"eval_international_law_runtime": 0.2682, |
|
"eval_international_law_samples_per_second": 7.458, |
|
"eval_international_law_steps_per_second": 3.729, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 4.715025906735751, |
|
"grad_norm": 0.22265625, |
|
"learning_rate": 3.7086611573694107e-06, |
|
"loss": 0.8254, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 4.766839378238342, |
|
"grad_norm": 0.232421875, |
|
"learning_rate": 3.663886240349507e-06, |
|
"loss": 0.8342, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 4.818652849740933, |
|
"grad_norm": 0.228515625, |
|
"learning_rate": 3.6186294857107933e-06, |
|
"loss": 0.8368, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 4.870466321243523, |
|
"grad_norm": 0.2255859375, |
|
"learning_rate": 3.5729096293108935e-06, |
|
"loss": 0.8406, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 4.922279792746114, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 3.526745598727071e-06, |
|
"loss": 0.8317, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 4.922279792746114, |
|
"eval_main_loss": 0.8374476432800293, |
|
"eval_main_runtime": 50.8861, |
|
"eval_main_samples_per_second": 30.185, |
|
"eval_main_steps_per_second": 3.773, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 4.922279792746114, |
|
"eval_anatomy_loss": 2.3970742225646973, |
|
"eval_anatomy_runtime": 0.268, |
|
"eval_anatomy_samples_per_second": 7.462, |
|
"eval_anatomy_steps_per_second": 3.731, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 4.922279792746114, |
|
"eval_college_mathematics_loss": 1.7456122636795044, |
|
"eval_college_mathematics_runtime": 0.2683, |
|
"eval_college_mathematics_samples_per_second": 7.454, |
|
"eval_college_mathematics_steps_per_second": 3.727, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 4.922279792746114, |
|
"eval_international_law_loss": 2.6364247798919678, |
|
"eval_international_law_runtime": 0.267, |
|
"eval_international_law_samples_per_second": 7.49, |
|
"eval_international_law_steps_per_second": 3.745, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 4.974093264248705, |
|
"grad_norm": 0.232421875, |
|
"learning_rate": 3.4801565054203962e-06, |
|
"loss": 0.834, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 5.025906735751295, |
|
"grad_norm": 0.228515625, |
|
"learning_rate": 3.433161636823782e-06, |
|
"loss": 0.8409, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 5.077720207253886, |
|
"grad_norm": 0.2099609375, |
|
"learning_rate": 3.3857804483571803e-06, |
|
"loss": 0.8324, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 5.129533678756476, |
|
"grad_norm": 0.236328125, |
|
"learning_rate": 3.3380325553732223e-06, |
|
"loss": 0.8433, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 5.181347150259067, |
|
"grad_norm": 0.21484375, |
|
"learning_rate": 3.2899377250366536e-06, |
|
"loss": 0.8307, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 5.181347150259067, |
|
"eval_main_loss": 0.8373920321464539, |
|
"eval_main_runtime": 50.7102, |
|
"eval_main_samples_per_second": 30.29, |
|
"eval_main_steps_per_second": 3.786, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 5.181347150259067, |
|
"eval_anatomy_loss": 2.3975374698638916, |
|
"eval_anatomy_runtime": 0.2673, |
|
"eval_anatomy_samples_per_second": 7.483, |
|
"eval_anatomy_steps_per_second": 3.741, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 5.181347150259067, |
|
"eval_college_mathematics_loss": 1.746949315071106, |
|
"eval_college_mathematics_runtime": 0.2679, |
|
"eval_college_mathematics_samples_per_second": 7.465, |
|
"eval_college_mathematics_steps_per_second": 3.732, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 5.181347150259067, |
|
"eval_international_law_loss": 2.637000799179077, |
|
"eval_international_law_runtime": 0.267, |
|
"eval_international_law_samples_per_second": 7.49, |
|
"eval_international_law_steps_per_second": 3.745, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 5.233160621761658, |
|
"grad_norm": 0.2138671875, |
|
"learning_rate": 3.2415158681409215e-06, |
|
"loss": 0.836, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 5.284974093264249, |
|
"grad_norm": 0.2099609375, |
|
"learning_rate": 3.1927870308652953e-06, |
|
"loss": 0.8447, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 5.33678756476684, |
|
"grad_norm": 0.220703125, |
|
"learning_rate": 3.1437713864759483e-06, |
|
"loss": 0.8383, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 5.38860103626943, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 3.0944892269744155e-06, |
|
"loss": 0.8412, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 5.4404145077720205, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 3.044960954696906e-06, |
|
"loss": 0.837, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 5.4404145077720205, |
|
"eval_main_loss": 0.837336003780365, |
|
"eval_main_runtime": 50.8778, |
|
"eval_main_samples_per_second": 30.19, |
|
"eval_main_steps_per_second": 3.774, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 5.4404145077720205, |
|
"eval_anatomy_loss": 2.39707350730896, |
|
"eval_anatomy_runtime": 0.264, |
|
"eval_anatomy_samples_per_second": 7.575, |
|
"eval_anatomy_steps_per_second": 3.787, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 5.4404145077720205, |
|
"eval_college_mathematics_loss": 1.746044397354126, |
|
"eval_college_mathematics_runtime": 0.2668, |
|
"eval_college_mathematics_samples_per_second": 7.497, |
|
"eval_college_mathematics_steps_per_second": 3.748, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 5.4404145077720205, |
|
"eval_international_law_loss": 2.639299154281616, |
|
"eval_international_law_runtime": 0.2686, |
|
"eval_international_law_samples_per_second": 7.446, |
|
"eval_international_law_steps_per_second": 3.723, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 5.492227979274611, |
|
"grad_norm": 0.244140625, |
|
"learning_rate": 2.9952070738679312e-06, |
|
"loss": 0.8349, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 5.544041450777202, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 2.9452481821117544e-06, |
|
"loss": 0.8261, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 5.595854922279793, |
|
"grad_norm": 0.193359375, |
|
"learning_rate": 2.895104961925179e-06, |
|
"loss": 0.8305, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 5.647668393782383, |
|
"grad_norm": 0.2177734375, |
|
"learning_rate": 2.844798172115185e-06, |
|
"loss": 0.8316, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 5.699481865284974, |
|
"grad_norm": 0.220703125, |
|
"learning_rate": 2.7943486392049972e-06, |
|
"loss": 0.8334, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 5.699481865284974, |
|
"eval_main_loss": 0.8372721076011658, |
|
"eval_main_runtime": 50.8569, |
|
"eval_main_samples_per_second": 30.202, |
|
"eval_main_steps_per_second": 3.775, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 5.699481865284974, |
|
"eval_anatomy_loss": 2.3966822624206543, |
|
"eval_anatomy_runtime": 0.2669, |
|
"eval_anatomy_samples_per_second": 7.494, |
|
"eval_anatomy_steps_per_second": 3.747, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 5.699481865284974, |
|
"eval_college_mathematics_loss": 1.7497589588165283, |
|
"eval_college_mathematics_runtime": 0.2678, |
|
"eval_college_mathematics_samples_per_second": 7.468, |
|
"eval_college_mathematics_steps_per_second": 3.734, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 5.699481865284974, |
|
"eval_international_law_loss": 2.634140968322754, |
|
"eval_international_law_runtime": 0.267, |
|
"eval_international_law_samples_per_second": 7.492, |
|
"eval_international_law_steps_per_second": 3.746, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 5.751295336787565, |
|
"grad_norm": 0.212890625, |
|
"learning_rate": 2.7437772488120945e-06, |
|
"loss": 0.8305, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 5.803108808290156, |
|
"grad_norm": 0.2109375, |
|
"learning_rate": 2.6931049370017755e-06, |
|
"loss": 0.8383, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 5.8549222797927465, |
|
"grad_norm": 0.2333984375, |
|
"learning_rate": 2.6423526816198253e-06, |
|
"loss": 0.8445, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 5.9067357512953365, |
|
"grad_norm": 0.2236328125, |
|
"learning_rate": 2.5915414936078933e-06, |
|
"loss": 0.8385, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 5.958549222797927, |
|
"grad_norm": 0.2294921875, |
|
"learning_rate": 2.5406924083051683e-06, |
|
"loss": 0.8359, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 5.958549222797927, |
|
"eval_main_loss": 0.8372599482536316, |
|
"eval_main_runtime": 50.7835, |
|
"eval_main_samples_per_second": 30.246, |
|
"eval_main_steps_per_second": 3.781, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 5.958549222797927, |
|
"eval_anatomy_loss": 2.3953776359558105, |
|
"eval_anatomy_runtime": 0.2668, |
|
"eval_anatomy_samples_per_second": 7.496, |
|
"eval_anatomy_steps_per_second": 3.748, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 5.958549222797927, |
|
"eval_college_mathematics_loss": 1.7478266954421997, |
|
"eval_college_mathematics_runtime": 0.2698, |
|
"eval_college_mathematics_samples_per_second": 7.413, |
|
"eval_college_mathematics_steps_per_second": 3.707, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 5.958549222797927, |
|
"eval_international_law_loss": 2.6391143798828125, |
|
"eval_international_law_runtime": 0.2663, |
|
"eval_international_law_samples_per_second": 7.509, |
|
"eval_international_law_steps_per_second": 3.755, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 6.010362694300518, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 2.4898264767399445e-06, |
|
"loss": 0.8316, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 6.062176165803109, |
|
"grad_norm": 0.2255859375, |
|
"learning_rate": 2.438964756914712e-06, |
|
"loss": 0.8412, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 6.1139896373057, |
|
"grad_norm": 0.2236328125, |
|
"learning_rate": 2.3881283050883368e-06, |
|
"loss": 0.8368, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 6.16580310880829, |
|
"grad_norm": 0.234375, |
|
"learning_rate": 2.337338167058981e-06, |
|
"loss": 0.8392, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 6.217616580310881, |
|
"grad_norm": 0.212890625, |
|
"learning_rate": 2.286615369451342e-06, |
|
"loss": 0.834, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 6.217616580310881, |
|
"eval_main_loss": 0.8373088836669922, |
|
"eval_main_runtime": 50.8482, |
|
"eval_main_samples_per_second": 30.208, |
|
"eval_main_steps_per_second": 3.776, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 6.217616580310881, |
|
"eval_anatomy_loss": 2.3978271484375, |
|
"eval_anatomy_runtime": 0.2671, |
|
"eval_anatomy_samples_per_second": 7.487, |
|
"eval_anatomy_steps_per_second": 3.744, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 6.217616580310881, |
|
"eval_college_mathematics_loss": 1.7477797269821167, |
|
"eval_college_mathematics_runtime": 0.2685, |
|
"eval_college_mathematics_samples_per_second": 7.449, |
|
"eval_college_mathematics_steps_per_second": 3.724, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 6.217616580310881, |
|
"eval_international_law_loss": 2.638612985610962, |
|
"eval_international_law_runtime": 0.2668, |
|
"eval_international_law_samples_per_second": 7.495, |
|
"eval_international_law_steps_per_second": 3.748, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 6.269430051813472, |
|
"grad_norm": 0.2275390625, |
|
"learning_rate": 2.2359809110118358e-06, |
|
"loss": 0.8391, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 6.321243523316062, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 2.1854557539153203e-06, |
|
"loss": 0.8368, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 6.373056994818652, |
|
"grad_norm": 0.2294921875, |
|
"learning_rate": 2.1350608150869563e-06, |
|
"loss": 0.8321, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 6.424870466321243, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 2.0848169575428057e-06, |
|
"loss": 0.8375, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 6.476683937823834, |
|
"grad_norm": 0.232421875, |
|
"learning_rate": 2.034744981752741e-06, |
|
"loss": 0.835, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 6.476683937823834, |
|
"eval_main_loss": 0.8372688889503479, |
|
"eval_main_runtime": 50.7481, |
|
"eval_main_samples_per_second": 30.267, |
|
"eval_main_steps_per_second": 3.783, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 6.476683937823834, |
|
"eval_anatomy_loss": 2.397125244140625, |
|
"eval_anatomy_runtime": 0.2662, |
|
"eval_anatomy_samples_per_second": 7.513, |
|
"eval_anatomy_steps_per_second": 3.757, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 6.476683937823834, |
|
"eval_college_mathematics_loss": 1.7471617460250854, |
|
"eval_college_mathematics_runtime": 0.2671, |
|
"eval_college_mathematics_samples_per_second": 7.487, |
|
"eval_college_mathematics_steps_per_second": 3.744, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 6.476683937823834, |
|
"eval_international_law_loss": 2.6356821060180664, |
|
"eval_international_law_runtime": 0.2665, |
|
"eval_international_law_samples_per_second": 7.504, |
|
"eval_international_law_steps_per_second": 3.752, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 6.528497409326425, |
|
"grad_norm": 0.2490234375, |
|
"learning_rate": 1.9848656170292556e-06, |
|
"loss": 0.8291, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 6.580310880829016, |
|
"grad_norm": 0.2314453125, |
|
"learning_rate": 1.9351995129457305e-06, |
|
"loss": 0.8325, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 6.632124352331607, |
|
"grad_norm": 0.2294921875, |
|
"learning_rate": 1.88576723078771e-06, |
|
"loss": 0.8345, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 6.683937823834197, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 1.8365892350407238e-06, |
|
"loss": 0.8387, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 6.7357512953367875, |
|
"grad_norm": 0.212890625, |
|
"learning_rate": 1.7876858849181982e-06, |
|
"loss": 0.8311, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 6.7357512953367875, |
|
"eval_main_loss": 0.8372130393981934, |
|
"eval_main_runtime": 50.6463, |
|
"eval_main_samples_per_second": 30.328, |
|
"eval_main_steps_per_second": 3.791, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 6.7357512953367875, |
|
"eval_anatomy_loss": 2.393521308898926, |
|
"eval_anatomy_runtime": 0.267, |
|
"eval_anatomy_samples_per_second": 7.49, |
|
"eval_anatomy_steps_per_second": 3.745, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 6.7357512953367875, |
|
"eval_college_mathematics_loss": 1.748924970626831, |
|
"eval_college_mathematics_runtime": 0.2664, |
|
"eval_college_mathematics_samples_per_second": 7.509, |
|
"eval_college_mathematics_steps_per_second": 3.754, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 6.7357512953367875, |
|
"eval_international_law_loss": 2.637521505355835, |
|
"eval_international_law_runtime": 0.2667, |
|
"eval_international_law_samples_per_second": 7.499, |
|
"eval_international_law_steps_per_second": 3.749, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 6.787564766839378, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 1.73907742593293e-06, |
|
"loss": 0.8395, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 6.839378238341969, |
|
"grad_norm": 0.220703125, |
|
"learning_rate": 1.690783981515648e-06, |
|
"loss": 0.8399, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 6.891191709844559, |
|
"grad_norm": 0.2470703125, |
|
"learning_rate": 1.642825544684101e-06, |
|
"loss": 0.8302, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 6.94300518134715, |
|
"grad_norm": 0.201171875, |
|
"learning_rate": 1.5952219697661455e-06, |
|
"loss": 0.8338, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 6.994818652849741, |
|
"grad_norm": 0.224609375, |
|
"learning_rate": 1.5479929641802492e-06, |
|
"loss": 0.837, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 6.994818652849741, |
|
"eval_main_loss": 0.8372467160224915, |
|
"eval_main_runtime": 50.6632, |
|
"eval_main_samples_per_second": 30.318, |
|
"eval_main_steps_per_second": 3.79, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 6.994818652849741, |
|
"eval_anatomy_loss": 2.399848222732544, |
|
"eval_anatomy_runtime": 0.2663, |
|
"eval_anatomy_samples_per_second": 7.512, |
|
"eval_anatomy_steps_per_second": 3.756, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 6.994818652849741, |
|
"eval_college_mathematics_loss": 1.7486381530761719, |
|
"eval_college_mathematics_runtime": 0.268, |
|
"eval_college_mathematics_samples_per_second": 7.464, |
|
"eval_college_mathematics_steps_per_second": 3.732, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 6.994818652849741, |
|
"eval_international_law_loss": 2.637385845184326, |
|
"eval_international_law_runtime": 0.2665, |
|
"eval_international_law_samples_per_second": 7.504, |
|
"eval_international_law_steps_per_second": 3.752, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 7.046632124352332, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 1.5011580802768048e-06, |
|
"loss": 0.8392, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 7.098445595854923, |
|
"grad_norm": 0.2333984375, |
|
"learning_rate": 1.4547367072436519e-06, |
|
"loss": 0.8326, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 7.150259067357513, |
|
"grad_norm": 0.2275390625, |
|
"learning_rate": 1.4087480630791405e-06, |
|
"loss": 0.8324, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 7.2020725388601035, |
|
"grad_norm": 0.2333984375, |
|
"learning_rate": 1.3632111866360585e-06, |
|
"loss": 0.8309, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 7.253886010362694, |
|
"grad_norm": 0.2265625, |
|
"learning_rate": 1.318144929739743e-06, |
|
"loss": 0.8292, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 7.253886010362694, |
|
"eval_main_loss": 0.8372478485107422, |
|
"eval_main_runtime": 50.677, |
|
"eval_main_samples_per_second": 30.31, |
|
"eval_main_steps_per_second": 3.789, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 7.253886010362694, |
|
"eval_anatomy_loss": 2.3977112770080566, |
|
"eval_anatomy_runtime": 0.2672, |
|
"eval_anatomy_samples_per_second": 7.484, |
|
"eval_anatomy_steps_per_second": 3.742, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 7.253886010362694, |
|
"eval_college_mathematics_loss": 1.745300531387329, |
|
"eval_college_mathematics_runtime": 0.2659, |
|
"eval_college_mathematics_samples_per_second": 7.523, |
|
"eval_college_mathematics_steps_per_second": 3.761, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 7.253886010362694, |
|
"eval_international_law_loss": 2.6362087726593018, |
|
"eval_international_law_runtime": 0.2666, |
|
"eval_international_law_samples_per_second": 7.502, |
|
"eval_international_law_steps_per_second": 3.751, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 7.305699481865285, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 1.273567949383601e-06, |
|
"loss": 0.8384, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 7.357512953367876, |
|
"grad_norm": 0.21875, |
|
"learning_rate": 1.229498700005295e-06, |
|
"loss": 0.8375, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 7.409326424870466, |
|
"grad_norm": 0.2314453125, |
|
"learning_rate": 1.1859554258467843e-06, |
|
"loss": 0.8416, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 7.461139896373057, |
|
"grad_norm": 0.2265625, |
|
"learning_rate": 1.1429561534013869e-06, |
|
"loss": 0.8367, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 7.512953367875648, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 1.1005186839509887e-06, |
|
"loss": 0.8372, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 7.512953367875648, |
|
"eval_main_loss": 0.8372209668159485, |
|
"eval_main_runtime": 50.6545, |
|
"eval_main_samples_per_second": 30.323, |
|
"eval_main_steps_per_second": 3.79, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 7.512953367875648, |
|
"eval_anatomy_loss": 2.3963139057159424, |
|
"eval_anatomy_runtime": 0.2679, |
|
"eval_anatomy_samples_per_second": 7.464, |
|
"eval_anatomy_steps_per_second": 3.732, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 7.512953367875648, |
|
"eval_college_mathematics_loss": 1.7459056377410889, |
|
"eval_college_mathematics_runtime": 0.2655, |
|
"eval_college_mathematics_samples_per_second": 7.534, |
|
"eval_college_mathematics_steps_per_second": 3.767, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 7.512953367875648, |
|
"eval_international_law_loss": 2.6337826251983643, |
|
"eval_international_law_runtime": 0.2662, |
|
"eval_international_law_samples_per_second": 7.512, |
|
"eval_international_law_steps_per_second": 3.756, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 7.564766839378239, |
|
"grad_norm": 0.2138671875, |
|
"learning_rate": 1.0586605861964804e-06, |
|
"loss": 0.8313, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 7.616580310880829, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 1.01739918898449e-06, |
|
"loss": 0.8346, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 7.668393782383419, |
|
"grad_norm": 0.21875, |
|
"learning_rate": 9.767515741334039e-07, |
|
"loss": 0.8372, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 7.72020725388601, |
|
"grad_norm": 0.2177734375, |
|
"learning_rate": 9.367345693616625e-07, |
|
"loss": 0.8343, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 7.772020725388601, |
|
"grad_norm": 0.24609375, |
|
"learning_rate": 8.973647413212494e-07, |
|
"loss": 0.8441, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 7.772020725388601, |
|
"eval_main_loss": 0.8372817039489746, |
|
"eval_main_runtime": 50.6245, |
|
"eval_main_samples_per_second": 30.341, |
|
"eval_main_steps_per_second": 3.793, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 7.772020725388601, |
|
"eval_anatomy_loss": 2.3954248428344727, |
|
"eval_anatomy_runtime": 0.2651, |
|
"eval_anatomy_samples_per_second": 7.545, |
|
"eval_anatomy_steps_per_second": 3.773, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 7.772020725388601, |
|
"eval_college_mathematics_loss": 1.744511365890503, |
|
"eval_college_mathematics_runtime": 0.2656, |
|
"eval_college_mathematics_samples_per_second": 7.53, |
|
"eval_college_mathematics_steps_per_second": 3.765, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 7.772020725388601, |
|
"eval_international_law_loss": 2.635951280593872, |
|
"eval_international_law_runtime": 0.2679, |
|
"eval_international_law_samples_per_second": 7.464, |
|
"eval_international_law_steps_per_second": 3.732, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 7.823834196891192, |
|
"grad_norm": 0.22265625, |
|
"learning_rate": 8.586583887392546e-07, |
|
"loss": 0.8383, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 7.875647668393782, |
|
"grad_norm": 0.2177734375, |
|
"learning_rate": 8.206315356703634e-07, |
|
"loss": 0.8312, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 7.927461139896373, |
|
"grad_norm": 0.2265625, |
|
"learning_rate": 7.832999248630479e-07, |
|
"loss": 0.8401, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 7.979274611398964, |
|
"grad_norm": 0.2490234375, |
|
"learning_rate": 7.466790112422257e-07, |
|
"loss": 0.8283, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 8.031088082901555, |
|
"grad_norm": 0.2216796875, |
|
"learning_rate": 7.107839555110707e-07, |
|
"loss": 0.8378, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 8.031088082901555, |
|
"eval_main_loss": 0.8372604846954346, |
|
"eval_main_runtime": 50.6329, |
|
"eval_main_samples_per_second": 30.336, |
|
"eval_main_steps_per_second": 3.792, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 8.031088082901555, |
|
"eval_anatomy_loss": 2.397829532623291, |
|
"eval_anatomy_runtime": 0.2664, |
|
"eval_anatomy_samples_per_second": 7.509, |
|
"eval_anatomy_steps_per_second": 3.754, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 8.031088082901555, |
|
"eval_college_mathematics_loss": 1.745050311088562, |
|
"eval_college_mathematics_runtime": 0.2662, |
|
"eval_college_mathematics_samples_per_second": 7.512, |
|
"eval_college_mathematics_steps_per_second": 3.756, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 8.031088082901555, |
|
"eval_international_law_loss": 2.635841131210327, |
|
"eval_international_law_runtime": 0.2664, |
|
"eval_international_law_samples_per_second": 7.508, |
|
"eval_international_law_steps_per_second": 3.754, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 8.082901554404145, |
|
"grad_norm": 0.20703125, |
|
"learning_rate": 6.756296178746282e-07, |
|
"loss": 0.8382, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 8.134715025906736, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 6.412305518878343e-07, |
|
"loss": 0.8411, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 8.186528497409327, |
|
"grad_norm": 0.2294921875, |
|
"learning_rate": 6.076009984304837e-07, |
|
"loss": 0.8411, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 8.238341968911918, |
|
"grad_norm": 0.2216796875, |
|
"learning_rate": 5.747548798116451e-07, |
|
"loss": 0.8384, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 8.290155440414507, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 5.427057940059607e-07, |
|
"loss": 0.8304, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 8.290155440414507, |
|
"eval_main_loss": 0.8372419476509094, |
|
"eval_main_runtime": 50.616, |
|
"eval_main_samples_per_second": 30.346, |
|
"eval_main_steps_per_second": 3.793, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 8.290155440414507, |
|
"eval_anatomy_loss": 2.398320436477661, |
|
"eval_anatomy_runtime": 0.2666, |
|
"eval_anatomy_samples_per_second": 7.501, |
|
"eval_anatomy_steps_per_second": 3.751, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 8.290155440414507, |
|
"eval_college_mathematics_loss": 1.7492754459381104, |
|
"eval_college_mathematics_runtime": 0.2666, |
|
"eval_college_mathematics_samples_per_second": 7.501, |
|
"eval_college_mathematics_steps_per_second": 3.75, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 8.290155440414507, |
|
"eval_international_law_loss": 2.6363821029663086, |
|
"eval_international_law_runtime": 0.266, |
|
"eval_international_law_samples_per_second": 7.519, |
|
"eval_international_law_steps_per_second": 3.76, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 8.341968911917098, |
|
"grad_norm": 0.2255859375, |
|
"learning_rate": 5.11467009024216e-07, |
|
"loss": 0.8361, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 8.393782383419689, |
|
"grad_norm": 0.2294921875, |
|
"learning_rate": 4.810514574205125e-07, |
|
"loss": 0.8339, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 8.44559585492228, |
|
"grad_norm": 0.2294921875, |
|
"learning_rate": 4.5147173093831264e-07, |
|
"loss": 0.8345, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 8.49740932642487, |
|
"grad_norm": 0.2265625, |
|
"learning_rate": 4.227400752975835e-07, |
|
"loss": 0.8374, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 8.549222797927461, |
|
"grad_norm": 0.2294921875, |
|
"learning_rate": 3.9486838512518777e-07, |
|
"loss": 0.8374, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 8.549222797927461, |
|
"eval_main_loss": 0.8372399806976318, |
|
"eval_main_runtime": 50.5872, |
|
"eval_main_samples_per_second": 30.363, |
|
"eval_main_steps_per_second": 3.795, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 8.549222797927461, |
|
"eval_anatomy_loss": 2.3960189819335938, |
|
"eval_anatomy_runtime": 0.2667, |
|
"eval_anatomy_samples_per_second": 7.498, |
|
"eval_anatomy_steps_per_second": 3.749, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 8.549222797927461, |
|
"eval_college_mathematics_loss": 1.7492492198944092, |
|
"eval_college_mathematics_runtime": 0.2676, |
|
"eval_college_mathematics_samples_per_second": 7.475, |
|
"eval_college_mathematics_steps_per_second": 3.738, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 8.549222797927461, |
|
"eval_international_law_loss": 2.6380083560943604, |
|
"eval_international_law_runtime": 0.2658, |
|
"eval_international_law_samples_per_second": 7.525, |
|
"eval_international_law_steps_per_second": 3.762, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 8.601036269430052, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 3.678681990306207e-07, |
|
"loss": 0.8359, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 8.652849740932643, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 3.4175069482914105e-07, |
|
"loss": 0.8284, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 8.704663212435234, |
|
"grad_norm": 0.2138671875, |
|
"learning_rate": 3.165266849142581e-07, |
|
"loss": 0.8334, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 8.756476683937823, |
|
"grad_norm": 0.232421875, |
|
"learning_rate": 2.9220661178151366e-07, |
|
"loss": 0.8337, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 8.808290155440414, |
|
"grad_norm": 0.20703125, |
|
"learning_rate": 2.688005437053845e-07, |
|
"loss": 0.8382, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 8.808290155440414, |
|
"eval_main_loss": 0.8372331261634827, |
|
"eval_main_runtime": 50.6285, |
|
"eval_main_samples_per_second": 30.339, |
|
"eval_main_steps_per_second": 3.792, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 8.808290155440414, |
|
"eval_anatomy_loss": 2.39411997795105, |
|
"eval_anatomy_runtime": 0.2664, |
|
"eval_anatomy_samples_per_second": 7.507, |
|
"eval_anatomy_steps_per_second": 3.753, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 8.808290155440414, |
|
"eval_college_mathematics_loss": 1.7451952695846558, |
|
"eval_college_mathematics_runtime": 0.2673, |
|
"eval_college_mathematics_samples_per_second": 7.482, |
|
"eval_college_mathematics_steps_per_second": 3.741, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 8.808290155440414, |
|
"eval_international_law_loss": 2.6384801864624023, |
|
"eval_international_law_runtime": 0.2666, |
|
"eval_international_law_samples_per_second": 7.502, |
|
"eval_international_law_steps_per_second": 3.751, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 8.860103626943005, |
|
"grad_norm": 0.21875, |
|
"learning_rate": 2.4631817057111597e-07, |
|
"loss": 0.8363, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 8.911917098445596, |
|
"grad_norm": 0.2255859375, |
|
"learning_rate": 2.247687998632031e-07, |
|
"loss": 0.8389, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 8.963730569948186, |
|
"grad_norm": 0.21875, |
|
"learning_rate": 2.0416135281218218e-07, |
|
"loss": 0.8336, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 9.015544041450777, |
|
"grad_norm": 0.236328125, |
|
"learning_rate": 1.8450436070132889e-07, |
|
"loss": 0.8333, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 9.067357512953368, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 1.6580596133478926e-07, |
|
"loss": 0.8373, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 9.067357512953368, |
|
"eval_main_loss": 0.837254524230957, |
|
"eval_main_runtime": 50.6558, |
|
"eval_main_samples_per_second": 30.322, |
|
"eval_main_steps_per_second": 3.79, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 9.067357512953368, |
|
"eval_anatomy_loss": 2.395867347717285, |
|
"eval_anatomy_runtime": 0.2665, |
|
"eval_anatomy_samples_per_second": 7.506, |
|
"eval_anatomy_steps_per_second": 3.753, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 9.067357512953368, |
|
"eval_college_mathematics_loss": 1.7460345029830933, |
|
"eval_college_mathematics_runtime": 0.267, |
|
"eval_college_mathematics_samples_per_second": 7.491, |
|
"eval_college_mathematics_steps_per_second": 3.746, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 9.067357512953368, |
|
"eval_international_law_loss": 2.6364336013793945, |
|
"eval_international_law_runtime": 0.2659, |
|
"eval_international_law_samples_per_second": 7.522, |
|
"eval_international_law_steps_per_second": 3.761, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 9.119170984455959, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 1.4807389566860675e-07, |
|
"loss": 0.838, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 9.17098445595855, |
|
"grad_norm": 0.2158203125, |
|
"learning_rate": 1.3131550460604242e-07, |
|
"loss": 0.8266, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 9.22279792746114, |
|
"grad_norm": 0.224609375, |
|
"learning_rate": 1.1553772595851109e-07, |
|
"loss": 0.8412, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 9.27461139896373, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 1.0074709157339657e-07, |
|
"loss": 0.8361, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 9.32642487046632, |
|
"grad_norm": 0.2197265625, |
|
"learning_rate": 8.694972462992918e-08, |
|
"loss": 0.8373, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 9.32642487046632, |
|
"eval_main_loss": 0.8372175693511963, |
|
"eval_main_runtime": 50.62, |
|
"eval_main_samples_per_second": 30.344, |
|
"eval_main_steps_per_second": 3.793, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 9.32642487046632, |
|
"eval_anatomy_loss": 2.394951105117798, |
|
"eval_anatomy_runtime": 0.2668, |
|
"eval_anatomy_samples_per_second": 7.496, |
|
"eval_anatomy_steps_per_second": 3.748, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 9.32642487046632, |
|
"eval_college_mathematics_loss": 1.7458065748214722, |
|
"eval_college_mathematics_runtime": 0.2669, |
|
"eval_college_mathematics_samples_per_second": 7.494, |
|
"eval_college_mathematics_steps_per_second": 3.747, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 9.32642487046632, |
|
"eval_international_law_loss": 2.639335870742798, |
|
"eval_international_law_runtime": 0.2665, |
|
"eval_international_law_samples_per_second": 7.505, |
|
"eval_international_law_steps_per_second": 3.752, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 9.378238341968911, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 7.415133710424794e-08, |
|
"loss": 0.8335, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 9.430051813471502, |
|
"grad_norm": 0.248046875, |
|
"learning_rate": 6.235722740469936e-08, |
|
"loss": 0.8379, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 9.481865284974093, |
|
"grad_norm": 0.23046875, |
|
"learning_rate": 5.157227817834648e-08, |
|
"loss": 0.831, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 9.533678756476684, |
|
"grad_norm": 0.2138671875, |
|
"learning_rate": 4.180095428960168e-08, |
|
"loss": 0.8399, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 9.585492227979275, |
|
"grad_norm": 0.2216796875, |
|
"learning_rate": 3.304730097181463e-08, |
|
"loss": 0.8428, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 9.585492227979275, |
|
"eval_main_loss": 0.8372709155082703, |
|
"eval_main_runtime": 50.6434, |
|
"eval_main_samples_per_second": 30.33, |
|
"eval_main_steps_per_second": 3.791, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 9.585492227979275, |
|
"eval_anatomy_loss": 2.3959925174713135, |
|
"eval_anatomy_runtime": 0.2686, |
|
"eval_anatomy_samples_per_second": 7.447, |
|
"eval_anatomy_steps_per_second": 3.723, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 9.585492227979275, |
|
"eval_college_mathematics_loss": 1.7489873170852661, |
|
"eval_college_mathematics_runtime": 0.2656, |
|
"eval_college_mathematics_samples_per_second": 7.529, |
|
"eval_college_mathematics_steps_per_second": 3.765, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 9.585492227979275, |
|
"eval_international_law_loss": 2.6358718872070312, |
|
"eval_international_law_runtime": 0.2658, |
|
"eval_international_law_samples_per_second": 7.525, |
|
"eval_international_law_steps_per_second": 3.763, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 9.637305699481866, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 2.5314942152586954e-08, |
|
"loss": 0.8366, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 9.689119170984457, |
|
"grad_norm": 0.208984375, |
|
"learning_rate": 1.8607078953498392e-08, |
|
"loss": 0.8386, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 9.740932642487046, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 1.292648836487609e-08, |
|
"loss": 0.8321, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 9.792746113989637, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 8.275522096146404e-09, |
|
"loss": 0.8342, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 9.844559585492227, |
|
"grad_norm": 0.2275390625, |
|
"learning_rate": 4.656105602250382e-09, |
|
"loss": 0.8404, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 9.844559585492227, |
|
"eval_main_loss": 0.8372488021850586, |
|
"eval_main_runtime": 50.6539, |
|
"eval_main_samples_per_second": 30.323, |
|
"eval_main_steps_per_second": 3.79, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 9.844559585492227, |
|
"eval_anatomy_loss": 2.3990156650543213, |
|
"eval_anatomy_runtime": 0.2664, |
|
"eval_anatomy_samples_per_second": 7.508, |
|
"eval_anatomy_steps_per_second": 3.754, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 9.844559585492227, |
|
"eval_college_mathematics_loss": 1.7470096349716187, |
|
"eval_college_mathematics_runtime": 0.2654, |
|
"eval_college_mathematics_samples_per_second": 7.535, |
|
"eval_college_mathematics_steps_per_second": 3.767, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 9.844559585492227, |
|
"eval_international_law_loss": 2.6360206604003906, |
|
"eval_international_law_runtime": 0.2672, |
|
"eval_international_law_samples_per_second": 7.485, |
|
"eval_international_law_steps_per_second": 3.743, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 9.896373056994818, |
|
"grad_norm": 0.2138671875, |
|
"learning_rate": 2.0697372865235986e-09, |
|
"loss": 0.8301, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 9.94818652849741, |
|
"grad_norm": 0.2265625, |
|
"learning_rate": 5.174878803720917e-10, |
|
"loss": 0.8299, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.201171875, |
|
"learning_rate": 0.0, |
|
"loss": 0.832, |
|
"step": 3860 |
|
} |
|
], |
|
"logging_steps": 20, |
|
"max_steps": 3860, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.8346399786239263e+18, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|