{
  "best_metric": 1.5968632698059082,
  "best_model_checkpoint": "miner_id_24/checkpoint-130",
  "epoch": 0.04518178609248147,
  "eval_steps": 10,
  "global_step": 160,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0002823861630780092,
      "grad_norm": 1.2056736946105957,
      "learning_rate": 2e-05,
      "loss": 1.9521,
      "step": 1
    },
    {
      "epoch": 0.0002823861630780092,
      "eval_loss": 2.352551221847534,
      "eval_runtime": 133.636,
      "eval_samples_per_second": 5.582,
      "eval_steps_per_second": 5.582,
      "step": 1
    },
    {
      "epoch": 0.0005647723261560184,
      "grad_norm": 4.347576141357422,
      "learning_rate": 4e-05,
      "loss": 3.1192,
      "step": 2
    },
    {
      "epoch": 0.0008471584892340275,
      "grad_norm": 0.7937899231910706,
      "learning_rate": 6e-05,
      "loss": 1.1903,
      "step": 3
    },
    {
      "epoch": 0.0011295446523120368,
      "grad_norm": 3.9262712001800537,
      "learning_rate": 8e-05,
      "loss": 3.5143,
      "step": 4
    },
    {
      "epoch": 0.0014119308153900459,
      "grad_norm": 2.7941317558288574,
      "learning_rate": 0.0001,
      "loss": 2.9205,
      "step": 5
    },
    {
      "epoch": 0.001694316978468055,
      "grad_norm": 0.9424725770950317,
      "learning_rate": 0.00012,
      "loss": 2.582,
      "step": 6
    },
    {
      "epoch": 0.001976703141546064,
      "grad_norm": 1.8026996850967407,
      "learning_rate": 0.00014,
      "loss": 2.0555,
      "step": 7
    },
    {
      "epoch": 0.0022590893046240735,
      "grad_norm": 1.0503661632537842,
      "learning_rate": 0.00016,
      "loss": 2.6065,
      "step": 8
    },
    {
      "epoch": 0.0025414754677020824,
      "grad_norm": 2.3215556144714355,
      "learning_rate": 0.00018,
      "loss": 2.6192,
      "step": 9
    },
    {
      "epoch": 0.0028238616307800918,
      "grad_norm": 1.60389244556427,
      "learning_rate": 0.0002,
      "loss": 2.1819,
      "step": 10
    },
    {
      "epoch": 0.0028238616307800918,
      "eval_loss": 2.1240005493164062,
      "eval_runtime": 133.7157,
      "eval_samples_per_second": 5.579,
      "eval_steps_per_second": 5.579,
      "step": 10
    },
    {
      "epoch": 0.003106247793858101,
      "grad_norm": 1.0237597227096558,
      "learning_rate": 0.0001999979446958366,
      "loss": 1.8844,
      "step": 11
    },
    {
      "epoch": 0.00338863395693611,
      "grad_norm": 4.726717472076416,
      "learning_rate": 0.00019999177886783194,
      "loss": 3.1172,
      "step": 12
    },
    {
      "epoch": 0.0036710201200141194,
      "grad_norm": 1.6403062343597412,
      "learning_rate": 0.00019998150276943902,
      "loss": 1.7007,
      "step": 13
    },
    {
      "epoch": 0.003953406283092128,
      "grad_norm": 2.1164722442626953,
      "learning_rate": 0.000199967116823068,
      "loss": 2.5321,
      "step": 14
    },
    {
      "epoch": 0.004235792446170138,
      "grad_norm": 3.5340867042541504,
      "learning_rate": 0.0001999486216200688,
      "loss": 2.3608,
      "step": 15
    },
    {
      "epoch": 0.004518178609248147,
      "grad_norm": 5.55496072769165,
      "learning_rate": 0.00019992601792070679,
      "loss": 1.6141,
      "step": 16
    },
    {
      "epoch": 0.004800564772326156,
      "grad_norm": 7.0048136711120605,
      "learning_rate": 0.00019989930665413147,
      "loss": 2.2728,
      "step": 17
    },
    {
      "epoch": 0.005082950935404165,
      "grad_norm": 1.7807990312576294,
      "learning_rate": 0.00019986848891833845,
      "loss": 1.6759,
      "step": 18
    },
    {
      "epoch": 0.005365337098482174,
      "grad_norm": 3.1774418354034424,
      "learning_rate": 0.0001998335659801241,
      "loss": 0.8362,
      "step": 19
    },
    {
      "epoch": 0.0056477232615601836,
      "grad_norm": 5.73573637008667,
      "learning_rate": 0.00019979453927503364,
      "loss": 2.7934,
      "step": 20
    },
    {
      "epoch": 0.0056477232615601836,
      "eval_loss": 1.7092158794403076,
      "eval_runtime": 133.3036,
      "eval_samples_per_second": 5.596,
      "eval_steps_per_second": 5.596,
      "step": 20
    },
    {
      "epoch": 0.005930109424638193,
      "grad_norm": 1.705460548400879,
      "learning_rate": 0.00019975141040730207,
      "loss": 1.3571,
      "step": 21
    },
    {
      "epoch": 0.006212495587716202,
      "grad_norm": 1.3597909212112427,
      "learning_rate": 0.0001997041811497882,
      "loss": 1.491,
      "step": 22
    },
    {
      "epoch": 0.006494881750794211,
      "grad_norm": 1.3038731813430786,
      "learning_rate": 0.00019965285344390184,
      "loss": 2.4389,
      "step": 23
    },
    {
      "epoch": 0.00677726791387222,
      "grad_norm": 2.3923144340515137,
      "learning_rate": 0.00019959742939952392,
      "loss": 1.3591,
      "step": 24
    },
    {
      "epoch": 0.0070596540769502295,
      "grad_norm": 2.964477300643921,
      "learning_rate": 0.00019953791129491983,
      "loss": 1.1022,
      "step": 25
    },
    {
      "epoch": 0.007342040240028239,
      "grad_norm": 3.184072732925415,
      "learning_rate": 0.00019947430157664576,
      "loss": 1.4837,
      "step": 26
    },
    {
      "epoch": 0.007624426403106248,
      "grad_norm": 1.576446294784546,
      "learning_rate": 0.00019940660285944803,
      "loss": 2.0992,
      "step": 27
    },
    {
      "epoch": 0.007906812566184257,
      "grad_norm": 2.878796100616455,
      "learning_rate": 0.00019933481792615583,
      "loss": 2.4712,
      "step": 28
    },
    {
      "epoch": 0.008189198729262267,
      "grad_norm": 2.5952467918395996,
      "learning_rate": 0.0001992589497275665,
      "loss": 1.6412,
      "step": 29
    },
    {
      "epoch": 0.008471584892340275,
      "grad_norm": 2.8451197147369385,
      "learning_rate": 0.0001991790013823246,
      "loss": 1.6372,
      "step": 30
    },
    {
      "epoch": 0.008471584892340275,
      "eval_loss": 1.6834224462509155,
      "eval_runtime": 133.2875,
      "eval_samples_per_second": 5.597,
      "eval_steps_per_second": 5.597,
      "step": 30
    },
    {
      "epoch": 0.008753971055418284,
      "grad_norm": 2.855771780014038,
      "learning_rate": 0.00019909497617679348,
      "loss": 2.608,
      "step": 31
    },
    {
      "epoch": 0.009036357218496294,
      "grad_norm": 2.0999650955200195,
      "learning_rate": 0.0001990068775649202,
      "loss": 1.6182,
      "step": 32
    },
    {
      "epoch": 0.009318743381574303,
      "grad_norm": 5.745433330535889,
      "learning_rate": 0.00019891470916809362,
      "loss": 1.6578,
      "step": 33
    },
    {
      "epoch": 0.009601129544652313,
      "grad_norm": 1.7802093029022217,
      "learning_rate": 0.00019881847477499557,
      "loss": 1.8047,
      "step": 34
    },
    {
      "epoch": 0.009883515707730321,
      "grad_norm": 3.094785451889038,
      "learning_rate": 0.00019871817834144504,
      "loss": 1.2005,
      "step": 35
    },
    {
      "epoch": 0.01016590187080833,
      "grad_norm": 2.3366200923919678,
      "learning_rate": 0.0001986138239902355,
      "loss": 2.0731,
      "step": 36
    },
    {
      "epoch": 0.01044828803388634,
      "grad_norm": 3.872102737426758,
      "learning_rate": 0.0001985054160109657,
      "loss": 1.301,
      "step": 37
    },
    {
      "epoch": 0.010730674196964348,
      "grad_norm": 1.3710724115371704,
      "learning_rate": 0.00019839295885986296,
      "loss": 1.4742,
      "step": 38
    },
    {
      "epoch": 0.011013060360042359,
      "grad_norm": 2.481275796890259,
      "learning_rate": 0.0001982764571596004,
      "loss": 1.5866,
      "step": 39
    },
    {
      "epoch": 0.011295446523120367,
      "grad_norm": 1.8309324979782104,
      "learning_rate": 0.00019815591569910654,
      "loss": 2.644,
      "step": 40
    },
    {
      "epoch": 0.011295446523120367,
      "eval_loss": 1.6503487825393677,
      "eval_runtime": 133.397,
      "eval_samples_per_second": 5.592,
      "eval_steps_per_second": 5.592,
      "step": 40
    },
    {
      "epoch": 0.011577832686198376,
      "grad_norm": 1.9744967222213745,
      "learning_rate": 0.00019803133943336874,
      "loss": 2.3377,
      "step": 41
    },
    {
      "epoch": 0.011860218849276386,
      "grad_norm": 1.438886046409607,
      "learning_rate": 0.0001979027334832293,
      "loss": 1.4721,
      "step": 42
    },
    {
      "epoch": 0.012142605012354394,
      "grad_norm": 3.23305082321167,
      "learning_rate": 0.00019777010313517518,
      "loss": 1.7882,
      "step": 43
    },
    {
      "epoch": 0.012424991175432405,
      "grad_norm": 5.8521013259887695,
      "learning_rate": 0.00019763345384112043,
      "loss": 1.2744,
      "step": 44
    },
    {
      "epoch": 0.012707377338510413,
      "grad_norm": 5.29448127746582,
      "learning_rate": 0.00019749279121818235,
      "loss": 1.0265,
      "step": 45
    },
    {
      "epoch": 0.012989763501588421,
      "grad_norm": 3.448343515396118,
      "learning_rate": 0.00019734812104845047,
      "loss": 2.1143,
      "step": 46
    },
    {
      "epoch": 0.013272149664666432,
      "grad_norm": 2.209937810897827,
      "learning_rate": 0.00019719944927874881,
      "loss": 1.325,
      "step": 47
    },
    {
      "epoch": 0.01355453582774444,
      "grad_norm": 1.200800895690918,
      "learning_rate": 0.0001970467820203915,
      "loss": 1.8635,
      "step": 48
    },
    {
      "epoch": 0.01383692199082245,
      "grad_norm": 1.2242612838745117,
      "learning_rate": 0.00019689012554893154,
      "loss": 1.7063,
      "step": 49
    },
    {
      "epoch": 0.014119308153900459,
      "grad_norm": 1.140673279762268,
      "learning_rate": 0.00019672948630390294,
      "loss": 1.315,
      "step": 50
    },
    {
      "epoch": 0.014119308153900459,
      "eval_loss": 1.6434087753295898,
      "eval_runtime": 133.392,
      "eval_samples_per_second": 5.593,
      "eval_steps_per_second": 5.593,
      "step": 50
    },
    {
      "epoch": 0.014401694316978467,
      "grad_norm": 2.5457074642181396,
      "learning_rate": 0.00019656487088855592,
      "loss": 1.5371,
      "step": 51
    },
    {
      "epoch": 0.014684080480056478,
      "grad_norm": 2.4005753993988037,
      "learning_rate": 0.00019639628606958533,
      "loss": 1.0716,
      "step": 52
    },
    {
      "epoch": 0.014966466643134486,
      "grad_norm": 2.4281392097473145,
      "learning_rate": 0.0001962237387768529,
      "loss": 2.1807,
      "step": 53
    },
    {
      "epoch": 0.015248852806212496,
      "grad_norm": 2.5013136863708496,
      "learning_rate": 0.00019604723610310194,
      "loss": 1.0288,
      "step": 54
    },
    {
      "epoch": 0.015531238969290505,
      "grad_norm": 2.0805702209472656,
      "learning_rate": 0.00019586678530366606,
      "loss": 1.1582,
      "step": 55
    },
    {
      "epoch": 0.015813625132368513,
      "grad_norm": 7.539531707763672,
      "learning_rate": 0.00019568239379617088,
      "loss": 1.7791,
      "step": 56
    },
    {
      "epoch": 0.016096011295446522,
      "grad_norm": 1.5069524049758911,
      "learning_rate": 0.00019549406916022905,
      "loss": 1.6449,
      "step": 57
    },
    {
      "epoch": 0.016378397458524534,
      "grad_norm": 1.1627360582351685,
      "learning_rate": 0.00019530181913712872,
      "loss": 1.2881,
      "step": 58
    },
    {
      "epoch": 0.016660783621602542,
      "grad_norm": 5.9471282958984375,
      "learning_rate": 0.00019510565162951537,
      "loss": 1.6948,
      "step": 59
    },
    {
      "epoch": 0.01694316978468055,
      "grad_norm": 1.4962432384490967,
      "learning_rate": 0.00019490557470106686,
      "loss": 1.3254,
      "step": 60
    },
    {
      "epoch": 0.01694316978468055,
      "eval_loss": 1.633779525756836,
      "eval_runtime": 133.2937,
      "eval_samples_per_second": 5.597,
      "eval_steps_per_second": 5.597,
      "step": 60
    },
    {
      "epoch": 0.01722555594775856,
      "grad_norm": 1.3225600719451904,
      "learning_rate": 0.00019470159657616215,
      "loss": 1.7811,
      "step": 61
    },
    {
      "epoch": 0.017507942110836568,
      "grad_norm": 1.632053017616272,
      "learning_rate": 0.00019449372563954293,
      "loss": 1.8656,
      "step": 62
    },
    {
      "epoch": 0.01779032827391458,
      "grad_norm": 1.7111440896987915,
      "learning_rate": 0.0001942819704359693,
      "loss": 0.8098,
      "step": 63
    },
    {
      "epoch": 0.018072714436992588,
      "grad_norm": 1.5115035772323608,
      "learning_rate": 0.00019406633966986828,
      "loss": 1.0582,
      "step": 64
    },
    {
      "epoch": 0.018355100600070597,
      "grad_norm": 2.999513626098633,
      "learning_rate": 0.00019384684220497605,
      "loss": 2.1569,
      "step": 65
    },
    {
      "epoch": 0.018637486763148605,
      "grad_norm": 1.0796102285385132,
      "learning_rate": 0.00019362348706397373,
      "loss": 1.4516,
      "step": 66
    },
    {
      "epoch": 0.018919872926226614,
      "grad_norm": 2.9733681678771973,
      "learning_rate": 0.00019339628342811632,
      "loss": 1.6857,
      "step": 67
    },
    {
      "epoch": 0.019202259089304625,
      "grad_norm": 2.086916923522949,
      "learning_rate": 0.0001931652406368554,
      "loss": 0.7689,
      "step": 68
    },
    {
      "epoch": 0.019484645252382634,
      "grad_norm": 1.511246919631958,
      "learning_rate": 0.0001929303681874552,
      "loss": 1.5469,
      "step": 69
    },
    {
      "epoch": 0.019767031415460642,
      "grad_norm": 2.029017925262451,
      "learning_rate": 0.0001926916757346022,
      "loss": 1.3705,
      "step": 70
    },
    {
      "epoch": 0.019767031415460642,
      "eval_loss": 1.6125953197479248,
      "eval_runtime": 133.426,
      "eval_samples_per_second": 5.591,
      "eval_steps_per_second": 5.591,
      "step": 70
    },
    {
      "epoch": 0.02004941757853865,
      "grad_norm": 2.52913761138916,
      "learning_rate": 0.00019244917309000817,
      "loss": 1.722,
      "step": 71
    },
    {
      "epoch": 0.02033180374161666,
      "grad_norm": 7.770167350769043,
      "learning_rate": 0.00019220287022200707,
      "loss": 2.7036,
      "step": 72
    },
    {
      "epoch": 0.02061418990469467,
      "grad_norm": 1.7608412504196167,
      "learning_rate": 0.0001919527772551451,
      "loss": 1.2768,
      "step": 73
    },
    {
      "epoch": 0.02089657606777268,
      "grad_norm": 2.3405442237854004,
      "learning_rate": 0.00019169890446976454,
      "loss": 2.2241,
      "step": 74
    },
    {
      "epoch": 0.02117896223085069,
      "grad_norm": 2.386042356491089,
      "learning_rate": 0.00019144126230158127,
      "loss": 1.3922,
      "step": 75
    },
    {
      "epoch": 0.021461348393928697,
      "grad_norm": 2.280710458755493,
      "learning_rate": 0.0001911798613412557,
      "loss": 1.608,
      "step": 76
    },
    {
      "epoch": 0.021743734557006705,
      "grad_norm": 1.2972298860549927,
      "learning_rate": 0.0001909147123339575,
      "loss": 2.1776,
      "step": 77
    },
    {
      "epoch": 0.022026120720084717,
      "grad_norm": 1.4631404876708984,
      "learning_rate": 0.0001906458261789238,
      "loss": 3.1008,
      "step": 78
    },
    {
      "epoch": 0.022308506883162726,
      "grad_norm": 1.0595492124557495,
      "learning_rate": 0.00019037321392901136,
      "loss": 1.3511,
      "step": 79
    },
    {
      "epoch": 0.022590893046240734,
      "grad_norm": 0.9610152244567871,
      "learning_rate": 0.0001900968867902419,
      "loss": 2.1504,
      "step": 80
    },
    {
      "epoch": 0.022590893046240734,
      "eval_loss": 1.602448582649231,
      "eval_runtime": 133.5988,
      "eval_samples_per_second": 5.584,
      "eval_steps_per_second": 5.584,
      "step": 80
    },
    {
      "epoch": 0.022873279209318743,
      "grad_norm": 1.699086308479309,
      "learning_rate": 0.0001898168561213419,
      "loss": 1.3892,
      "step": 81
    },
    {
      "epoch": 0.02315566537239675,
      "grad_norm": 1.2091821432113647,
      "learning_rate": 0.0001895331334332753,
      "loss": 1.5162,
      "step": 82
    },
    {
      "epoch": 0.023438051535474763,
      "grad_norm": 1.6631978750228882,
      "learning_rate": 0.0001892457303887706,
      "loss": 0.8076,
      "step": 83
    },
    {
      "epoch": 0.02372043769855277,
      "grad_norm": 1.577644944190979,
      "learning_rate": 0.0001889546588018412,
      "loss": 1.4393,
      "step": 84
    },
    {
      "epoch": 0.02400282386163078,
      "grad_norm": 1.207412838935852,
      "learning_rate": 0.00018865993063730004,
      "loss": 2.1015,
      "step": 85
    },
    {
      "epoch": 0.02428521002470879,
      "grad_norm": 2.7810745239257812,
      "learning_rate": 0.00018836155801026753,
      "loss": 1.4547,
      "step": 86
    },
    {
      "epoch": 0.024567596187786797,
      "grad_norm": 2.054161787033081,
      "learning_rate": 0.0001880595531856738,
      "loss": 1.3083,
      "step": 87
    },
    {
      "epoch": 0.02484998235086481,
      "grad_norm": 3.753908634185791,
      "learning_rate": 0.00018775392857775432,
      "loss": 2.8305,
      "step": 88
    },
    {
      "epoch": 0.025132368513942818,
      "grad_norm": 4.611723899841309,
      "learning_rate": 0.00018744469674953956,
      "loss": 1.7302,
      "step": 89
    },
    {
      "epoch": 0.025414754677020826,
      "grad_norm": 2.27549409866333,
      "learning_rate": 0.00018713187041233896,
      "loss": 1.8115,
      "step": 90
    },
    {
      "epoch": 0.025414754677020826,
      "eval_loss": 1.6072229146957397,
      "eval_runtime": 133.6033,
      "eval_samples_per_second": 5.584,
      "eval_steps_per_second": 5.584,
      "step": 90
    },
    {
      "epoch": 0.025697140840098835,
      "grad_norm": 1.4146398305892944,
      "learning_rate": 0.00018681546242521786,
      "loss": 1.1993,
      "step": 91
    },
    {
      "epoch": 0.025979527003176843,
      "grad_norm": 2.2005670070648193,
      "learning_rate": 0.00018649548579446936,
      "loss": 1.9517,
      "step": 92
    },
    {
      "epoch": 0.026261913166254855,
      "grad_norm": 1.241758108139038,
      "learning_rate": 0.0001861719536730795,
      "loss": 2.0892,
      "step": 93
    },
    {
      "epoch": 0.026544299329332863,
      "grad_norm": 1.4617339372634888,
      "learning_rate": 0.00018584487936018661,
      "loss": 2.2815,
      "step": 94
    },
    {
      "epoch": 0.026826685492410872,
      "grad_norm": 1.677581548690796,
      "learning_rate": 0.00018551427630053463,
      "loss": 1.75,
      "step": 95
    },
    {
      "epoch": 0.02710907165548888,
      "grad_norm": 3.2750422954559326,
      "learning_rate": 0.00018518015808392045,
      "loss": 1.8473,
      "step": 96
    },
    {
      "epoch": 0.02739145781856689,
      "grad_norm": 1.7410293817520142,
      "learning_rate": 0.00018484253844463526,
      "loss": 1.2498,
      "step": 97
    },
    {
      "epoch": 0.0276738439816449,
      "grad_norm": 1.0431251525878906,
      "learning_rate": 0.00018450143126090015,
      "loss": 2.3196,
      "step": 98
    },
    {
      "epoch": 0.02795623014472291,
      "grad_norm": 2.758586883544922,
      "learning_rate": 0.00018415685055429533,
      "loss": 1.9701,
      "step": 99
    },
    {
      "epoch": 0.028238616307800918,
      "grad_norm": 1.7685903310775757,
      "learning_rate": 0.00018380881048918405,
      "loss": 1.4758,
      "step": 100
    },
    {
      "epoch": 0.028238616307800918,
      "eval_loss": 1.6011497974395752,
      "eval_runtime": 133.8006,
      "eval_samples_per_second": 5.575,
      "eval_steps_per_second": 5.575,
      "step": 100
    },
    {
      "epoch": 0.028521002470878926,
      "grad_norm": 1.1203055381774902,
      "learning_rate": 0.00018345732537213027,
      "loss": 1.7475,
      "step": 101
    },
    {
      "epoch": 0.028803388633956935,
      "grad_norm": 1.274515986442566,
      "learning_rate": 0.00018310240965131041,
      "loss": 2.6023,
      "step": 102
    },
    {
      "epoch": 0.029085774797034947,
      "grad_norm": 2.5792765617370605,
      "learning_rate": 0.00018274407791591966,
      "loss": 1.1908,
      "step": 103
    },
    {
      "epoch": 0.029368160960112955,
      "grad_norm": 1.466035008430481,
      "learning_rate": 0.00018238234489557215,
      "loss": 0.7359,
      "step": 104
    },
    {
      "epoch": 0.029650547123190964,
      "grad_norm": 3.4681172370910645,
      "learning_rate": 0.0001820172254596956,
      "loss": 1.8144,
      "step": 105
    },
    {
      "epoch": 0.029932933286268972,
      "grad_norm": 4.0510993003845215,
      "learning_rate": 0.00018164873461691986,
      "loss": 0.7832,
      "step": 106
    },
    {
      "epoch": 0.03021531944934698,
      "grad_norm": 5.226031303405762,
      "learning_rate": 0.00018127688751446027,
      "loss": 1.7575,
      "step": 107
    },
    {
      "epoch": 0.030497705612424993,
      "grad_norm": 1.0487242937088013,
      "learning_rate": 0.00018090169943749476,
      "loss": 2.077,
      "step": 108
    },
    {
      "epoch": 0.030780091775503,
      "grad_norm": 1.5338118076324463,
      "learning_rate": 0.0001805231858085356,
      "loss": 1.5191,
      "step": 109
    },
    {
      "epoch": 0.03106247793858101,
      "grad_norm": 1.2566704750061035,
      "learning_rate": 0.00018014136218679567,
      "loss": 1.756,
      "step": 110
    },
    {
      "epoch": 0.03106247793858101,
      "eval_loss": 1.6045676469802856,
      "eval_runtime": 133.6566,
      "eval_samples_per_second": 5.581,
      "eval_steps_per_second": 5.581,
      "step": 110
    },
    {
      "epoch": 0.03134486410165902,
      "grad_norm": 0.7646064758300781,
      "learning_rate": 0.00017975624426754848,
      "loss": 1.6671,
      "step": 111
    },
    {
      "epoch": 0.03162725026473703,
      "grad_norm": 2.133544445037842,
      "learning_rate": 0.00017936784788148328,
      "loss": 1.5909,
      "step": 112
    },
    {
      "epoch": 0.03190963642781504,
      "grad_norm": 2.059943199157715,
      "learning_rate": 0.00017897618899405423,
      "loss": 1.7489,
      "step": 113
    },
    {
      "epoch": 0.032192022590893044,
      "grad_norm": 0.8779903650283813,
      "learning_rate": 0.00017858128370482426,
      "loss": 1.4871,
      "step": 114
    },
    {
      "epoch": 0.032474408753971055,
      "grad_norm": 1.5168753862380981,
      "learning_rate": 0.000178183148246803,
      "loss": 1.8045,
      "step": 115
    },
    {
      "epoch": 0.03275679491704907,
      "grad_norm": 1.1241475343704224,
      "learning_rate": 0.00017778179898577973,
      "loss": 1.64,
      "step": 116
    },
    {
      "epoch": 0.03303918108012707,
      "grad_norm": 9.078608512878418,
      "learning_rate": 0.00017737725241965069,
      "loss": 2.7736,
      "step": 117
    },
    {
      "epoch": 0.033321567243205084,
      "grad_norm": 3.2590787410736084,
      "learning_rate": 0.00017696952517774062,
      "loss": 2.5064,
      "step": 118
    },
    {
      "epoch": 0.03360395340628309,
      "grad_norm": 2.293269395828247,
      "learning_rate": 0.00017655863402011947,
      "loss": 2.146,
      "step": 119
    },
    {
      "epoch": 0.0338863395693611,
      "grad_norm": 1.5803933143615723,
      "learning_rate": 0.00017614459583691346,
      "loss": 1.4979,
      "step": 120
    },
    {
      "epoch": 0.0338863395693611,
      "eval_loss": 1.603722095489502,
      "eval_runtime": 133.2987,
      "eval_samples_per_second": 5.596,
      "eval_steps_per_second": 5.596,
      "step": 120
    },
    {
      "epoch": 0.03416872573243911,
      "grad_norm": 1.4283939599990845,
      "learning_rate": 0.00017572742764761055,
      "loss": 1.4789,
      "step": 121
    },
    {
      "epoch": 0.03445111189551712,
      "grad_norm": 1.3361456394195557,
      "learning_rate": 0.00017530714660036112,
      "loss": 0.784,
      "step": 122
    },
    {
      "epoch": 0.03473349805859513,
      "grad_norm": 1.0861424207687378,
      "learning_rate": 0.00017488376997127283,
      "loss": 2.2809,
      "step": 123
    },
    {
      "epoch": 0.035015884221673135,
      "grad_norm": 4.459283351898193,
      "learning_rate": 0.0001744573151637007,
      "loss": 1.4483,
      "step": 124
    },
    {
      "epoch": 0.03529827038475115,
      "grad_norm": 1.324436902999878,
      "learning_rate": 0.00017402779970753155,
      "loss": 2.6136,
      "step": 125
    },
    {
      "epoch": 0.03558065654782916,
      "grad_norm": 3.7964041233062744,
      "learning_rate": 0.0001735952412584635,
      "loss": 1.092,
      "step": 126
    },
    {
      "epoch": 0.035863042710907164,
      "grad_norm": 2.560436725616455,
      "learning_rate": 0.00017315965759728014,
      "loss": 1.4307,
      "step": 127
    },
    {
      "epoch": 0.036145428873985176,
      "grad_norm": 1.473990797996521,
      "learning_rate": 0.00017272106662911973,
      "loss": 1.1344,
      "step": 128
    },
    {
      "epoch": 0.03642781503706318,
      "grad_norm": 3.3736298084259033,
      "learning_rate": 0.00017227948638273916,
      "loss": 1.5746,
      "step": 129
    },
    {
      "epoch": 0.03671020120014119,
      "grad_norm": 1.5858126878738403,
      "learning_rate": 0.00017183493500977278,
      "loss": 1.3798,
      "step": 130
    },
    {
      "epoch": 0.03671020120014119,
      "eval_loss": 1.5968632698059082,
      "eval_runtime": 133.2364,
      "eval_samples_per_second": 5.599,
      "eval_steps_per_second": 5.599,
      "step": 130
    },
    {
      "epoch": 0.036992587363219205,
      "grad_norm": 0.9281368851661682,
      "learning_rate": 0.0001713874307839863,
      "loss": 1.8154,
      "step": 131
    },
    {
      "epoch": 0.03727497352629721,
      "grad_norm": 2.6511611938476562,
      "learning_rate": 0.0001709369921005258,
      "loss": 1.4,
      "step": 132
    },
    {
      "epoch": 0.03755735968937522,
      "grad_norm": 1.9646121263504028,
      "learning_rate": 0.00017048363747516117,
      "loss": 1.4126,
      "step": 133
    },
    {
      "epoch": 0.03783974585245323,
      "grad_norm": 1.7208032608032227,
      "learning_rate": 0.00017002738554352552,
      "loss": 0.5568,
      "step": 134
    },
    {
      "epoch": 0.03812213201553124,
      "grad_norm": 2.9022722244262695,
      "learning_rate": 0.00016956825506034867,
      "loss": 1.6914,
      "step": 135
    },
    {
      "epoch": 0.03840451817860925,
      "grad_norm": 1.368131160736084,
      "learning_rate": 0.00016910626489868649,
      "loss": 1.5647,
      "step": 136
    },
    {
      "epoch": 0.038686904341687256,
      "grad_norm": 1.5058932304382324,
      "learning_rate": 0.00016864143404914504,
      "loss": 2.4011,
      "step": 137
    },
    {
      "epoch": 0.03896929050476527,
      "grad_norm": 2.3039586544036865,
      "learning_rate": 0.00016817378161909996,
      "loss": 0.9973,
      "step": 138
    },
    {
      "epoch": 0.03925167666784327,
      "grad_norm": 1.9210929870605469,
      "learning_rate": 0.00016770332683191096,
      "loss": 1.7679,
      "step": 139
    },
    {
      "epoch": 0.039534062830921285,
      "grad_norm": 1.3414863348007202,
      "learning_rate": 0.0001672300890261317,
      "loss": 1.4788,
      "step": 140
    },
    {
      "epoch": 0.039534062830921285,
      "eval_loss": 1.6231486797332764,
      "eval_runtime": 133.2243,
      "eval_samples_per_second": 5.6,
      "eval_steps_per_second": 5.6,
      "step": 140
    },
    {
      "epoch": 0.0398164489939993,
      "grad_norm": 1.8461261987686157,
      "learning_rate": 0.0001667540876547148,
      "loss": 2.1281,
      "step": 141
    },
    {
      "epoch": 0.0400988351570773,
      "grad_norm": 3.0568318367004395,
      "learning_rate": 0.0001662753422842123,
      "loss": 1.476,
      "step": 142
    },
    {
      "epoch": 0.040381221320155314,
      "grad_norm": 1.1168292760849,
      "learning_rate": 0.00016579387259397127,
      "loss": 0.9893,
      "step": 143
    },
    {
      "epoch": 0.04066360748323332,
      "grad_norm": 2.078538417816162,
      "learning_rate": 0.00016530969837532487,
      "loss": 0.8452,
      "step": 144
    },
    {
      "epoch": 0.04094599364631133,
      "grad_norm": 2.543635845184326,
      "learning_rate": 0.00016482283953077887,
      "loss": 1.5892,
      "step": 145
    },
    {
      "epoch": 0.04122837980938934,
      "grad_norm": 1.8149226903915405,
      "learning_rate": 0.00016433331607319343,
      "loss": 1.4509,
      "step": 146
    },
    {
      "epoch": 0.04151076597246735,
      "grad_norm": 1.4729820489883423,
      "learning_rate": 0.00016384114812496056,
      "loss": 1.5081,
      "step": 147
    },
    {
      "epoch": 0.04179315213554536,
      "grad_norm": 2.2723262310028076,
      "learning_rate": 0.00016334635591717703,
      "loss": 2.19,
      "step": 148
    },
    {
      "epoch": 0.042075538298623365,
      "grad_norm": 1.748171091079712,
      "learning_rate": 0.00016284895978881236,
      "loss": 2.2346,
      "step": 149
    },
    {
      "epoch": 0.04235792446170138,
      "grad_norm": 1.4556044340133667,
      "learning_rate": 0.00016234898018587337,
      "loss": 1.1614,
      "step": 150
    },
    {
      "epoch": 0.04235792446170138,
      "eval_loss": 1.6199051141738892,
      "eval_runtime": 133.3692,
      "eval_samples_per_second": 5.593,
      "eval_steps_per_second": 5.593,
      "step": 150
    },
    {
      "epoch": 0.04264031062477939,
      "grad_norm": 1.4214565753936768,
      "learning_rate": 0.00016184643766056317,
      "loss": 1.9267,
      "step": 151
    },
    {
      "epoch": 0.042922696787857394,
      "grad_norm": 1.2661665678024292,
      "learning_rate": 0.00016134135287043669,
      "loss": 1.8779,
      "step": 152
    },
    {
      "epoch": 0.043205082950935406,
      "grad_norm": 2.253584384918213,
      "learning_rate": 0.00016083374657755134,
      "loss": 1.2,
      "step": 153
    },
    {
      "epoch": 0.04348746911401341,
      "grad_norm": 2.3451638221740723,
      "learning_rate": 0.00016032363964761363,
      "loss": 2.1057,
      "step": 154
    },
    {
      "epoch": 0.04376985527709142,
      "grad_norm": 2.77101731300354,
      "learning_rate": 0.00015981105304912162,
      "loss": 1.8791,
      "step": 155
    },
    {
      "epoch": 0.044052241440169435,
      "grad_norm": 1.678722620010376,
      "learning_rate": 0.00015929600785250257,
      "loss": 2.4479,
      "step": 156
    },
    {
      "epoch": 0.04433462760324744,
      "grad_norm": 1.2198508977890015,
      "learning_rate": 0.00015877852522924732,
      "loss": 2.3366,
      "step": 157
    },
    {
      "epoch": 0.04461701376632545,
      "grad_norm": 5.628009796142578,
      "learning_rate": 0.0001582586264510396,
      "loss": 1.3177,
      "step": 158
    },
    {
      "epoch": 0.04489939992940346,
      "grad_norm": 2.065458297729492,
      "learning_rate": 0.00015773633288888197,
      "loss": 2.0971,
      "step": 159
    },
    {
      "epoch": 0.04518178609248147,
      "grad_norm": 0.9564564824104309,
      "learning_rate": 0.00015721166601221698,
      "loss": 1.3886,
      "step": 160
    },
    {
      "epoch": 0.04518178609248147,
      "eval_loss": 1.6064260005950928,
      "eval_runtime": 133.4242,
      "eval_samples_per_second": 5.591,
      "eval_steps_per_second": 5.591,
      "step": 160
    }
  ],
  "logging_steps": 1,
  "max_steps": 500,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 10,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 3,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 3
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.566085736300544e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}