|
{ |
|
"best_metric": 0.7551020408163265, |
|
"best_model_checkpoint": "distilhubert-finetuned-not-a-word2/run-0/checkpoint-240", |
|
"epoch": 8.0, |
|
"eval_steps": 500, |
|
"global_step": 384, |
|
"is_hyper_param_search": true, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.2701194286346436, |
|
"learning_rate": 1.7573936619349767e-06, |
|
"loss": 0.7024, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 1.218135952949524, |
|
"learning_rate": 3.5147873238699533e-06, |
|
"loss": 0.6932, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 1.5910921096801758, |
|
"learning_rate": 5.2721809858049295e-06, |
|
"loss": 0.6916, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 1.0592715740203857, |
|
"learning_rate": 7.029574647739907e-06, |
|
"loss": 0.6854, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 1.132177472114563, |
|
"learning_rate": 8.786968309674883e-06, |
|
"loss": 0.6719, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.9851714372634888, |
|
"learning_rate": 1.0544361971609859e-05, |
|
"loss": 0.6633, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 1.5592440366744995, |
|
"learning_rate": 1.2301755633544835e-05, |
|
"loss": 0.6288, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 1.8793619871139526, |
|
"learning_rate": 1.4059149295479813e-05, |
|
"loss": 0.6561, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 1.987308382987976, |
|
"learning_rate": 1.581654295741479e-05, |
|
"loss": 0.6343, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_f1": 0.7326732673267327, |
|
"eval_loss": 0.6868629455566406, |
|
"eval_runtime": 1.3443, |
|
"eval_samples_per_second": 47.609, |
|
"eval_steps_per_second": 5.951, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 3.1652989387512207, |
|
"learning_rate": 1.6792872769600888e-05, |
|
"loss": 0.6712, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 0.9990371465682983, |
|
"learning_rate": 1.6597606807163667e-05, |
|
"loss": 0.6657, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 1.8210965394973755, |
|
"learning_rate": 1.640234084472645e-05, |
|
"loss": 0.476, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 0.5364782810211182, |
|
"learning_rate": 1.6207074882289228e-05, |
|
"loss": 0.5768, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 1.6772537231445312, |
|
"learning_rate": 1.6011808919852007e-05, |
|
"loss": 0.5039, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 1.0453933477401733, |
|
"learning_rate": 1.581654295741479e-05, |
|
"loss": 0.7182, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 1.621067762374878, |
|
"learning_rate": 1.562127699497757e-05, |
|
"loss": 0.6178, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 1.7878597974777222, |
|
"learning_rate": 1.542601103254035e-05, |
|
"loss": 0.6192, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 1.6824157238006592, |
|
"learning_rate": 1.5230745070103131e-05, |
|
"loss": 0.6109, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 0.7989784479141235, |
|
"learning_rate": 1.5035479107665912e-05, |
|
"loss": 0.6367, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_f1": 0.7326732673267327, |
|
"eval_loss": 0.7258987426757812, |
|
"eval_runtime": 1.3947, |
|
"eval_samples_per_second": 45.888, |
|
"eval_steps_per_second": 5.736, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 0.6805263757705688, |
|
"learning_rate": 1.484021314522869e-05, |
|
"loss": 0.6271, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 2.19, |
|
"grad_norm": 0.6469630002975464, |
|
"learning_rate": 1.4644947182791471e-05, |
|
"loss": 0.5085, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"grad_norm": 0.9272030591964722, |
|
"learning_rate": 1.4449681220354252e-05, |
|
"loss": 0.6377, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 0.7515610456466675, |
|
"learning_rate": 1.4254415257917033e-05, |
|
"loss": 0.655, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 2.1927835941314697, |
|
"learning_rate": 1.4059149295479813e-05, |
|
"loss": 0.5797, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 1.0289098024368286, |
|
"learning_rate": 1.3863883333042594e-05, |
|
"loss": 0.5291, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"grad_norm": 1.7173566818237305, |
|
"learning_rate": 1.3668617370605374e-05, |
|
"loss": 0.6829, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"grad_norm": 0.8574482202529907, |
|
"learning_rate": 1.3473351408168155e-05, |
|
"loss": 0.4509, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 1.1971725225448608, |
|
"learning_rate": 1.3278085445730936e-05, |
|
"loss": 0.4369, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_f1": 0.7326732673267327, |
|
"eval_loss": 0.7368831634521484, |
|
"eval_runtime": 1.4074, |
|
"eval_samples_per_second": 45.475, |
|
"eval_steps_per_second": 5.684, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 3.02, |
|
"grad_norm": 2.6530747413635254, |
|
"learning_rate": 1.3082819483293713e-05, |
|
"loss": 0.6167, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"grad_norm": 2.0653493404388428, |
|
"learning_rate": 1.2887553520856493e-05, |
|
"loss": 0.5266, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 3.23, |
|
"grad_norm": 1.5351653099060059, |
|
"learning_rate": 1.2692287558419274e-05, |
|
"loss": 0.5224, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 3.33, |
|
"grad_norm": 1.5351920127868652, |
|
"learning_rate": 1.2497021595982055e-05, |
|
"loss": 0.622, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"grad_norm": 1.5447399616241455, |
|
"learning_rate": 1.2301755633544835e-05, |
|
"loss": 0.4725, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 3.54, |
|
"grad_norm": 4.063059329986572, |
|
"learning_rate": 1.2106489671107616e-05, |
|
"loss": 0.4089, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 3.65, |
|
"grad_norm": 2.517096519470215, |
|
"learning_rate": 1.1911223708670397e-05, |
|
"loss": 0.579, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 2.847086191177368, |
|
"learning_rate": 1.1755010938720622e-05, |
|
"loss": 0.4687, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 3.85, |
|
"grad_norm": 1.7841814756393433, |
|
"learning_rate": 1.1559744976283403e-05, |
|
"loss": 0.4846, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 3.96, |
|
"grad_norm": 3.5284550189971924, |
|
"learning_rate": 1.1364479013846184e-05, |
|
"loss": 0.5628, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_f1": 0.74, |
|
"eval_loss": 0.6948537826538086, |
|
"eval_runtime": 1.3673, |
|
"eval_samples_per_second": 46.807, |
|
"eval_steps_per_second": 5.851, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 4.06, |
|
"grad_norm": 1.6642357110977173, |
|
"learning_rate": 1.1169213051408962e-05, |
|
"loss": 0.5529, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 4.17, |
|
"grad_norm": 3.0695078372955322, |
|
"learning_rate": 1.0973947088971741e-05, |
|
"loss": 0.3534, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 4.27, |
|
"grad_norm": 2.5939130783081055, |
|
"learning_rate": 1.0778681126534522e-05, |
|
"loss": 0.4534, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 4.38, |
|
"grad_norm": 9.777613639831543, |
|
"learning_rate": 1.0583415164097303e-05, |
|
"loss": 0.5345, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 4.48, |
|
"grad_norm": 4.490750312805176, |
|
"learning_rate": 1.0388149201660083e-05, |
|
"loss": 0.4854, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 4.58, |
|
"grad_norm": 3.7475597858428955, |
|
"learning_rate": 1.0192883239222864e-05, |
|
"loss": 0.4174, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 4.69, |
|
"grad_norm": 1.7532075643539429, |
|
"learning_rate": 9.997617276785644e-06, |
|
"loss": 0.4379, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 4.79, |
|
"grad_norm": 1.4538604021072388, |
|
"learning_rate": 9.802351314348425e-06, |
|
"loss": 0.3269, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 4.9, |
|
"grad_norm": 3.058314085006714, |
|
"learning_rate": 9.607085351911206e-06, |
|
"loss": 0.3553, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 5.367821216583252, |
|
"learning_rate": 9.45087258196143e-06, |
|
"loss": 0.4753, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_f1": 0.7551020408163265, |
|
"eval_loss": 0.7308206558227539, |
|
"eval_runtime": 1.3546, |
|
"eval_samples_per_second": 47.246, |
|
"eval_steps_per_second": 5.906, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 5.1, |
|
"grad_norm": 1.079416275024414, |
|
"learning_rate": 9.25560661952421e-06, |
|
"loss": 0.2547, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 5.21, |
|
"grad_norm": 6.890663146972656, |
|
"learning_rate": 9.060340657086991e-06, |
|
"loss": 0.3043, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 5.31, |
|
"grad_norm": 5.391942977905273, |
|
"learning_rate": 8.86507469464977e-06, |
|
"loss": 0.4958, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 5.42, |
|
"grad_norm": Infinity, |
|
"learning_rate": 8.708861924699996e-06, |
|
"loss": 0.4157, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 5.52, |
|
"grad_norm": 2.2877392768859863, |
|
"learning_rate": 8.513595962262776e-06, |
|
"loss": 0.289, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 5.62, |
|
"grad_norm": 6.283646106719971, |
|
"learning_rate": 8.318329999825557e-06, |
|
"loss": 0.4315, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 5.73, |
|
"grad_norm": 8.02907657623291, |
|
"learning_rate": 8.123064037388336e-06, |
|
"loss": 0.3901, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 5.83, |
|
"grad_norm": 5.0022664070129395, |
|
"learning_rate": 7.927798074951117e-06, |
|
"loss": 0.3458, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 5.94, |
|
"grad_norm": 9.924915313720703, |
|
"learning_rate": 7.732532112513897e-06, |
|
"loss": 0.3239, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_f1": 0.7526881720430108, |
|
"eval_loss": 0.734130859375, |
|
"eval_runtime": 1.3629, |
|
"eval_samples_per_second": 46.958, |
|
"eval_steps_per_second": 5.87, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 6.04, |
|
"grad_norm": 6.063592910766602, |
|
"learning_rate": 7.537266150076677e-06, |
|
"loss": 0.4556, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 6.15, |
|
"grad_norm": 5.061245441436768, |
|
"learning_rate": 7.3420001876394575e-06, |
|
"loss": 0.2344, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 6.25, |
|
"grad_norm": 1.229814052581787, |
|
"learning_rate": 7.146734225202238e-06, |
|
"loss": 0.2622, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 6.35, |
|
"grad_norm": 10.423860549926758, |
|
"learning_rate": 6.951468262765019e-06, |
|
"loss": 0.3694, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 6.46, |
|
"grad_norm": 9.655594825744629, |
|
"learning_rate": 6.756202300327799e-06, |
|
"loss": 0.3442, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 6.56, |
|
"grad_norm": 3.6673190593719482, |
|
"learning_rate": 6.560936337890579e-06, |
|
"loss": 0.3164, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 6.67, |
|
"grad_norm": 11.887277603149414, |
|
"learning_rate": 6.36567037545336e-06, |
|
"loss": 0.2361, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 6.77, |
|
"grad_norm": 12.80982780456543, |
|
"learning_rate": 6.17040441301614e-06, |
|
"loss": 0.3502, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 6.88, |
|
"grad_norm": 14.678775787353516, |
|
"learning_rate": 5.975138450578921e-06, |
|
"loss": 0.2829, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 6.98, |
|
"grad_norm": 2.002340078353882, |
|
"learning_rate": 5.7798724881417015e-06, |
|
"loss": 0.3153, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_f1": 0.7368421052631577, |
|
"eval_loss": 0.8391213417053223, |
|
"eval_runtime": 1.3632, |
|
"eval_samples_per_second": 46.947, |
|
"eval_steps_per_second": 5.868, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 7.08, |
|
"grad_norm": 14.519176483154297, |
|
"learning_rate": 5.584606525704481e-06, |
|
"loss": 0.1675, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 7.19, |
|
"grad_norm": 1.7075341939926147, |
|
"learning_rate": 5.389340563267261e-06, |
|
"loss": 0.3502, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 7.29, |
|
"grad_norm": 1.1975409984588623, |
|
"learning_rate": 5.194074600830042e-06, |
|
"loss": 0.1761, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 7.4, |
|
"grad_norm": 24.773109436035156, |
|
"learning_rate": 4.998808638392822e-06, |
|
"loss": 0.2723, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 7.5, |
|
"grad_norm": 1.3228851556777954, |
|
"learning_rate": 4.803542675955603e-06, |
|
"loss": 0.2997, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 7.6, |
|
"grad_norm": 11.504044532775879, |
|
"learning_rate": 4.608276713518383e-06, |
|
"loss": 0.167, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 7.71, |
|
"grad_norm": 4.855327129364014, |
|
"learning_rate": 4.413010751081163e-06, |
|
"loss": 0.1576, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 7.81, |
|
"grad_norm": 3.231921434402466, |
|
"learning_rate": 4.217744788643944e-06, |
|
"loss": 0.3102, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 7.92, |
|
"grad_norm": 11.368690490722656, |
|
"learning_rate": 4.022478826206724e-06, |
|
"loss": 0.3451, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_f1": 0.7333333333333333, |
|
"eval_loss": 0.8304653167724609, |
|
"eval_runtime": 1.3846, |
|
"eval_samples_per_second": 46.222, |
|
"eval_steps_per_second": 5.778, |
|
"step": 384 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 480, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 500, |
|
"total_flos": 5808783041309760.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": { |
|
"learning_rate": 1.6870979154575775e-05, |
|
"per_device_train_batch_size": 4 |
|
} |
|
} |
|
|