ntr-base-qrecc / checkpoint-20000 /trainer_state.json
3v324v23's picture
update new models
ccb7d61
raw
history blame
13 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.5195263290501386,
"eval_steps": 500,
"global_step": 20000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.06,
"learning_rate": 0.000975,
"loss": 1.0238,
"step": 500
},
{
"epoch": 0.06,
"eval_loss": 0.884382426738739,
"eval_runtime": 4.1887,
"eval_samples_per_second": 238.736,
"eval_steps_per_second": 29.842,
"step": 500
},
{
"epoch": 0.13,
"learning_rate": 0.00095,
"loss": 0.9971,
"step": 1000
},
{
"epoch": 0.13,
"eval_loss": 0.8562089800834656,
"eval_runtime": 4.1399,
"eval_samples_per_second": 241.551,
"eval_steps_per_second": 30.194,
"step": 1000
},
{
"epoch": 0.19,
"learning_rate": 0.000925,
"loss": 0.9522,
"step": 1500
},
{
"epoch": 0.19,
"eval_loss": 0.8645691871643066,
"eval_runtime": 4.1915,
"eval_samples_per_second": 238.576,
"eval_steps_per_second": 29.822,
"step": 1500
},
{
"epoch": 0.25,
"learning_rate": 0.0009000000000000001,
"loss": 0.9143,
"step": 2000
},
{
"epoch": 0.25,
"eval_loss": 0.8668216466903687,
"eval_runtime": 4.1753,
"eval_samples_per_second": 239.502,
"eval_steps_per_second": 29.938,
"step": 2000
},
{
"epoch": 0.31,
"learning_rate": 0.000875,
"loss": 0.9151,
"step": 2500
},
{
"epoch": 0.31,
"eval_loss": 0.8562004566192627,
"eval_runtime": 4.2561,
"eval_samples_per_second": 234.956,
"eval_steps_per_second": 29.369,
"step": 2500
},
{
"epoch": 0.38,
"learning_rate": 0.00085,
"loss": 0.9263,
"step": 3000
},
{
"epoch": 0.38,
"eval_loss": 0.850217342376709,
"eval_runtime": 4.1404,
"eval_samples_per_second": 241.521,
"eval_steps_per_second": 30.19,
"step": 3000
},
{
"epoch": 0.44,
"learning_rate": 0.000825,
"loss": 0.9102,
"step": 3500
},
{
"epoch": 0.44,
"eval_loss": 0.8581109642982483,
"eval_runtime": 4.1878,
"eval_samples_per_second": 238.788,
"eval_steps_per_second": 29.849,
"step": 3500
},
{
"epoch": 0.5,
"learning_rate": 0.0008,
"loss": 0.8866,
"step": 4000
},
{
"epoch": 0.5,
"eval_loss": 0.8308799862861633,
"eval_runtime": 4.1108,
"eval_samples_per_second": 243.259,
"eval_steps_per_second": 30.407,
"step": 4000
},
{
"epoch": 0.57,
"learning_rate": 0.0007750000000000001,
"loss": 0.911,
"step": 4500
},
{
"epoch": 0.57,
"eval_loss": 0.801396369934082,
"eval_runtime": 4.1725,
"eval_samples_per_second": 239.665,
"eval_steps_per_second": 29.958,
"step": 4500
},
{
"epoch": 0.63,
"learning_rate": 0.00075,
"loss": 0.8536,
"step": 5000
},
{
"epoch": 0.63,
"eval_loss": 0.8181760907173157,
"eval_runtime": 4.1738,
"eval_samples_per_second": 239.59,
"eval_steps_per_second": 29.949,
"step": 5000
},
{
"epoch": 0.69,
"learning_rate": 0.000725,
"loss": 0.8277,
"step": 5500
},
{
"epoch": 0.69,
"eval_loss": 0.7971529960632324,
"eval_runtime": 4.1414,
"eval_samples_per_second": 241.466,
"eval_steps_per_second": 30.183,
"step": 5500
},
{
"epoch": 0.76,
"learning_rate": 0.0007,
"loss": 0.8413,
"step": 6000
},
{
"epoch": 0.76,
"eval_loss": 0.8046051263809204,
"eval_runtime": 4.278,
"eval_samples_per_second": 233.755,
"eval_steps_per_second": 29.219,
"step": 6000
},
{
"epoch": 0.82,
"learning_rate": 0.000675,
"loss": 0.8491,
"step": 6500
},
{
"epoch": 0.82,
"eval_loss": 0.8008071184158325,
"eval_runtime": 4.2965,
"eval_samples_per_second": 232.746,
"eval_steps_per_second": 29.093,
"step": 6500
},
{
"epoch": 0.88,
"learning_rate": 0.0006500000000000001,
"loss": 0.8077,
"step": 7000
},
{
"epoch": 0.88,
"eval_loss": 0.7875821590423584,
"eval_runtime": 4.1627,
"eval_samples_per_second": 240.231,
"eval_steps_per_second": 30.029,
"step": 7000
},
{
"epoch": 0.94,
"learning_rate": 0.000625,
"loss": 0.796,
"step": 7500
},
{
"epoch": 0.94,
"eval_loss": 0.7883646488189697,
"eval_runtime": 4.2129,
"eval_samples_per_second": 237.368,
"eval_steps_per_second": 29.671,
"step": 7500
},
{
"epoch": 1.01,
"learning_rate": 0.0006,
"loss": 0.7952,
"step": 8000
},
{
"epoch": 1.01,
"eval_loss": 0.7713276743888855,
"eval_runtime": 4.1008,
"eval_samples_per_second": 243.858,
"eval_steps_per_second": 30.482,
"step": 8000
},
{
"epoch": 1.07,
"learning_rate": 0.000575,
"loss": 0.5761,
"step": 8500
},
{
"epoch": 1.07,
"eval_loss": 0.7843192219734192,
"eval_runtime": 4.1527,
"eval_samples_per_second": 240.808,
"eval_steps_per_second": 30.101,
"step": 8500
},
{
"epoch": 1.13,
"learning_rate": 0.00055,
"loss": 0.6084,
"step": 9000
},
{
"epoch": 1.13,
"eval_loss": 0.7597091197967529,
"eval_runtime": 4.2348,
"eval_samples_per_second": 236.137,
"eval_steps_per_second": 29.517,
"step": 9000
},
{
"epoch": 1.2,
"learning_rate": 0.0005250000000000001,
"loss": 0.5863,
"step": 9500
},
{
"epoch": 1.2,
"eval_loss": 0.7753661274909973,
"eval_runtime": 4.2434,
"eval_samples_per_second": 235.658,
"eval_steps_per_second": 29.457,
"step": 9500
},
{
"epoch": 1.26,
"learning_rate": 0.0005,
"loss": 0.5835,
"step": 10000
},
{
"epoch": 1.26,
"eval_loss": 0.7525186538696289,
"eval_runtime": 4.1329,
"eval_samples_per_second": 241.962,
"eval_steps_per_second": 30.245,
"step": 10000
},
{
"epoch": 1.32,
"learning_rate": 0.000475,
"loss": 0.5751,
"step": 10500
},
{
"epoch": 1.32,
"eval_loss": 0.7624223828315735,
"eval_runtime": 4.1841,
"eval_samples_per_second": 239.002,
"eval_steps_per_second": 29.875,
"step": 10500
},
{
"epoch": 1.39,
"learning_rate": 0.00045000000000000004,
"loss": 0.5746,
"step": 11000
},
{
"epoch": 1.39,
"eval_loss": 0.7706524729728699,
"eval_runtime": 4.2408,
"eval_samples_per_second": 235.806,
"eval_steps_per_second": 29.476,
"step": 11000
},
{
"epoch": 1.45,
"learning_rate": 0.000425,
"loss": 0.5847,
"step": 11500
},
{
"epoch": 1.45,
"eval_loss": 0.7473410367965698,
"eval_runtime": 4.1712,
"eval_samples_per_second": 239.742,
"eval_steps_per_second": 29.968,
"step": 11500
},
{
"epoch": 1.51,
"learning_rate": 0.0004,
"loss": 0.5759,
"step": 12000
},
{
"epoch": 1.51,
"eval_loss": 0.7421715259552002,
"eval_runtime": 4.1644,
"eval_samples_per_second": 240.128,
"eval_steps_per_second": 30.016,
"step": 12000
},
{
"epoch": 1.57,
"learning_rate": 0.000375,
"loss": 0.5922,
"step": 12500
},
{
"epoch": 1.57,
"eval_loss": 0.7362275123596191,
"eval_runtime": 4.1889,
"eval_samples_per_second": 238.726,
"eval_steps_per_second": 29.841,
"step": 12500
},
{
"epoch": 1.64,
"learning_rate": 0.00035,
"loss": 0.5678,
"step": 13000
},
{
"epoch": 1.64,
"eval_loss": 0.7327093482017517,
"eval_runtime": 4.151,
"eval_samples_per_second": 240.907,
"eval_steps_per_second": 30.113,
"step": 13000
},
{
"epoch": 1.7,
"learning_rate": 0.00032500000000000004,
"loss": 0.5627,
"step": 13500
},
{
"epoch": 1.7,
"eval_loss": 0.7284647226333618,
"eval_runtime": 4.2289,
"eval_samples_per_second": 236.47,
"eval_steps_per_second": 29.559,
"step": 13500
},
{
"epoch": 1.76,
"learning_rate": 0.0003,
"loss": 0.5568,
"step": 14000
},
{
"epoch": 1.76,
"eval_loss": 0.7164832949638367,
"eval_runtime": 4.2221,
"eval_samples_per_second": 236.848,
"eval_steps_per_second": 29.606,
"step": 14000
},
{
"epoch": 1.83,
"learning_rate": 0.000275,
"loss": 0.5589,
"step": 14500
},
{
"epoch": 1.83,
"eval_loss": 0.7153368592262268,
"eval_runtime": 4.2183,
"eval_samples_per_second": 237.06,
"eval_steps_per_second": 29.633,
"step": 14500
},
{
"epoch": 1.89,
"learning_rate": 0.00025,
"loss": 0.5675,
"step": 15000
},
{
"epoch": 1.89,
"eval_loss": 0.7135257720947266,
"eval_runtime": 4.2335,
"eval_samples_per_second": 236.21,
"eval_steps_per_second": 29.526,
"step": 15000
},
{
"epoch": 1.95,
"learning_rate": 0.00022500000000000002,
"loss": 0.5422,
"step": 15500
},
{
"epoch": 1.95,
"eval_loss": 0.719129741191864,
"eval_runtime": 4.2149,
"eval_samples_per_second": 237.255,
"eval_steps_per_second": 29.657,
"step": 15500
},
{
"epoch": 2.02,
"learning_rate": 0.0002,
"loss": 0.5104,
"step": 16000
},
{
"epoch": 2.02,
"eval_loss": 0.7474156022071838,
"eval_runtime": 4.1473,
"eval_samples_per_second": 241.123,
"eval_steps_per_second": 30.14,
"step": 16000
},
{
"epoch": 2.08,
"learning_rate": 0.000175,
"loss": 0.3835,
"step": 16500
},
{
"epoch": 2.08,
"eval_loss": 0.7469050288200378,
"eval_runtime": 4.1498,
"eval_samples_per_second": 240.978,
"eval_steps_per_second": 30.122,
"step": 16500
},
{
"epoch": 2.14,
"learning_rate": 0.00015,
"loss": 0.3805,
"step": 17000
},
{
"epoch": 2.14,
"eval_loss": 0.7489848136901855,
"eval_runtime": 3.9927,
"eval_samples_per_second": 250.456,
"eval_steps_per_second": 31.307,
"step": 17000
},
{
"epoch": 2.2,
"learning_rate": 0.000125,
"loss": 0.3718,
"step": 17500
},
{
"epoch": 2.2,
"eval_loss": 0.7569396495819092,
"eval_runtime": 4.0483,
"eval_samples_per_second": 247.019,
"eval_steps_per_second": 30.877,
"step": 17500
},
{
"epoch": 2.27,
"learning_rate": 0.0001,
"loss": 0.3688,
"step": 18000
},
{
"epoch": 2.27,
"eval_loss": 0.7492154836654663,
"eval_runtime": 4.0676,
"eval_samples_per_second": 245.848,
"eval_steps_per_second": 30.731,
"step": 18000
},
{
"epoch": 2.33,
"learning_rate": 7.5e-05,
"loss": 0.3645,
"step": 18500
},
{
"epoch": 2.33,
"eval_loss": 0.7465029954910278,
"eval_runtime": 4.2652,
"eval_samples_per_second": 234.456,
"eval_steps_per_second": 29.307,
"step": 18500
},
{
"epoch": 2.39,
"learning_rate": 5e-05,
"loss": 0.3865,
"step": 19000
},
{
"epoch": 2.39,
"eval_loss": 0.7468438744544983,
"eval_runtime": 4.0121,
"eval_samples_per_second": 249.249,
"eval_steps_per_second": 31.156,
"step": 19000
},
{
"epoch": 2.46,
"learning_rate": 2.5e-05,
"loss": 0.3633,
"step": 19500
},
{
"epoch": 2.46,
"eval_loss": 0.7479762434959412,
"eval_runtime": 4.0316,
"eval_samples_per_second": 248.04,
"eval_steps_per_second": 31.005,
"step": 19500
},
{
"epoch": 2.52,
"learning_rate": 0.0,
"loss": 0.3674,
"step": 20000
},
{
"epoch": 2.52,
"eval_loss": 0.7462155222892761,
"eval_runtime": 4.0244,
"eval_samples_per_second": 248.485,
"eval_steps_per_second": 31.061,
"step": 20000
}
],
"logging_steps": 500,
"max_steps": 20000,
"num_train_epochs": 3,
"save_steps": 5000,
"total_flos": 3.549121832463667e+16,
"trial_name": null,
"trial_params": null
}