Phi-3.5-mini-instruct-24-9-29 / trainer_state.json
win10's picture
Upload folder using huggingface_hub
ca2e14b verified
raw
history blame
18.5 kB
{
"best_metric": 1.3194454908370972,
"best_model_checkpoint": "saves/Llama-3.2-3B-Instruct/Phi-3.5-mini-instruct-24-9-29\\checkpoint-100",
"epoch": 0.16016016016016016,
"eval_steps": 100,
"global_step": 1000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0016016016016016017,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.2106,
"step": 10
},
{
"epoch": 0.0032032032032032033,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.4712,
"step": 20
},
{
"epoch": 0.004804804804804805,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.2974,
"step": 30
},
{
"epoch": 0.006406406406406407,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.304,
"step": 40
},
{
"epoch": 0.008008008008008008,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.2863,
"step": 50
},
{
"epoch": 0.00960960960960961,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.5074,
"step": 60
},
{
"epoch": 0.01121121121121121,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.3915,
"step": 70
},
{
"epoch": 0.012812812812812813,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.3476,
"step": 80
},
{
"epoch": 0.014414414414414415,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.3663,
"step": 90
},
{
"epoch": 0.016016016016016016,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.428,
"step": 100
},
{
"epoch": 0.016016016016016016,
"eval_loss": 1.3194454908370972,
"eval_runtime": 12.8069,
"eval_samples_per_second": 7.808,
"eval_steps_per_second": 3.904,
"step": 100
},
{
"epoch": 0.01761761761761762,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.2343,
"step": 110
},
{
"epoch": 0.01921921921921922,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.343,
"step": 120
},
{
"epoch": 0.02082082082082082,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.3258,
"step": 130
},
{
"epoch": 0.02242242242242242,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.3007,
"step": 140
},
{
"epoch": 0.024024024024024024,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.35,
"step": 150
},
{
"epoch": 0.025625625625625627,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.2401,
"step": 160
},
{
"epoch": 0.027227227227227226,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.2047,
"step": 170
},
{
"epoch": 0.02882882882882883,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.3125,
"step": 180
},
{
"epoch": 0.03043043043043043,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.283,
"step": 190
},
{
"epoch": 0.03203203203203203,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.2653,
"step": 200
},
{
"epoch": 0.03203203203203203,
"eval_loss": 1.3194454908370972,
"eval_runtime": 12.8038,
"eval_samples_per_second": 7.81,
"eval_steps_per_second": 3.905,
"step": 200
},
{
"epoch": 0.033633633633633635,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.3293,
"step": 210
},
{
"epoch": 0.03523523523523524,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.5406,
"step": 220
},
{
"epoch": 0.036836836836836834,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.2827,
"step": 230
},
{
"epoch": 0.03843843843843844,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.2914,
"step": 240
},
{
"epoch": 0.04004004004004004,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.3335,
"step": 250
},
{
"epoch": 0.04164164164164164,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.4173,
"step": 260
},
{
"epoch": 0.043243243243243246,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.2077,
"step": 270
},
{
"epoch": 0.04484484484484484,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.2665,
"step": 280
},
{
"epoch": 0.046446446446446445,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.3744,
"step": 290
},
{
"epoch": 0.04804804804804805,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.3084,
"step": 300
},
{
"epoch": 0.04804804804804805,
"eval_loss": 1.3194454908370972,
"eval_runtime": 12.9051,
"eval_samples_per_second": 7.749,
"eval_steps_per_second": 3.874,
"step": 300
},
{
"epoch": 0.04964964964964965,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.3119,
"step": 310
},
{
"epoch": 0.051251251251251254,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.1759,
"step": 320
},
{
"epoch": 0.05285285285285285,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.4001,
"step": 330
},
{
"epoch": 0.05445445445445445,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.3775,
"step": 340
},
{
"epoch": 0.056056056056056056,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.2751,
"step": 350
},
{
"epoch": 0.05765765765765766,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.3655,
"step": 360
},
{
"epoch": 0.05925925925925926,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.3717,
"step": 370
},
{
"epoch": 0.06086086086086086,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.22,
"step": 380
},
{
"epoch": 0.06246246246246246,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.3391,
"step": 390
},
{
"epoch": 0.06406406406406406,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.3234,
"step": 400
},
{
"epoch": 0.06406406406406406,
"eval_loss": 1.3194454908370972,
"eval_runtime": 12.7832,
"eval_samples_per_second": 7.823,
"eval_steps_per_second": 3.911,
"step": 400
},
{
"epoch": 0.06566566566566566,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.1899,
"step": 410
},
{
"epoch": 0.06726726726726727,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.4017,
"step": 420
},
{
"epoch": 0.06886886886886887,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.4969,
"step": 430
},
{
"epoch": 0.07047047047047048,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.2447,
"step": 440
},
{
"epoch": 0.07207207207207207,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.3669,
"step": 450
},
{
"epoch": 0.07367367367367367,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.5245,
"step": 460
},
{
"epoch": 0.07527527527527528,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.4036,
"step": 470
},
{
"epoch": 0.07687687687687687,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.4029,
"step": 480
},
{
"epoch": 0.07847847847847848,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.5582,
"step": 490
},
{
"epoch": 0.08008008008008008,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.4091,
"step": 500
},
{
"epoch": 0.08008008008008008,
"eval_loss": 1.3194454908370972,
"eval_runtime": 12.8939,
"eval_samples_per_second": 7.756,
"eval_steps_per_second": 3.878,
"step": 500
},
{
"epoch": 0.08168168168168168,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.2554,
"step": 510
},
{
"epoch": 0.08328328328328329,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.3238,
"step": 520
},
{
"epoch": 0.08488488488488488,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.2115,
"step": 530
},
{
"epoch": 0.08648648648648649,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.3589,
"step": 540
},
{
"epoch": 0.08808808808808809,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.3267,
"step": 550
},
{
"epoch": 0.08968968968968968,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.2068,
"step": 560
},
{
"epoch": 0.0912912912912913,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.4504,
"step": 570
},
{
"epoch": 0.09289289289289289,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.3775,
"step": 580
},
{
"epoch": 0.0944944944944945,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.3212,
"step": 590
},
{
"epoch": 0.0960960960960961,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.2878,
"step": 600
},
{
"epoch": 0.0960960960960961,
"eval_loss": 1.3194454908370972,
"eval_runtime": 12.7841,
"eval_samples_per_second": 7.822,
"eval_steps_per_second": 3.911,
"step": 600
},
{
"epoch": 0.09769769769769769,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.4902,
"step": 610
},
{
"epoch": 0.0992992992992993,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.203,
"step": 620
},
{
"epoch": 0.1009009009009009,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.2783,
"step": 630
},
{
"epoch": 0.10250250250250251,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.2326,
"step": 640
},
{
"epoch": 0.1041041041041041,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.3132,
"step": 650
},
{
"epoch": 0.1057057057057057,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.2495,
"step": 660
},
{
"epoch": 0.10730730730730731,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.329,
"step": 670
},
{
"epoch": 0.1089089089089089,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.3178,
"step": 680
},
{
"epoch": 0.11051051051051052,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.3798,
"step": 690
},
{
"epoch": 0.11211211211211211,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.2933,
"step": 700
},
{
"epoch": 0.11211211211211211,
"eval_loss": 1.3194454908370972,
"eval_runtime": 12.8596,
"eval_samples_per_second": 7.776,
"eval_steps_per_second": 3.888,
"step": 700
},
{
"epoch": 0.11371371371371371,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.2721,
"step": 710
},
{
"epoch": 0.11531531531531532,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.3859,
"step": 720
},
{
"epoch": 0.11691691691691691,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.2536,
"step": 730
},
{
"epoch": 0.11851851851851852,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.3625,
"step": 740
},
{
"epoch": 0.12012012012012012,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.3386,
"step": 750
},
{
"epoch": 0.12172172172172172,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.4023,
"step": 760
},
{
"epoch": 0.12332332332332333,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.2977,
"step": 770
},
{
"epoch": 0.12492492492492492,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.4091,
"step": 780
},
{
"epoch": 0.12652652652652652,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.2513,
"step": 790
},
{
"epoch": 0.12812812812812813,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.3246,
"step": 800
},
{
"epoch": 0.12812812812812813,
"eval_loss": 1.3194454908370972,
"eval_runtime": 12.7102,
"eval_samples_per_second": 7.868,
"eval_steps_per_second": 3.934,
"step": 800
},
{
"epoch": 0.12972972972972974,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.3271,
"step": 810
},
{
"epoch": 0.13133133133133132,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.3497,
"step": 820
},
{
"epoch": 0.13293293293293293,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.2833,
"step": 830
},
{
"epoch": 0.13453453453453454,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.3491,
"step": 840
},
{
"epoch": 0.13613613613613615,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.2592,
"step": 850
},
{
"epoch": 0.13773773773773773,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.3241,
"step": 860
},
{
"epoch": 0.13933933933933934,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.3809,
"step": 870
},
{
"epoch": 0.14094094094094095,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.294,
"step": 880
},
{
"epoch": 0.14254254254254253,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.3025,
"step": 890
},
{
"epoch": 0.14414414414414414,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.2911,
"step": 900
},
{
"epoch": 0.14414414414414414,
"eval_loss": 1.3194454908370972,
"eval_runtime": 12.7786,
"eval_samples_per_second": 7.826,
"eval_steps_per_second": 3.913,
"step": 900
},
{
"epoch": 0.14574574574574575,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.3149,
"step": 910
},
{
"epoch": 0.14734734734734733,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.2697,
"step": 920
},
{
"epoch": 0.14894894894894894,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.3088,
"step": 930
},
{
"epoch": 0.15055055055055055,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.3579,
"step": 940
},
{
"epoch": 0.15215215215215216,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.4073,
"step": 950
},
{
"epoch": 0.15375375375375375,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.2242,
"step": 960
},
{
"epoch": 0.15535535535535536,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.3585,
"step": 970
},
{
"epoch": 0.15695695695695697,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.2407,
"step": 980
},
{
"epoch": 0.15855855855855855,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.2609,
"step": 990
},
{
"epoch": 0.16016016016016016,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.4227,
"step": 1000
},
{
"epoch": 0.16016016016016016,
"eval_loss": 1.3194454908370972,
"eval_runtime": 12.8395,
"eval_samples_per_second": 7.788,
"eval_steps_per_second": 3.894,
"step": 1000
},
{
"epoch": 0.16016016016016016,
"step": 1000,
"total_flos": 2.9103944129622835e+17,
"train_loss": 1.3281120185852051,
"train_runtime": 3519.7932,
"train_samples_per_second": 4.546,
"train_steps_per_second": 0.284
}
],
"logging_steps": 10,
"max_steps": 1000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.9103944129622835e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}