{ "best_metric": 1.3194454908370972, "best_model_checkpoint": "saves/Llama-3.2-3B-Instruct/Phi-3.5-mini-instruct-24-9-29\\checkpoint-100", "epoch": 0.16016016016016016, "eval_steps": 100, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016016016016016017, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.2106, "step": 10 }, { "epoch": 0.0032032032032032033, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.4712, "step": 20 }, { "epoch": 0.004804804804804805, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.2974, "step": 30 }, { "epoch": 0.006406406406406407, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.304, "step": 40 }, { "epoch": 0.008008008008008008, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.2863, "step": 50 }, { "epoch": 0.00960960960960961, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.5074, "step": 60 }, { "epoch": 0.01121121121121121, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.3915, "step": 70 }, { "epoch": 0.012812812812812813, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.3476, "step": 80 }, { "epoch": 0.014414414414414415, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.3663, "step": 90 }, { "epoch": 0.016016016016016016, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.428, "step": 100 }, { "epoch": 0.016016016016016016, "eval_loss": 1.3194454908370972, "eval_runtime": 12.8069, "eval_samples_per_second": 7.808, "eval_steps_per_second": 3.904, "step": 100 }, { "epoch": 0.01761761761761762, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.2343, "step": 110 }, { "epoch": 0.01921921921921922, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.343, "step": 120 }, { "epoch": 0.02082082082082082, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.3258, "step": 130 }, { "epoch": 0.02242242242242242, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.3007, "step": 140 }, { "epoch": 0.024024024024024024, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.35, "step": 150 }, { "epoch": 0.025625625625625627, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.2401, "step": 160 }, { "epoch": 0.027227227227227226, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.2047, "step": 170 }, { "epoch": 0.02882882882882883, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.3125, "step": 180 }, { "epoch": 0.03043043043043043, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.283, "step": 190 }, { "epoch": 0.03203203203203203, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.2653, "step": 200 }, { "epoch": 0.03203203203203203, "eval_loss": 1.3194454908370972, "eval_runtime": 12.8038, "eval_samples_per_second": 7.81, "eval_steps_per_second": 3.905, "step": 200 }, { "epoch": 0.033633633633633635, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.3293, "step": 210 }, { "epoch": 0.03523523523523524, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.5406, "step": 220 }, { "epoch": 0.036836836836836834, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.2827, "step": 230 }, { "epoch": 0.03843843843843844, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.2914, "step": 240 }, { "epoch": 0.04004004004004004, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.3335, "step": 250 }, { "epoch": 0.04164164164164164, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.4173, "step": 260 }, { "epoch": 0.043243243243243246, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.2077, "step": 270 }, { "epoch": 0.04484484484484484, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.2665, "step": 280 }, { "epoch": 0.046446446446446445, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.3744, "step": 290 }, { "epoch": 0.04804804804804805, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.3084, "step": 300 }, { "epoch": 0.04804804804804805, "eval_loss": 1.3194454908370972, "eval_runtime": 12.9051, "eval_samples_per_second": 7.749, "eval_steps_per_second": 3.874, "step": 300 }, { "epoch": 0.04964964964964965, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.3119, "step": 310 }, { "epoch": 0.051251251251251254, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.1759, "step": 320 }, { "epoch": 0.05285285285285285, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.4001, "step": 330 }, { "epoch": 0.05445445445445445, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.3775, "step": 340 }, { "epoch": 0.056056056056056056, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.2751, "step": 350 }, { "epoch": 0.05765765765765766, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.3655, "step": 360 }, { "epoch": 0.05925925925925926, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.3717, "step": 370 }, { "epoch": 0.06086086086086086, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.22, "step": 380 }, { "epoch": 0.06246246246246246, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.3391, "step": 390 }, { "epoch": 0.06406406406406406, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.3234, "step": 400 }, { "epoch": 0.06406406406406406, "eval_loss": 1.3194454908370972, "eval_runtime": 12.7832, "eval_samples_per_second": 7.823, "eval_steps_per_second": 3.911, "step": 400 }, { "epoch": 0.06566566566566566, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.1899, "step": 410 }, { "epoch": 0.06726726726726727, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.4017, "step": 420 }, { "epoch": 0.06886886886886887, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.4969, "step": 430 }, { "epoch": 0.07047047047047048, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.2447, "step": 440 }, { "epoch": 0.07207207207207207, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.3669, "step": 450 }, { "epoch": 0.07367367367367367, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.5245, "step": 460 }, { "epoch": 0.07527527527527528, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.4036, "step": 470 }, { "epoch": 0.07687687687687687, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.4029, "step": 480 }, { "epoch": 0.07847847847847848, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.5582, "step": 490 }, { "epoch": 0.08008008008008008, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.4091, "step": 500 }, { "epoch": 0.08008008008008008, "eval_loss": 1.3194454908370972, "eval_runtime": 12.8939, "eval_samples_per_second": 7.756, "eval_steps_per_second": 3.878, "step": 500 }, { "epoch": 0.08168168168168168, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.2554, "step": 510 }, { "epoch": 0.08328328328328329, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.3238, "step": 520 }, { "epoch": 0.08488488488488488, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.2115, "step": 530 }, { "epoch": 0.08648648648648649, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.3589, "step": 540 }, { "epoch": 0.08808808808808809, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.3267, "step": 550 }, { "epoch": 0.08968968968968968, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.2068, "step": 560 }, { "epoch": 0.0912912912912913, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.4504, "step": 570 }, { "epoch": 0.09289289289289289, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.3775, "step": 580 }, { "epoch": 0.0944944944944945, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.3212, "step": 590 }, { "epoch": 0.0960960960960961, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.2878, "step": 600 }, { "epoch": 0.0960960960960961, "eval_loss": 1.3194454908370972, "eval_runtime": 12.7841, "eval_samples_per_second": 7.822, "eval_steps_per_second": 3.911, "step": 600 }, { "epoch": 0.09769769769769769, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.4902, "step": 610 }, { "epoch": 0.0992992992992993, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.203, "step": 620 }, { "epoch": 0.1009009009009009, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.2783, "step": 630 }, { "epoch": 0.10250250250250251, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.2326, "step": 640 }, { "epoch": 0.1041041041041041, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.3132, "step": 650 }, { "epoch": 0.1057057057057057, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.2495, "step": 660 }, { "epoch": 0.10730730730730731, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.329, "step": 670 }, { "epoch": 0.1089089089089089, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.3178, "step": 680 }, { "epoch": 0.11051051051051052, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.3798, "step": 690 }, { "epoch": 0.11211211211211211, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.2933, "step": 700 }, { "epoch": 0.11211211211211211, "eval_loss": 1.3194454908370972, "eval_runtime": 12.8596, "eval_samples_per_second": 7.776, "eval_steps_per_second": 3.888, "step": 700 }, { "epoch": 0.11371371371371371, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.2721, "step": 710 }, { "epoch": 0.11531531531531532, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.3859, "step": 720 }, { "epoch": 0.11691691691691691, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.2536, "step": 730 }, { "epoch": 0.11851851851851852, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.3625, "step": 740 }, { "epoch": 0.12012012012012012, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.3386, "step": 750 }, { "epoch": 0.12172172172172172, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.4023, "step": 760 }, { "epoch": 0.12332332332332333, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.2977, "step": 770 }, { "epoch": 0.12492492492492492, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.4091, "step": 780 }, { "epoch": 0.12652652652652652, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.2513, "step": 790 }, { "epoch": 0.12812812812812813, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.3246, "step": 800 }, { "epoch": 0.12812812812812813, "eval_loss": 1.3194454908370972, "eval_runtime": 12.7102, "eval_samples_per_second": 7.868, "eval_steps_per_second": 3.934, "step": 800 }, { "epoch": 0.12972972972972974, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.3271, "step": 810 }, { "epoch": 0.13133133133133132, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.3497, "step": 820 }, { "epoch": 0.13293293293293293, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.2833, "step": 830 }, { "epoch": 0.13453453453453454, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.3491, "step": 840 }, { "epoch": 0.13613613613613615, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.2592, "step": 850 }, { "epoch": 0.13773773773773773, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.3241, "step": 860 }, { "epoch": 0.13933933933933934, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.3809, "step": 870 }, { "epoch": 0.14094094094094095, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.294, "step": 880 }, { "epoch": 0.14254254254254253, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.3025, "step": 890 }, { "epoch": 0.14414414414414414, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.2911, "step": 900 }, { "epoch": 0.14414414414414414, "eval_loss": 1.3194454908370972, "eval_runtime": 12.7786, "eval_samples_per_second": 7.826, "eval_steps_per_second": 3.913, "step": 900 }, { "epoch": 0.14574574574574575, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.3149, "step": 910 }, { "epoch": 0.14734734734734733, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.2697, "step": 920 }, { "epoch": 0.14894894894894894, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.3088, "step": 930 }, { "epoch": 0.15055055055055055, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.3579, "step": 940 }, { "epoch": 0.15215215215215216, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.4073, "step": 950 }, { "epoch": 0.15375375375375375, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.2242, "step": 960 }, { "epoch": 0.15535535535535536, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.3585, "step": 970 }, { "epoch": 0.15695695695695697, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.2407, "step": 980 }, { "epoch": 0.15855855855855855, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.2609, "step": 990 }, { "epoch": 0.16016016016016016, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 1.4227, "step": 1000 }, { "epoch": 0.16016016016016016, "eval_loss": 1.3194454908370972, "eval_runtime": 12.8395, "eval_samples_per_second": 7.788, "eval_steps_per_second": 3.894, "step": 1000 }, { "epoch": 0.16016016016016016, "step": 1000, "total_flos": 2.9103944129622835e+17, "train_loss": 1.3281120185852051, "train_runtime": 3519.7932, "train_samples_per_second": 4.546, "train_steps_per_second": 0.284 } ], "logging_steps": 10, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.9103944129622835e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }