{ "best_metric": 0.7564328908920288, "best_model_checkpoint": "../../output/llama3-8b-sft/LangGPT/checkpoint-700", "epoch": 9.0, "eval_steps": 100, "global_step": 720, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.125, "grad_norm": 0.33964911103248596, "learning_rate": 2.5e-05, "loss": 1.1993, "step": 10 }, { "epoch": 0.25, "grad_norm": 0.4269595146179199, "learning_rate": 5e-05, "loss": 1.1546, "step": 20 }, { "epoch": 0.375, "grad_norm": 0.39937543869018555, "learning_rate": 4.997482666353287e-05, "loss": 1.0663, "step": 30 }, { "epoch": 0.5, "grad_norm": 0.3308526277542114, "learning_rate": 4.989935734988098e-05, "loss": 1.0087, "step": 40 }, { "epoch": 0.625, "grad_norm": 0.2543019950389862, "learning_rate": 4.977374404419837e-05, "loss": 0.9476, "step": 50 }, { "epoch": 0.75, "grad_norm": 0.2518838346004486, "learning_rate": 4.959823971496574e-05, "loss": 0.9267, "step": 60 }, { "epoch": 0.875, "grad_norm": 0.2807057201862335, "learning_rate": 4.937319780454559e-05, "loss": 0.8875, "step": 70 }, { "epoch": 1.0, "grad_norm": 0.25611793994903564, "learning_rate": 4.909907151739633e-05, "loss": 0.8747, "step": 80 }, { "epoch": 1.125, "grad_norm": 0.28418758511543274, "learning_rate": 4.877641290737884e-05, "loss": 0.8721, "step": 90 }, { "epoch": 1.25, "grad_norm": 0.294350266456604, "learning_rate": 4.8405871765993433e-05, "loss": 0.8441, "step": 100 }, { "epoch": 1.25, "eval_loss": 0.8461617231369019, "eval_runtime": 158.2385, "eval_samples_per_second": 5.397, "eval_steps_per_second": 1.352, "step": 100 }, { "epoch": 1.375, "grad_norm": 0.33300408720970154, "learning_rate": 4.7988194313786275e-05, "loss": 0.8252, "step": 110 }, { "epoch": 1.5, "grad_norm": 0.29645124077796936, "learning_rate": 4.752422169756048e-05, "loss": 0.8232, "step": 120 }, { "epoch": 1.625, "grad_norm": 0.3273533284664154, "learning_rate": 4.701488829641845e-05, "loss": 0.8061, "step": 130 }, { "epoch": 1.75, "grad_norm": 0.3246830105781555, "learning_rate": 4.6461219840046654e-05, "loss": 0.8023, "step": 140 }, { "epoch": 1.875, "grad_norm": 0.3237927556037903, "learning_rate": 4.586433134303257e-05, "loss": 0.8131, "step": 150 }, { "epoch": 2.0, "grad_norm": 0.3896329402923584, "learning_rate": 4.522542485937369e-05, "loss": 0.8096, "step": 160 }, { "epoch": 2.125, "grad_norm": 0.4054020345211029, "learning_rate": 4.454578706170075e-05, "loss": 0.8003, "step": 170 }, { "epoch": 2.25, "grad_norm": 0.42092710733413696, "learning_rate": 4.382678665009028e-05, "loss": 0.7769, "step": 180 }, { "epoch": 2.375, "grad_norm": 0.35680437088012695, "learning_rate": 4.306987159568479e-05, "loss": 0.78, "step": 190 }, { "epoch": 2.5, "grad_norm": 0.379029780626297, "learning_rate": 4.227656622467162e-05, "loss": 0.7829, "step": 200 }, { "epoch": 2.5, "eval_loss": 0.7941725254058838, "eval_runtime": 158.4268, "eval_samples_per_second": 5.391, "eval_steps_per_second": 1.351, "step": 200 }, { "epoch": 2.625, "grad_norm": 0.44896718859672546, "learning_rate": 4.144846814849282e-05, "loss": 0.7874, "step": 210 }, { "epoch": 2.75, "grad_norm": 0.40545764565467834, "learning_rate": 4.058724504646834e-05, "loss": 0.7787, "step": 220 }, { "epoch": 2.875, "grad_norm": 0.37240272760391235, "learning_rate": 3.969463130731183e-05, "loss": 0.7789, "step": 230 }, { "epoch": 3.0, "grad_norm": 0.4221615493297577, "learning_rate": 3.8772424536302564e-05, "loss": 0.77, "step": 240 }, { "epoch": 3.125, "grad_norm": 0.38211557269096375, "learning_rate": 3.782248193514766e-05, "loss": 0.7775, "step": 250 }, { "epoch": 3.25, "grad_norm": 0.44149255752563477, "learning_rate": 3.6846716561824965e-05, "loss": 0.7697, "step": 260 }, { "epoch": 3.375, "grad_norm": 0.41183075308799744, "learning_rate": 3.5847093477938956e-05, "loss": 0.7688, "step": 270 }, { "epoch": 3.5, "grad_norm": 0.4264132082462311, "learning_rate": 3.4825625791348096e-05, "loss": 0.7503, "step": 280 }, { "epoch": 3.625, "grad_norm": 0.43148595094680786, "learning_rate": 3.378437060203357e-05, "loss": 0.774, "step": 290 }, { "epoch": 3.75, "grad_norm": 0.42241978645324707, "learning_rate": 3.272542485937369e-05, "loss": 0.747, "step": 300 }, { "epoch": 3.75, "eval_loss": 0.7737195491790771, "eval_runtime": 158.4242, "eval_samples_per_second": 5.391, "eval_steps_per_second": 1.351, "step": 300 }, { "epoch": 3.875, "grad_norm": 0.4389643371105194, "learning_rate": 3.165092113916688e-05, "loss": 0.7535, "step": 310 }, { "epoch": 4.0, "grad_norm": 0.4277195930480957, "learning_rate": 3.056302334890786e-05, "loss": 0.7539, "step": 320 }, { "epoch": 4.125, "grad_norm": 0.39998236298561096, "learning_rate": 2.9463922369965917e-05, "loss": 0.7722, "step": 330 }, { "epoch": 4.25, "grad_norm": 0.42156070470809937, "learning_rate": 2.8355831645441388e-05, "loss": 0.7584, "step": 340 }, { "epoch": 4.375, "grad_norm": 0.47488293051719666, "learning_rate": 2.724098272258584e-05, "loss": 0.7432, "step": 350 }, { "epoch": 4.5, "grad_norm": 0.4431322515010834, "learning_rate": 2.6121620758762877e-05, "loss": 0.7454, "step": 360 }, { "epoch": 4.625, "grad_norm": 0.5012672543525696, "learning_rate": 2.5e-05, "loss": 0.7285, "step": 370 }, { "epoch": 4.75, "grad_norm": 0.4605712592601776, "learning_rate": 2.3878379241237136e-05, "loss": 0.7352, "step": 380 }, { "epoch": 4.875, "grad_norm": 0.4491554796695709, "learning_rate": 2.2759017277414166e-05, "loss": 0.7569, "step": 390 }, { "epoch": 5.0, "grad_norm": 0.4903900921344757, "learning_rate": 2.164416835455862e-05, "loss": 0.7507, "step": 400 }, { "epoch": 5.0, "eval_loss": 0.763729989528656, "eval_runtime": 158.4134, "eval_samples_per_second": 5.391, "eval_steps_per_second": 1.351, "step": 400 }, { "epoch": 5.125, "grad_norm": 0.4337986707687378, "learning_rate": 2.0536077630034086e-05, "loss": 0.7354, "step": 410 }, { "epoch": 5.25, "grad_norm": 0.4212823212146759, "learning_rate": 1.9436976651092144e-05, "loss": 0.7396, "step": 420 }, { "epoch": 5.375, "grad_norm": 0.482803612947464, "learning_rate": 1.8349078860833123e-05, "loss": 0.7592, "step": 430 }, { "epoch": 5.5, "grad_norm": 0.4545280933380127, "learning_rate": 1.7274575140626318e-05, "loss": 0.744, "step": 440 }, { "epoch": 5.625, "grad_norm": 0.46052929759025574, "learning_rate": 1.621562939796643e-05, "loss": 0.7484, "step": 450 }, { "epoch": 5.75, "grad_norm": 0.4419727027416229, "learning_rate": 1.5174374208651912e-05, "loss": 0.7268, "step": 460 }, { "epoch": 5.875, "grad_norm": 0.48050615191459656, "learning_rate": 1.4152906522061048e-05, "loss": 0.7317, "step": 470 }, { "epoch": 6.0, "grad_norm": 0.46696364879608154, "learning_rate": 1.3153283438175034e-05, "loss": 0.7432, "step": 480 }, { "epoch": 6.125, "grad_norm": 0.5167751908302307, "learning_rate": 1.217751806485235e-05, "loss": 0.7472, "step": 490 }, { "epoch": 6.25, "grad_norm": 0.5040733814239502, "learning_rate": 1.122757546369744e-05, "loss": 0.7327, "step": 500 }, { "epoch": 6.25, "eval_loss": 0.7590272426605225, "eval_runtime": 158.301, "eval_samples_per_second": 5.395, "eval_steps_per_second": 1.352, "step": 500 }, { "epoch": 6.375, "grad_norm": 0.45372575521469116, "learning_rate": 1.0305368692688174e-05, "loss": 0.7301, "step": 510 }, { "epoch": 6.5, "grad_norm": 0.46018218994140625, "learning_rate": 9.412754953531663e-06, "loss": 0.7409, "step": 520 }, { "epoch": 6.625, "grad_norm": 0.4666420817375183, "learning_rate": 8.551531851507186e-06, "loss": 0.7433, "step": 530 }, { "epoch": 6.75, "grad_norm": 0.49834486842155457, "learning_rate": 7.723433775328384e-06, "loss": 0.7326, "step": 540 }, { "epoch": 6.875, "grad_norm": 0.4643003046512604, "learning_rate": 6.930128404315214e-06, "loss": 0.731, "step": 550 }, { "epoch": 7.0, "grad_norm": 0.47693514823913574, "learning_rate": 6.173213349909729e-06, "loss": 0.7306, "step": 560 }, { "epoch": 7.125, "grad_norm": 0.49089315533638, "learning_rate": 5.454212938299255e-06, "loss": 0.7367, "step": 570 }, { "epoch": 7.25, "grad_norm": 0.4702819585800171, "learning_rate": 4.7745751406263165e-06, "loss": 0.7283, "step": 580 }, { "epoch": 7.375, "grad_norm": 0.489766001701355, "learning_rate": 4.135668656967434e-06, "loss": 0.7387, "step": 590 }, { "epoch": 7.5, "grad_norm": 0.45880013704299927, "learning_rate": 3.5387801599533475e-06, "loss": 0.7331, "step": 600 }, { "epoch": 7.5, "eval_loss": 0.756921648979187, "eval_runtime": 158.3438, "eval_samples_per_second": 5.393, "eval_steps_per_second": 1.351, "step": 600 }, { "epoch": 7.625, "grad_norm": 0.47875136137008667, "learning_rate": 2.98511170358155e-06, "loss": 0.7366, "step": 610 }, { "epoch": 7.75, "grad_norm": 0.44256335496902466, "learning_rate": 2.475778302439524e-06, "loss": 0.7419, "step": 620 }, { "epoch": 7.875, "grad_norm": 0.4561857581138611, "learning_rate": 2.0118056862137357e-06, "loss": 0.7216, "step": 630 }, { "epoch": 8.0, "grad_norm": 0.44974443316459656, "learning_rate": 1.59412823400657e-06, "loss": 0.7312, "step": 640 }, { "epoch": 8.125, "grad_norm": 0.44745445251464844, "learning_rate": 1.2235870926211619e-06, "loss": 0.7365, "step": 650 }, { "epoch": 8.25, "grad_norm": 0.4579036235809326, "learning_rate": 9.009284826036691e-07, "loss": 0.7199, "step": 660 }, { "epoch": 8.375, "grad_norm": 0.4414173662662506, "learning_rate": 6.268021954544096e-07, "loss": 0.734, "step": 670 }, { "epoch": 8.5, "grad_norm": 0.4644961953163147, "learning_rate": 4.0176028503425835e-07, "loss": 0.725, "step": 680 }, { "epoch": 8.625, "grad_norm": 0.4445784389972687, "learning_rate": 2.262559558016325e-07, "loss": 0.7288, "step": 690 }, { "epoch": 8.75, "grad_norm": 0.4681429862976074, "learning_rate": 1.006426501190233e-07, "loss": 0.7396, "step": 700 }, { "epoch": 8.75, "eval_loss": 0.7564328908920288, "eval_runtime": 158.3416, "eval_samples_per_second": 5.393, "eval_steps_per_second": 1.352, "step": 700 }, { "epoch": 8.875, "grad_norm": 0.4473038911819458, "learning_rate": 2.5173336467135267e-08, "loss": 0.7469, "step": 710 }, { "epoch": 9.0, "grad_norm": 0.47429242730140686, "learning_rate": 0.0, "loss": 0.7312, "step": 720 }, { "epoch": 9.0, "step": 720, "total_flos": 3.187342666522165e+18, "train_loss": 0.7853862928019629, "train_runtime": 44891.6571, "train_samples_per_second": 1.539, "train_steps_per_second": 0.016 } ], "logging_steps": 10, "max_steps": 720, "num_input_tokens_seen": 0, "num_train_epochs": 9, "save_steps": 100, "total_flos": 3.187342666522165e+18, "train_batch_size": 12, "trial_name": null, "trial_params": null }