|
{ |
|
"best_metric": 0.7564328908920288, |
|
"best_model_checkpoint": "../../output/llama3-8b-sft/LangGPT/checkpoint-700", |
|
"epoch": 9.0, |
|
"eval_steps": 100, |
|
"global_step": 720, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.125, |
|
"grad_norm": 0.33964911103248596, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.1993, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.4269595146179199, |
|
"learning_rate": 5e-05, |
|
"loss": 1.1546, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.375, |
|
"grad_norm": 0.39937543869018555, |
|
"learning_rate": 4.997482666353287e-05, |
|
"loss": 1.0663, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.3308526277542114, |
|
"learning_rate": 4.989935734988098e-05, |
|
"loss": 1.0087, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"grad_norm": 0.2543019950389862, |
|
"learning_rate": 4.977374404419837e-05, |
|
"loss": 0.9476, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.2518838346004486, |
|
"learning_rate": 4.959823971496574e-05, |
|
"loss": 0.9267, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.875, |
|
"grad_norm": 0.2807057201862335, |
|
"learning_rate": 4.937319780454559e-05, |
|
"loss": 0.8875, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.25611793994903564, |
|
"learning_rate": 4.909907151739633e-05, |
|
"loss": 0.8747, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.125, |
|
"grad_norm": 0.28418758511543274, |
|
"learning_rate": 4.877641290737884e-05, |
|
"loss": 0.8721, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 0.294350266456604, |
|
"learning_rate": 4.8405871765993433e-05, |
|
"loss": 0.8441, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"eval_loss": 0.8461617231369019, |
|
"eval_runtime": 158.2385, |
|
"eval_samples_per_second": 5.397, |
|
"eval_steps_per_second": 1.352, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.375, |
|
"grad_norm": 0.33300408720970154, |
|
"learning_rate": 4.7988194313786275e-05, |
|
"loss": 0.8252, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.29645124077796936, |
|
"learning_rate": 4.752422169756048e-05, |
|
"loss": 0.8232, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.625, |
|
"grad_norm": 0.3273533284664154, |
|
"learning_rate": 4.701488829641845e-05, |
|
"loss": 0.8061, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 0.3246830105781555, |
|
"learning_rate": 4.6461219840046654e-05, |
|
"loss": 0.8023, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.875, |
|
"grad_norm": 0.3237927556037903, |
|
"learning_rate": 4.586433134303257e-05, |
|
"loss": 0.8131, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.3896329402923584, |
|
"learning_rate": 4.522542485937369e-05, |
|
"loss": 0.8096, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 2.125, |
|
"grad_norm": 0.4054020345211029, |
|
"learning_rate": 4.454578706170075e-05, |
|
"loss": 0.8003, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 0.42092710733413696, |
|
"learning_rate": 4.382678665009028e-05, |
|
"loss": 0.7769, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 2.375, |
|
"grad_norm": 0.35680437088012695, |
|
"learning_rate": 4.306987159568479e-05, |
|
"loss": 0.78, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 0.379029780626297, |
|
"learning_rate": 4.227656622467162e-05, |
|
"loss": 0.7829, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"eval_loss": 0.7941725254058838, |
|
"eval_runtime": 158.4268, |
|
"eval_samples_per_second": 5.391, |
|
"eval_steps_per_second": 1.351, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 2.625, |
|
"grad_norm": 0.44896718859672546, |
|
"learning_rate": 4.144846814849282e-05, |
|
"loss": 0.7874, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 0.40545764565467834, |
|
"learning_rate": 4.058724504646834e-05, |
|
"loss": 0.7787, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 2.875, |
|
"grad_norm": 0.37240272760391235, |
|
"learning_rate": 3.969463130731183e-05, |
|
"loss": 0.7789, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.4221615493297577, |
|
"learning_rate": 3.8772424536302564e-05, |
|
"loss": 0.77, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 3.125, |
|
"grad_norm": 0.38211557269096375, |
|
"learning_rate": 3.782248193514766e-05, |
|
"loss": 0.7775, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"grad_norm": 0.44149255752563477, |
|
"learning_rate": 3.6846716561824965e-05, |
|
"loss": 0.7697, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 3.375, |
|
"grad_norm": 0.41183075308799744, |
|
"learning_rate": 3.5847093477938956e-05, |
|
"loss": 0.7688, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 0.4264132082462311, |
|
"learning_rate": 3.4825625791348096e-05, |
|
"loss": 0.7503, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 3.625, |
|
"grad_norm": 0.43148595094680786, |
|
"learning_rate": 3.378437060203357e-05, |
|
"loss": 0.774, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 0.42241978645324707, |
|
"learning_rate": 3.272542485937369e-05, |
|
"loss": 0.747, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"eval_loss": 0.7737195491790771, |
|
"eval_runtime": 158.4242, |
|
"eval_samples_per_second": 5.391, |
|
"eval_steps_per_second": 1.351, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 3.875, |
|
"grad_norm": 0.4389643371105194, |
|
"learning_rate": 3.165092113916688e-05, |
|
"loss": 0.7535, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.4277195930480957, |
|
"learning_rate": 3.056302334890786e-05, |
|
"loss": 0.7539, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 4.125, |
|
"grad_norm": 0.39998236298561096, |
|
"learning_rate": 2.9463922369965917e-05, |
|
"loss": 0.7722, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 4.25, |
|
"grad_norm": 0.42156070470809937, |
|
"learning_rate": 2.8355831645441388e-05, |
|
"loss": 0.7584, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 4.375, |
|
"grad_norm": 0.47488293051719666, |
|
"learning_rate": 2.724098272258584e-05, |
|
"loss": 0.7432, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 4.5, |
|
"grad_norm": 0.4431322515010834, |
|
"learning_rate": 2.6121620758762877e-05, |
|
"loss": 0.7454, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 4.625, |
|
"grad_norm": 0.5012672543525696, |
|
"learning_rate": 2.5e-05, |
|
"loss": 0.7285, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 4.75, |
|
"grad_norm": 0.4605712592601776, |
|
"learning_rate": 2.3878379241237136e-05, |
|
"loss": 0.7352, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 4.875, |
|
"grad_norm": 0.4491554796695709, |
|
"learning_rate": 2.2759017277414166e-05, |
|
"loss": 0.7569, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.4903900921344757, |
|
"learning_rate": 2.164416835455862e-05, |
|
"loss": 0.7507, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": 0.763729989528656, |
|
"eval_runtime": 158.4134, |
|
"eval_samples_per_second": 5.391, |
|
"eval_steps_per_second": 1.351, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 5.125, |
|
"grad_norm": 0.4337986707687378, |
|
"learning_rate": 2.0536077630034086e-05, |
|
"loss": 0.7354, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 5.25, |
|
"grad_norm": 0.4212823212146759, |
|
"learning_rate": 1.9436976651092144e-05, |
|
"loss": 0.7396, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 5.375, |
|
"grad_norm": 0.482803612947464, |
|
"learning_rate": 1.8349078860833123e-05, |
|
"loss": 0.7592, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 5.5, |
|
"grad_norm": 0.4545280933380127, |
|
"learning_rate": 1.7274575140626318e-05, |
|
"loss": 0.744, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 5.625, |
|
"grad_norm": 0.46052929759025574, |
|
"learning_rate": 1.621562939796643e-05, |
|
"loss": 0.7484, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 5.75, |
|
"grad_norm": 0.4419727027416229, |
|
"learning_rate": 1.5174374208651912e-05, |
|
"loss": 0.7268, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 5.875, |
|
"grad_norm": 0.48050615191459656, |
|
"learning_rate": 1.4152906522061048e-05, |
|
"loss": 0.7317, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 0.46696364879608154, |
|
"learning_rate": 1.3153283438175034e-05, |
|
"loss": 0.7432, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 6.125, |
|
"grad_norm": 0.5167751908302307, |
|
"learning_rate": 1.217751806485235e-05, |
|
"loss": 0.7472, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 6.25, |
|
"grad_norm": 0.5040733814239502, |
|
"learning_rate": 1.122757546369744e-05, |
|
"loss": 0.7327, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 6.25, |
|
"eval_loss": 0.7590272426605225, |
|
"eval_runtime": 158.301, |
|
"eval_samples_per_second": 5.395, |
|
"eval_steps_per_second": 1.352, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 6.375, |
|
"grad_norm": 0.45372575521469116, |
|
"learning_rate": 1.0305368692688174e-05, |
|
"loss": 0.7301, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 6.5, |
|
"grad_norm": 0.46018218994140625, |
|
"learning_rate": 9.412754953531663e-06, |
|
"loss": 0.7409, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 6.625, |
|
"grad_norm": 0.4666420817375183, |
|
"learning_rate": 8.551531851507186e-06, |
|
"loss": 0.7433, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 6.75, |
|
"grad_norm": 0.49834486842155457, |
|
"learning_rate": 7.723433775328384e-06, |
|
"loss": 0.7326, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 6.875, |
|
"grad_norm": 0.4643003046512604, |
|
"learning_rate": 6.930128404315214e-06, |
|
"loss": 0.731, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"grad_norm": 0.47693514823913574, |
|
"learning_rate": 6.173213349909729e-06, |
|
"loss": 0.7306, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 7.125, |
|
"grad_norm": 0.49089315533638, |
|
"learning_rate": 5.454212938299255e-06, |
|
"loss": 0.7367, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 7.25, |
|
"grad_norm": 0.4702819585800171, |
|
"learning_rate": 4.7745751406263165e-06, |
|
"loss": 0.7283, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 7.375, |
|
"grad_norm": 0.489766001701355, |
|
"learning_rate": 4.135668656967434e-06, |
|
"loss": 0.7387, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 7.5, |
|
"grad_norm": 0.45880013704299927, |
|
"learning_rate": 3.5387801599533475e-06, |
|
"loss": 0.7331, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 7.5, |
|
"eval_loss": 0.756921648979187, |
|
"eval_runtime": 158.3438, |
|
"eval_samples_per_second": 5.393, |
|
"eval_steps_per_second": 1.351, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 7.625, |
|
"grad_norm": 0.47875136137008667, |
|
"learning_rate": 2.98511170358155e-06, |
|
"loss": 0.7366, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 7.75, |
|
"grad_norm": 0.44256335496902466, |
|
"learning_rate": 2.475778302439524e-06, |
|
"loss": 0.7419, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 7.875, |
|
"grad_norm": 0.4561857581138611, |
|
"learning_rate": 2.0118056862137357e-06, |
|
"loss": 0.7216, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 0.44974443316459656, |
|
"learning_rate": 1.59412823400657e-06, |
|
"loss": 0.7312, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 8.125, |
|
"grad_norm": 0.44745445251464844, |
|
"learning_rate": 1.2235870926211619e-06, |
|
"loss": 0.7365, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 8.25, |
|
"grad_norm": 0.4579036235809326, |
|
"learning_rate": 9.009284826036691e-07, |
|
"loss": 0.7199, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 8.375, |
|
"grad_norm": 0.4414173662662506, |
|
"learning_rate": 6.268021954544096e-07, |
|
"loss": 0.734, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 8.5, |
|
"grad_norm": 0.4644961953163147, |
|
"learning_rate": 4.0176028503425835e-07, |
|
"loss": 0.725, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 8.625, |
|
"grad_norm": 0.4445784389972687, |
|
"learning_rate": 2.262559558016325e-07, |
|
"loss": 0.7288, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 8.75, |
|
"grad_norm": 0.4681429862976074, |
|
"learning_rate": 1.006426501190233e-07, |
|
"loss": 0.7396, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 8.75, |
|
"eval_loss": 0.7564328908920288, |
|
"eval_runtime": 158.3416, |
|
"eval_samples_per_second": 5.393, |
|
"eval_steps_per_second": 1.352, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 8.875, |
|
"grad_norm": 0.4473038911819458, |
|
"learning_rate": 2.5173336467135267e-08, |
|
"loss": 0.7469, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"grad_norm": 0.47429242730140686, |
|
"learning_rate": 0.0, |
|
"loss": 0.7312, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"step": 720, |
|
"total_flos": 3.187342666522165e+18, |
|
"train_loss": 0.7853862928019629, |
|
"train_runtime": 44891.6571, |
|
"train_samples_per_second": 1.539, |
|
"train_steps_per_second": 0.016 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 720, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 9, |
|
"save_steps": 100, |
|
"total_flos": 3.187342666522165e+18, |
|
"train_batch_size": 12, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|