llama3-8b_sa_v0.1 / trainer_state.json
{
"best_metric": 0.7564328908920288,
"best_model_checkpoint": "../../output/llama3-8b-sft/LangGPT/checkpoint-700",
"epoch": 9.0,
"eval_steps": 100,
"global_step": 720,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.125,
"grad_norm": 0.33964911103248596,
"learning_rate": 2.5e-05,
"loss": 1.1993,
"step": 10
},
{
"epoch": 0.25,
"grad_norm": 0.4269595146179199,
"learning_rate": 5e-05,
"loss": 1.1546,
"step": 20
},
{
"epoch": 0.375,
"grad_norm": 0.39937543869018555,
"learning_rate": 4.997482666353287e-05,
"loss": 1.0663,
"step": 30
},
{
"epoch": 0.5,
"grad_norm": 0.3308526277542114,
"learning_rate": 4.989935734988098e-05,
"loss": 1.0087,
"step": 40
},
{
"epoch": 0.625,
"grad_norm": 0.2543019950389862,
"learning_rate": 4.977374404419837e-05,
"loss": 0.9476,
"step": 50
},
{
"epoch": 0.75,
"grad_norm": 0.2518838346004486,
"learning_rate": 4.959823971496574e-05,
"loss": 0.9267,
"step": 60
},
{
"epoch": 0.875,
"grad_norm": 0.2807057201862335,
"learning_rate": 4.937319780454559e-05,
"loss": 0.8875,
"step": 70
},
{
"epoch": 1.0,
"grad_norm": 0.25611793994903564,
"learning_rate": 4.909907151739633e-05,
"loss": 0.8747,
"step": 80
},
{
"epoch": 1.125,
"grad_norm": 0.28418758511543274,
"learning_rate": 4.877641290737884e-05,
"loss": 0.8721,
"step": 90
},
{
"epoch": 1.25,
"grad_norm": 0.294350266456604,
"learning_rate": 4.8405871765993433e-05,
"loss": 0.8441,
"step": 100
},
{
"epoch": 1.25,
"eval_loss": 0.8461617231369019,
"eval_runtime": 158.2385,
"eval_samples_per_second": 5.397,
"eval_steps_per_second": 1.352,
"step": 100
},
{
"epoch": 1.375,
"grad_norm": 0.33300408720970154,
"learning_rate": 4.7988194313786275e-05,
"loss": 0.8252,
"step": 110
},
{
"epoch": 1.5,
"grad_norm": 0.29645124077796936,
"learning_rate": 4.752422169756048e-05,
"loss": 0.8232,
"step": 120
},
{
"epoch": 1.625,
"grad_norm": 0.3273533284664154,
"learning_rate": 4.701488829641845e-05,
"loss": 0.8061,
"step": 130
},
{
"epoch": 1.75,
"grad_norm": 0.3246830105781555,
"learning_rate": 4.6461219840046654e-05,
"loss": 0.8023,
"step": 140
},
{
"epoch": 1.875,
"grad_norm": 0.3237927556037903,
"learning_rate": 4.586433134303257e-05,
"loss": 0.8131,
"step": 150
},
{
"epoch": 2.0,
"grad_norm": 0.3896329402923584,
"learning_rate": 4.522542485937369e-05,
"loss": 0.8096,
"step": 160
},
{
"epoch": 2.125,
"grad_norm": 0.4054020345211029,
"learning_rate": 4.454578706170075e-05,
"loss": 0.8003,
"step": 170
},
{
"epoch": 2.25,
"grad_norm": 0.42092710733413696,
"learning_rate": 4.382678665009028e-05,
"loss": 0.7769,
"step": 180
},
{
"epoch": 2.375,
"grad_norm": 0.35680437088012695,
"learning_rate": 4.306987159568479e-05,
"loss": 0.78,
"step": 190
},
{
"epoch": 2.5,
"grad_norm": 0.379029780626297,
"learning_rate": 4.227656622467162e-05,
"loss": 0.7829,
"step": 200
},
{
"epoch": 2.5,
"eval_loss": 0.7941725254058838,
"eval_runtime": 158.4268,
"eval_samples_per_second": 5.391,
"eval_steps_per_second": 1.351,
"step": 200
},
{
"epoch": 2.625,
"grad_norm": 0.44896718859672546,
"learning_rate": 4.144846814849282e-05,
"loss": 0.7874,
"step": 210
},
{
"epoch": 2.75,
"grad_norm": 0.40545764565467834,
"learning_rate": 4.058724504646834e-05,
"loss": 0.7787,
"step": 220
},
{
"epoch": 2.875,
"grad_norm": 0.37240272760391235,
"learning_rate": 3.969463130731183e-05,
"loss": 0.7789,
"step": 230
},
{
"epoch": 3.0,
"grad_norm": 0.4221615493297577,
"learning_rate": 3.8772424536302564e-05,
"loss": 0.77,
"step": 240
},
{
"epoch": 3.125,
"grad_norm": 0.38211557269096375,
"learning_rate": 3.782248193514766e-05,
"loss": 0.7775,
"step": 250
},
{
"epoch": 3.25,
"grad_norm": 0.44149255752563477,
"learning_rate": 3.6846716561824965e-05,
"loss": 0.7697,
"step": 260
},
{
"epoch": 3.375,
"grad_norm": 0.41183075308799744,
"learning_rate": 3.5847093477938956e-05,
"loss": 0.7688,
"step": 270
},
{
"epoch": 3.5,
"grad_norm": 0.4264132082462311,
"learning_rate": 3.4825625791348096e-05,
"loss": 0.7503,
"step": 280
},
{
"epoch": 3.625,
"grad_norm": 0.43148595094680786,
"learning_rate": 3.378437060203357e-05,
"loss": 0.774,
"step": 290
},
{
"epoch": 3.75,
"grad_norm": 0.42241978645324707,
"learning_rate": 3.272542485937369e-05,
"loss": 0.747,
"step": 300
},
{
"epoch": 3.75,
"eval_loss": 0.7737195491790771,
"eval_runtime": 158.4242,
"eval_samples_per_second": 5.391,
"eval_steps_per_second": 1.351,
"step": 300
},
{
"epoch": 3.875,
"grad_norm": 0.4389643371105194,
"learning_rate": 3.165092113916688e-05,
"loss": 0.7535,
"step": 310
},
{
"epoch": 4.0,
"grad_norm": 0.4277195930480957,
"learning_rate": 3.056302334890786e-05,
"loss": 0.7539,
"step": 320
},
{
"epoch": 4.125,
"grad_norm": 0.39998236298561096,
"learning_rate": 2.9463922369965917e-05,
"loss": 0.7722,
"step": 330
},
{
"epoch": 4.25,
"grad_norm": 0.42156070470809937,
"learning_rate": 2.8355831645441388e-05,
"loss": 0.7584,
"step": 340
},
{
"epoch": 4.375,
"grad_norm": 0.47488293051719666,
"learning_rate": 2.724098272258584e-05,
"loss": 0.7432,
"step": 350
},
{
"epoch": 4.5,
"grad_norm": 0.4431322515010834,
"learning_rate": 2.6121620758762877e-05,
"loss": 0.7454,
"step": 360
},
{
"epoch": 4.625,
"grad_norm": 0.5012672543525696,
"learning_rate": 2.5e-05,
"loss": 0.7285,
"step": 370
},
{
"epoch": 4.75,
"grad_norm": 0.4605712592601776,
"learning_rate": 2.3878379241237136e-05,
"loss": 0.7352,
"step": 380
},
{
"epoch": 4.875,
"grad_norm": 0.4491554796695709,
"learning_rate": 2.2759017277414166e-05,
"loss": 0.7569,
"step": 390
},
{
"epoch": 5.0,
"grad_norm": 0.4903900921344757,
"learning_rate": 2.164416835455862e-05,
"loss": 0.7507,
"step": 400
},
{
"epoch": 5.0,
"eval_loss": 0.763729989528656,
"eval_runtime": 158.4134,
"eval_samples_per_second": 5.391,
"eval_steps_per_second": 1.351,
"step": 400
},
{
"epoch": 5.125,
"grad_norm": 0.4337986707687378,
"learning_rate": 2.0536077630034086e-05,
"loss": 0.7354,
"step": 410
},
{
"epoch": 5.25,
"grad_norm": 0.4212823212146759,
"learning_rate": 1.9436976651092144e-05,
"loss": 0.7396,
"step": 420
},
{
"epoch": 5.375,
"grad_norm": 0.482803612947464,
"learning_rate": 1.8349078860833123e-05,
"loss": 0.7592,
"step": 430
},
{
"epoch": 5.5,
"grad_norm": 0.4545280933380127,
"learning_rate": 1.7274575140626318e-05,
"loss": 0.744,
"step": 440
},
{
"epoch": 5.625,
"grad_norm": 0.46052929759025574,
"learning_rate": 1.621562939796643e-05,
"loss": 0.7484,
"step": 450
},
{
"epoch": 5.75,
"grad_norm": 0.4419727027416229,
"learning_rate": 1.5174374208651912e-05,
"loss": 0.7268,
"step": 460
},
{
"epoch": 5.875,
"grad_norm": 0.48050615191459656,
"learning_rate": 1.4152906522061048e-05,
"loss": 0.7317,
"step": 470
},
{
"epoch": 6.0,
"grad_norm": 0.46696364879608154,
"learning_rate": 1.3153283438175034e-05,
"loss": 0.7432,
"step": 480
},
{
"epoch": 6.125,
"grad_norm": 0.5167751908302307,
"learning_rate": 1.217751806485235e-05,
"loss": 0.7472,
"step": 490
},
{
"epoch": 6.25,
"grad_norm": 0.5040733814239502,
"learning_rate": 1.122757546369744e-05,
"loss": 0.7327,
"step": 500
},
{
"epoch": 6.25,
"eval_loss": 0.7590272426605225,
"eval_runtime": 158.301,
"eval_samples_per_second": 5.395,
"eval_steps_per_second": 1.352,
"step": 500
},
{
"epoch": 6.375,
"grad_norm": 0.45372575521469116,
"learning_rate": 1.0305368692688174e-05,
"loss": 0.7301,
"step": 510
},
{
"epoch": 6.5,
"grad_norm": 0.46018218994140625,
"learning_rate": 9.412754953531663e-06,
"loss": 0.7409,
"step": 520
},
{
"epoch": 6.625,
"grad_norm": 0.4666420817375183,
"learning_rate": 8.551531851507186e-06,
"loss": 0.7433,
"step": 530
},
{
"epoch": 6.75,
"grad_norm": 0.49834486842155457,
"learning_rate": 7.723433775328384e-06,
"loss": 0.7326,
"step": 540
},
{
"epoch": 6.875,
"grad_norm": 0.4643003046512604,
"learning_rate": 6.930128404315214e-06,
"loss": 0.731,
"step": 550
},
{
"epoch": 7.0,
"grad_norm": 0.47693514823913574,
"learning_rate": 6.173213349909729e-06,
"loss": 0.7306,
"step": 560
},
{
"epoch": 7.125,
"grad_norm": 0.49089315533638,
"learning_rate": 5.454212938299255e-06,
"loss": 0.7367,
"step": 570
},
{
"epoch": 7.25,
"grad_norm": 0.4702819585800171,
"learning_rate": 4.7745751406263165e-06,
"loss": 0.7283,
"step": 580
},
{
"epoch": 7.375,
"grad_norm": 0.489766001701355,
"learning_rate": 4.135668656967434e-06,
"loss": 0.7387,
"step": 590
},
{
"epoch": 7.5,
"grad_norm": 0.45880013704299927,
"learning_rate": 3.5387801599533475e-06,
"loss": 0.7331,
"step": 600
},
{
"epoch": 7.5,
"eval_loss": 0.756921648979187,
"eval_runtime": 158.3438,
"eval_samples_per_second": 5.393,
"eval_steps_per_second": 1.351,
"step": 600
},
{
"epoch": 7.625,
"grad_norm": 0.47875136137008667,
"learning_rate": 2.98511170358155e-06,
"loss": 0.7366,
"step": 610
},
{
"epoch": 7.75,
"grad_norm": 0.44256335496902466,
"learning_rate": 2.475778302439524e-06,
"loss": 0.7419,
"step": 620
},
{
"epoch": 7.875,
"grad_norm": 0.4561857581138611,
"learning_rate": 2.0118056862137357e-06,
"loss": 0.7216,
"step": 630
},
{
"epoch": 8.0,
"grad_norm": 0.44974443316459656,
"learning_rate": 1.59412823400657e-06,
"loss": 0.7312,
"step": 640
},
{
"epoch": 8.125,
"grad_norm": 0.44745445251464844,
"learning_rate": 1.2235870926211619e-06,
"loss": 0.7365,
"step": 650
},
{
"epoch": 8.25,
"grad_norm": 0.4579036235809326,
"learning_rate": 9.009284826036691e-07,
"loss": 0.7199,
"step": 660
},
{
"epoch": 8.375,
"grad_norm": 0.4414173662662506,
"learning_rate": 6.268021954544096e-07,
"loss": 0.734,
"step": 670
},
{
"epoch": 8.5,
"grad_norm": 0.4644961953163147,
"learning_rate": 4.0176028503425835e-07,
"loss": 0.725,
"step": 680
},
{
"epoch": 8.625,
"grad_norm": 0.4445784389972687,
"learning_rate": 2.262559558016325e-07,
"loss": 0.7288,
"step": 690
},
{
"epoch": 8.75,
"grad_norm": 0.4681429862976074,
"learning_rate": 1.006426501190233e-07,
"loss": 0.7396,
"step": 700
},
{
"epoch": 8.75,
"eval_loss": 0.7564328908920288,
"eval_runtime": 158.3416,
"eval_samples_per_second": 5.393,
"eval_steps_per_second": 1.352,
"step": 700
},
{
"epoch": 8.875,
"grad_norm": 0.4473038911819458,
"learning_rate": 2.5173336467135267e-08,
"loss": 0.7469,
"step": 710
},
{
"epoch": 9.0,
"grad_norm": 0.47429242730140686,
"learning_rate": 0.0,
"loss": 0.7312,
"step": 720
},
{
"epoch": 9.0,
"step": 720,
"total_flos": 3.187342666522165e+18,
"train_loss": 0.7853862928019629,
"train_runtime": 44891.6571,
"train_samples_per_second": 1.539,
"train_steps_per_second": 0.016
}
],
"logging_steps": 10,
"max_steps": 720,
"num_input_tokens_seen": 0,
"num_train_epochs": 9,
"save_steps": 100,
"total_flos": 3.187342666522165e+18,
"train_batch_size": 12,
"trial_name": null,
"trial_params": null
}