gemma-7b-sft-full-longest-1k-v1 / trainer_state.json
lewtun (HF staff) · Model save · commit c7cb8cc (verified)
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 15.0,
  "eval_steps": 30,
  "global_step": 90,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.17,
      "grad_norm": 1600.5392304887378,
      "learning_rate": 1.999390827019096e-05,
      "loss": 10.0226,
      "step": 1
    },
    {
      "epoch": 0.83,
      "grad_norm": 2032.6698503030166,
      "learning_rate": 1.9848077530122083e-05,
      "loss": 8.797,
      "step": 5
    },
    {
      "epoch": 1.67,
      "grad_norm": 213.98723674458682,
      "learning_rate": 1.9396926207859085e-05,
      "loss": 6.3385,
      "step": 10
    },
    {
      "epoch": 2.5,
      "grad_norm": 704.9269625939797,
      "learning_rate": 1.866025403784439e-05,
      "loss": 6.4244,
      "step": 15
    },
    {
      "epoch": 3.33,
      "grad_norm": 415.53646633609685,
      "learning_rate": 1.766044443118978e-05,
      "loss": 4.6473,
      "step": 20
    },
    {
      "epoch": 4.17,
      "grad_norm": 117.74034120947316,
      "learning_rate": 1.6427876096865394e-05,
      "loss": 3.1231,
      "step": 25
    },
    {
      "epoch": 5.0,
      "grad_norm": 35.05307192586602,
      "learning_rate": 1.5000000000000002e-05,
      "loss": 1.9689,
      "step": 30
    },
    {
      "epoch": 5.0,
      "eval_loss": 1.464415192604065,
      "eval_runtime": 54.045,
      "eval_samples_per_second": 18.244,
      "eval_steps_per_second": 0.296,
      "step": 30
    },
    {
      "epoch": 5.83,
      "grad_norm": 17.30445284838731,
      "learning_rate": 1.342020143325669e-05,
      "loss": 1.6624,
      "step": 35
    },
    {
      "epoch": 6.67,
      "grad_norm": 18.489148704869553,
      "learning_rate": 1.1736481776669307e-05,
      "loss": 1.4962,
      "step": 40
    },
    {
      "epoch": 7.5,
      "grad_norm": 11.164738834605398,
      "learning_rate": 1e-05,
      "loss": 1.3667,
      "step": 45
    },
    {
      "epoch": 8.33,
      "grad_norm": 12.450510814489615,
      "learning_rate": 8.263518223330698e-06,
      "loss": 1.2545,
      "step": 50
    },
    {
      "epoch": 9.17,
      "grad_norm": 7.10392485555226,
      "learning_rate": 6.579798566743314e-06,
      "loss": 1.2288,
      "step": 55
    },
    {
      "epoch": 10.0,
      "grad_norm": 7.210838208620658,
      "learning_rate": 5.000000000000003e-06,
      "loss": 1.1624,
      "step": 60
    },
    {
      "epoch": 10.0,
      "eval_loss": 1.0362504720687866,
      "eval_runtime": 46.757,
      "eval_samples_per_second": 21.088,
      "eval_steps_per_second": 0.342,
      "step": 60
    },
    {
      "epoch": 10.83,
      "grad_norm": 5.108017518056073,
      "learning_rate": 3.5721239031346067e-06,
      "loss": 1.1292,
      "step": 65
    },
    {
      "epoch": 11.67,
      "grad_norm": 4.729793063810578,
      "learning_rate": 2.339555568810221e-06,
      "loss": 1.1012,
      "step": 70
    },
    {
      "epoch": 12.5,
      "grad_norm": 4.227960417256771,
      "learning_rate": 1.339745962155613e-06,
      "loss": 1.0784,
      "step": 75
    },
    {
      "epoch": 13.33,
      "grad_norm": 5.236888613480014,
      "learning_rate": 6.030737921409169e-07,
      "loss": 1.0902,
      "step": 80
    },
    {
      "epoch": 14.17,
      "grad_norm": 5.367835987082188,
      "learning_rate": 1.519224698779198e-07,
      "loss": 1.0574,
      "step": 85
    },
    {
      "epoch": 15.0,
      "grad_norm": 5.098249451357161,
      "learning_rate": 0.0,
      "loss": 1.0662,
      "step": 90
    },
    {
      "epoch": 15.0,
      "eval_loss": 1.0137017965316772,
      "eval_runtime": 45.4033,
      "eval_samples_per_second": 21.716,
      "eval_steps_per_second": 0.352,
      "step": 90
    },
    {
      "epoch": 15.0,
      "step": 90,
      "total_flos": 24718476312576.0,
      "train_loss": 2.5687779426574706,
      "train_runtime": 665.8135,
      "train_samples_per_second": 16.649,
      "train_steps_per_second": 0.135
    }
  ],
  "logging_steps": 5,
  "max_steps": 90,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 15,
  "save_steps": 100,
  "total_flos": 24718476312576.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}
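
For reference, a minimal Python sketch for inspecting this trainer state locally. It only relies on the structure shown above (the `log_history` list, where training records carry `loss` and evaluation records carry `eval_loss`); the local file path and the optional matplotlib dependency are assumptions, not part of this repository.

```python
# Minimal sketch: load trainer_state.json and summarize the training run.
# Assumes the file has been downloaded to the current directory.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Training-step records have a "loss" key; evaluation records have "eval_loss".
train_log = [e for e in state["log_history"] if "loss" in e]
eval_log = [e for e in state["log_history"] if "eval_loss" in e]

print(f"global_step={state['global_step']}, epochs={state['epoch']}")
print(f"final training loss: {train_log[-1]['loss']}")
print(f"final eval loss:     {eval_log[-1]['eval_loss']}")

# Optional: plot the training-loss curve (requires matplotlib).
import matplotlib.pyplot as plt

plt.plot([e["step"] for e in train_log], [e["loss"] for e in train_log], marker="o")
plt.xlabel("step")
plt.ylabel("training loss")
plt.title("gemma-7b-sft-full-longest-1k-v1")
plt.show()
```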