{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 18512,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.000108038029386344,
"grad_norm": null,
"learning_rate": 1e-06,
"loss": 12.3298,
"step": 1
},
{
"epoch": 0.021607605877268798,
"grad_norm": 4.446423053741455,
"learning_rate": 1e-06,
"loss": 10.8951,
"step": 200
},
{
"epoch": 0.043215211754537596,
"grad_norm": 6.528625965118408,
"learning_rate": 1e-06,
"loss": 8.5315,
"step": 400
},
{
"epoch": 0.06482281763180639,
"grad_norm": 2.4619638919830322,
"learning_rate": 1e-06,
"loss": 5.6715,
"step": 600
},
{
"epoch": 0.08643042350907519,
"grad_norm": 0.8605937361717224,
"learning_rate": 1e-06,
"loss": 4.7588,
"step": 800
},
{
"epoch": 0.10803802938634399,
"grad_norm": 1.1726713180541992,
"learning_rate": 1e-06,
"loss": 4.4643,
"step": 1000
},
{
"epoch": 0.12964563526361278,
"grad_norm": 1.2112282514572144,
"learning_rate": 1e-06,
"loss": 4.3869,
"step": 1200
},
{
"epoch": 0.1512532411408816,
"grad_norm": 2.6818201541900635,
"learning_rate": 1e-06,
"loss": 4.2508,
"step": 1400
},
{
"epoch": 0.17286084701815038,
"grad_norm": 1.3040348291397095,
"learning_rate": 1e-06,
"loss": 4.2472,
"step": 1600
},
{
"epoch": 0.1944684528954192,
"grad_norm": 0.8280265927314758,
"learning_rate": 1e-06,
"loss": 4.1989,
"step": 1800
},
{
"epoch": 0.21607605877268798,
"grad_norm": 0.9923717975616455,
"learning_rate": 1e-06,
"loss": 4.2074,
"step": 2000
},
{
"epoch": 0.23768366464995677,
"grad_norm": 0.9223962426185608,
"learning_rate": 1e-06,
"loss": 4.1729,
"step": 2200
},
{
"epoch": 0.25929127052722556,
"grad_norm": 1.61160409450531,
"learning_rate": 1e-06,
"loss": 4.109,
"step": 2400
},
{
"epoch": 0.2808988764044944,
"grad_norm": 0.8436377644538879,
"learning_rate": 1e-06,
"loss": 4.0602,
"step": 2600
},
{
"epoch": 0.3025064822817632,
"grad_norm": 1.9130096435546875,
"learning_rate": 1e-06,
"loss": 4.2987,
"step": 2800
},
{
"epoch": 0.324114088159032,
"grad_norm": 0.8813604116439819,
"learning_rate": 1e-06,
"loss": 4.1198,
"step": 3000
},
{
"epoch": 0.34572169403630076,
"grad_norm": 1.1001173257827759,
"learning_rate": 1e-06,
"loss": 4.1774,
"step": 3200
},
{
"epoch": 0.36732929991356955,
"grad_norm": 2.015709638595581,
"learning_rate": 1e-06,
"loss": 4.214,
"step": 3400
},
{
"epoch": 0.3889369057908384,
"grad_norm": 1.2437554597854614,
"learning_rate": 1e-06,
"loss": 4.0703,
"step": 3600
},
{
"epoch": 0.4105445116681072,
"grad_norm": 2.0216290950775146,
"learning_rate": 1e-06,
"loss": 4.0848,
"step": 3800
},
{
"epoch": 0.43215211754537597,
"grad_norm": 1.873824954032898,
"learning_rate": 1e-06,
"loss": 4.0322,
"step": 4000
},
{
"epoch": 0.45375972342264476,
"grad_norm": 1.8265173435211182,
"learning_rate": 1e-06,
"loss": 4.1039,
"step": 4200
},
{
"epoch": 0.47536732929991354,
"grad_norm": 0.9985150098800659,
"learning_rate": 1e-06,
"loss": 4.1367,
"step": 4400
},
{
"epoch": 0.4969749351771824,
"grad_norm": 1.2112445831298828,
"learning_rate": 1e-06,
"loss": 4.1724,
"step": 4600
},
{
"epoch": 0.5185825410544511,
"grad_norm": 1.2440255880355835,
"learning_rate": 1e-06,
"loss": 4.1264,
"step": 4800
},
{
"epoch": 0.54019014693172,
"grad_norm": 2.387599468231201,
"learning_rate": 1e-06,
"loss": 4.0228,
"step": 5000
},
{
"epoch": 0.5617977528089888,
"grad_norm": 1.3351703882217407,
"learning_rate": 1e-06,
"loss": 4.0719,
"step": 5200
},
{
"epoch": 0.5834053586862575,
"grad_norm": 1.3799022436141968,
"learning_rate": 1e-06,
"loss": 4.0888,
"step": 5400
},
{
"epoch": 0.6050129645635264,
"grad_norm": 0.9727766513824463,
"learning_rate": 1e-06,
"loss": 3.9904,
"step": 5600
},
{
"epoch": 0.6266205704407951,
"grad_norm": 1.5948766469955444,
"learning_rate": 1e-06,
"loss": 4.0368,
"step": 5800
},
{
"epoch": 0.648228176318064,
"grad_norm": 0.8475677967071533,
"learning_rate": 1e-06,
"loss": 4.1044,
"step": 6000
},
{
"epoch": 0.6698357821953328,
"grad_norm": 2.8822379112243652,
"learning_rate": 1e-06,
"loss": 3.9987,
"step": 6200
},
{
"epoch": 0.6914433880726015,
"grad_norm": 2.800276517868042,
"learning_rate": 1e-06,
"loss": 4.0029,
"step": 6400
},
{
"epoch": 0.7130509939498704,
"grad_norm": 1.2083728313446045,
"learning_rate": 1e-06,
"loss": 4.0523,
"step": 6600
},
{
"epoch": 0.7346585998271391,
"grad_norm": 1.1583646535873413,
"learning_rate": 1e-06,
"loss": 3.9988,
"step": 6800
},
{
"epoch": 0.756266205704408,
"grad_norm": 1.3725389242172241,
"learning_rate": 1e-06,
"loss": 4.0924,
"step": 7000
},
{
"epoch": 0.7778738115816768,
"grad_norm": 0.9432379603385925,
"learning_rate": 1e-06,
"loss": 3.9962,
"step": 7200
},
{
"epoch": 0.7994814174589455,
"grad_norm": 1.1160651445388794,
"learning_rate": 1e-06,
"loss": 3.9588,
"step": 7400
},
{
"epoch": 0.8210890233362144,
"grad_norm": 0.8223551511764526,
"learning_rate": 1e-06,
"loss": 4.0476,
"step": 7600
},
{
"epoch": 0.8426966292134831,
"grad_norm": 1.2660095691680908,
"learning_rate": 1e-06,
"loss": 4.0649,
"step": 7800
},
{
"epoch": 0.8643042350907519,
"grad_norm": 1.3660492897033691,
"learning_rate": 1e-06,
"loss": 4.037,
"step": 8000
},
{
"epoch": 0.8859118409680208,
"grad_norm": 1.07527494430542,
"learning_rate": 1e-06,
"loss": 4.039,
"step": 8200
},
{
"epoch": 0.9075194468452895,
"grad_norm": 1.5177347660064697,
"learning_rate": 1e-06,
"loss": 3.9457,
"step": 8400
},
{
"epoch": 0.9291270527225584,
"grad_norm": 1.4206831455230713,
"learning_rate": 1e-06,
"loss": 4.0433,
"step": 8600
},
{
"epoch": 0.9507346585998271,
"grad_norm": 1.1375796794891357,
"learning_rate": 1e-06,
"loss": 4.0525,
"step": 8800
},
{
"epoch": 0.9723422644770959,
"grad_norm": 0.930182695388794,
"learning_rate": 1e-06,
"loss": 4.1834,
"step": 9000
},
{
"epoch": 0.9939498703543648,
"grad_norm": 0.952041506767273,
"learning_rate": 1e-06,
"loss": 4.0379,
"step": 9200
},
{
"epoch": 1.0155574762316335,
"grad_norm": 1.0867968797683716,
"learning_rate": 1e-06,
"loss": 4.0694,
"step": 9400
},
{
"epoch": 1.0371650821089022,
"grad_norm": 1.4420335292816162,
"learning_rate": 1e-06,
"loss": 4.0624,
"step": 9600
},
{
"epoch": 1.0587726879861712,
"grad_norm": 2.323235273361206,
"learning_rate": 1e-06,
"loss": 4.0579,
"step": 9800
},
{
"epoch": 1.08038029386344,
"grad_norm": 1.2302740812301636,
"learning_rate": 1e-06,
"loss": 3.9257,
"step": 10000
},
{
"epoch": 1.1019878997407087,
"grad_norm": 1.3144605159759521,
"learning_rate": 1e-06,
"loss": 4.1395,
"step": 10200
},
{
"epoch": 1.1235955056179776,
"grad_norm": 1.1223151683807373,
"learning_rate": 1e-06,
"loss": 4.1545,
"step": 10400
},
{
"epoch": 1.1452031114952463,
"grad_norm": 1.8404920101165771,
"learning_rate": 1e-06,
"loss": 4.0319,
"step": 10600
},
{
"epoch": 1.166810717372515,
"grad_norm": 1.0687180757522583,
"learning_rate": 1e-06,
"loss": 4.032,
"step": 10800
},
{
"epoch": 1.1884183232497838,
"grad_norm": 1.3874478340148926,
"learning_rate": 1e-06,
"loss": 3.9799,
"step": 11000
},
{
"epoch": 1.2100259291270528,
"grad_norm": 1.4990215301513672,
"learning_rate": 1e-06,
"loss": 4.054,
"step": 11200
},
{
"epoch": 1.2316335350043215,
"grad_norm": 1.4241687059402466,
"learning_rate": 1e-06,
"loss": 4.0389,
"step": 11400
},
{
"epoch": 1.2532411408815904,
"grad_norm": 1.4653874635696411,
"learning_rate": 1e-06,
"loss": 3.9616,
"step": 11600
},
{
"epoch": 1.2748487467588592,
"grad_norm": 1.548032522201538,
"learning_rate": 1e-06,
"loss": 3.9782,
"step": 11800
},
{
"epoch": 1.296456352636128,
"grad_norm": 1.1580474376678467,
"learning_rate": 1e-06,
"loss": 4.14,
"step": 12000
},
{
"epoch": 1.3180639585133966,
"grad_norm": 0.9656199216842651,
"learning_rate": 1e-06,
"loss": 4.0842,
"step": 12200
},
{
"epoch": 1.3396715643906656,
"grad_norm": 1.5421985387802124,
"learning_rate": 1e-06,
"loss": 3.9982,
"step": 12400
},
{
"epoch": 1.3612791702679343,
"grad_norm": 1.0267407894134521,
"learning_rate": 1e-06,
"loss": 4.1628,
"step": 12600
},
{
"epoch": 1.382886776145203,
"grad_norm": 12.301884651184082,
"learning_rate": 1e-06,
"loss": 4.0779,
"step": 12800
},
{
"epoch": 1.404494382022472,
"grad_norm": 2.006171941757202,
"learning_rate": 1e-06,
"loss": 3.9639,
"step": 13000
},
{
"epoch": 1.4261019878997407,
"grad_norm": 1.1518200635910034,
"learning_rate": 1e-06,
"loss": 3.9573,
"step": 13200
},
{
"epoch": 1.4477095937770095,
"grad_norm": 1.313721776008606,
"learning_rate": 1e-06,
"loss": 4.0322,
"step": 13400
},
{
"epoch": 1.4693171996542782,
"grad_norm": 1.4003313779830933,
"learning_rate": 1e-06,
"loss": 4.0324,
"step": 13600
},
{
"epoch": 1.4909248055315472,
"grad_norm": 1.6102566719055176,
"learning_rate": 1e-06,
"loss": 4.1216,
"step": 13800
},
{
"epoch": 1.512532411408816,
"grad_norm": 1.2825450897216797,
"learning_rate": 1e-06,
"loss": 4.0533,
"step": 14000
},
{
"epoch": 1.5341400172860848,
"grad_norm": 1.838724136352539,
"learning_rate": 1e-06,
"loss": 3.9472,
"step": 14200
},
{
"epoch": 1.5557476231633536,
"grad_norm": 1.0912243127822876,
"learning_rate": 1e-06,
"loss": 4.032,
"step": 14400
},
{
"epoch": 1.5773552290406223,
"grad_norm": 1.6895837783813477,
"learning_rate": 1e-06,
"loss": 4.0008,
"step": 14600
},
{
"epoch": 1.598962834917891,
"grad_norm": 1.318581461906433,
"learning_rate": 1e-06,
"loss": 3.9793,
"step": 14800
},
{
"epoch": 1.6205704407951598,
"grad_norm": 1.5134785175323486,
"learning_rate": 1e-06,
"loss": 3.8919,
"step": 15000
},
{
"epoch": 1.6421780466724287,
"grad_norm": 1.3244537115097046,
"learning_rate": 1e-06,
"loss": 4.0801,
"step": 15200
},
{
"epoch": 1.6637856525496975,
"grad_norm": 1.1500438451766968,
"learning_rate": 1e-06,
"loss": 3.9891,
"step": 15400
},
{
"epoch": 1.6853932584269664,
"grad_norm": 1.626980185508728,
"learning_rate": 1e-06,
"loss": 3.9193,
"step": 15600
},
{
"epoch": 1.7070008643042351,
"grad_norm": 1.4879544973373413,
"learning_rate": 1e-06,
"loss": 4.0747,
"step": 15800
},
{
"epoch": 1.7286084701815039,
"grad_norm": 1.372431755065918,
"learning_rate": 1e-06,
"loss": 3.8365,
"step": 16000
},
{
"epoch": 1.7502160760587726,
"grad_norm": 1.2123242616653442,
"learning_rate": 1e-06,
"loss": 4.0404,
"step": 16200
},
{
"epoch": 1.7718236819360413,
"grad_norm": 1.8157856464385986,
"learning_rate": 1e-06,
"loss": 4.0526,
"step": 16400
},
{
"epoch": 1.7934312878133103,
"grad_norm": 2.1040592193603516,
"learning_rate": 1e-06,
"loss": 3.922,
"step": 16600
},
{
"epoch": 1.8150388936905792,
"grad_norm": 1.7988275289535522,
"learning_rate": 1e-06,
"loss": 3.9249,
"step": 16800
},
{
"epoch": 1.836646499567848,
"grad_norm": 1.3246694803237915,
"learning_rate": 1e-06,
"loss": 4.0381,
"step": 17000
},
{
"epoch": 1.8582541054451167,
"grad_norm": 1.6709383726119995,
"learning_rate": 1e-06,
"loss": 3.9925,
"step": 17200
},
{
"epoch": 1.8798617113223854,
"grad_norm": 1.984506368637085,
"learning_rate": 1e-06,
"loss": 3.9312,
"step": 17400
},
{
"epoch": 1.9014693171996542,
"grad_norm": 1.2894401550292969,
"learning_rate": 1e-06,
"loss": 4.0016,
"step": 17600
},
{
"epoch": 1.9230769230769231,
"grad_norm": 1.7018927335739136,
"learning_rate": 1e-06,
"loss": 3.9602,
"step": 17800
},
{
"epoch": 1.9446845289541919,
"grad_norm": 1.5799185037612915,
"learning_rate": 1e-06,
"loss": 4.0009,
"step": 18000
},
{
"epoch": 1.9662921348314608,
"grad_norm": 1.5885943174362183,
"learning_rate": 1e-06,
"loss": 4.1532,
"step": 18200
},
{
"epoch": 1.9878997407087295,
"grad_norm": 1.1633445024490356,
"learning_rate": 1e-06,
"loss": 4.0068,
"step": 18400
}
],
"logging_steps": 200,
"max_steps": 18512,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 400,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8.143556685374423e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}