|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.0, |
|
"eval_steps": 500, |
|
"global_step": 18512, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.000108038029386344, |
|
"grad_norm": null, |
|
"learning_rate": 1e-06, |
|
"loss": 12.3298, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.021607605877268798, |
|
"grad_norm": 4.446423053741455, |
|
"learning_rate": 1e-06, |
|
"loss": 10.8951, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.043215211754537596, |
|
"grad_norm": 6.528625965118408, |
|
"learning_rate": 1e-06, |
|
"loss": 8.5315, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.06482281763180639, |
|
"grad_norm": 2.4619638919830322, |
|
"learning_rate": 1e-06, |
|
"loss": 5.6715, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.08643042350907519, |
|
"grad_norm": 0.8605937361717224, |
|
"learning_rate": 1e-06, |
|
"loss": 4.7588, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.10803802938634399, |
|
"grad_norm": 1.1726713180541992, |
|
"learning_rate": 1e-06, |
|
"loss": 4.4643, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.12964563526361278, |
|
"grad_norm": 1.2112282514572144, |
|
"learning_rate": 1e-06, |
|
"loss": 4.3869, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.1512532411408816, |
|
"grad_norm": 2.6818201541900635, |
|
"learning_rate": 1e-06, |
|
"loss": 4.2508, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.17286084701815038, |
|
"grad_norm": 1.3040348291397095, |
|
"learning_rate": 1e-06, |
|
"loss": 4.2472, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.1944684528954192, |
|
"grad_norm": 0.8280265927314758, |
|
"learning_rate": 1e-06, |
|
"loss": 4.1989, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.21607605877268798, |
|
"grad_norm": 0.9923717975616455, |
|
"learning_rate": 1e-06, |
|
"loss": 4.2074, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.23768366464995677, |
|
"grad_norm": 0.9223962426185608, |
|
"learning_rate": 1e-06, |
|
"loss": 4.1729, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.25929127052722556, |
|
"grad_norm": 1.61160409450531, |
|
"learning_rate": 1e-06, |
|
"loss": 4.109, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.2808988764044944, |
|
"grad_norm": 0.8436377644538879, |
|
"learning_rate": 1e-06, |
|
"loss": 4.0602, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.3025064822817632, |
|
"grad_norm": 1.9130096435546875, |
|
"learning_rate": 1e-06, |
|
"loss": 4.2987, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.324114088159032, |
|
"grad_norm": 0.8813604116439819, |
|
"learning_rate": 1e-06, |
|
"loss": 4.1198, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.34572169403630076, |
|
"grad_norm": 1.1001173257827759, |
|
"learning_rate": 1e-06, |
|
"loss": 4.1774, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.36732929991356955, |
|
"grad_norm": 2.015709638595581, |
|
"learning_rate": 1e-06, |
|
"loss": 4.214, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.3889369057908384, |
|
"grad_norm": 1.2437554597854614, |
|
"learning_rate": 1e-06, |
|
"loss": 4.0703, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.4105445116681072, |
|
"grad_norm": 2.0216290950775146, |
|
"learning_rate": 1e-06, |
|
"loss": 4.0848, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.43215211754537597, |
|
"grad_norm": 1.873824954032898, |
|
"learning_rate": 1e-06, |
|
"loss": 4.0322, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.45375972342264476, |
|
"grad_norm": 1.8265173435211182, |
|
"learning_rate": 1e-06, |
|
"loss": 4.1039, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.47536732929991354, |
|
"grad_norm": 0.9985150098800659, |
|
"learning_rate": 1e-06, |
|
"loss": 4.1367, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.4969749351771824, |
|
"grad_norm": 1.2112445831298828, |
|
"learning_rate": 1e-06, |
|
"loss": 4.1724, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.5185825410544511, |
|
"grad_norm": 1.2440255880355835, |
|
"learning_rate": 1e-06, |
|
"loss": 4.1264, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.54019014693172, |
|
"grad_norm": 2.387599468231201, |
|
"learning_rate": 1e-06, |
|
"loss": 4.0228, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.5617977528089888, |
|
"grad_norm": 1.3351703882217407, |
|
"learning_rate": 1e-06, |
|
"loss": 4.0719, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.5834053586862575, |
|
"grad_norm": 1.3799022436141968, |
|
"learning_rate": 1e-06, |
|
"loss": 4.0888, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.6050129645635264, |
|
"grad_norm": 0.9727766513824463, |
|
"learning_rate": 1e-06, |
|
"loss": 3.9904, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.6266205704407951, |
|
"grad_norm": 1.5948766469955444, |
|
"learning_rate": 1e-06, |
|
"loss": 4.0368, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.648228176318064, |
|
"grad_norm": 0.8475677967071533, |
|
"learning_rate": 1e-06, |
|
"loss": 4.1044, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.6698357821953328, |
|
"grad_norm": 2.8822379112243652, |
|
"learning_rate": 1e-06, |
|
"loss": 3.9987, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.6914433880726015, |
|
"grad_norm": 2.800276517868042, |
|
"learning_rate": 1e-06, |
|
"loss": 4.0029, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.7130509939498704, |
|
"grad_norm": 1.2083728313446045, |
|
"learning_rate": 1e-06, |
|
"loss": 4.0523, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.7346585998271391, |
|
"grad_norm": 1.1583646535873413, |
|
"learning_rate": 1e-06, |
|
"loss": 3.9988, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.756266205704408, |
|
"grad_norm": 1.3725389242172241, |
|
"learning_rate": 1e-06, |
|
"loss": 4.0924, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.7778738115816768, |
|
"grad_norm": 0.9432379603385925, |
|
"learning_rate": 1e-06, |
|
"loss": 3.9962, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.7994814174589455, |
|
"grad_norm": 1.1160651445388794, |
|
"learning_rate": 1e-06, |
|
"loss": 3.9588, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 0.8210890233362144, |
|
"grad_norm": 0.8223551511764526, |
|
"learning_rate": 1e-06, |
|
"loss": 4.0476, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 0.8426966292134831, |
|
"grad_norm": 1.2660095691680908, |
|
"learning_rate": 1e-06, |
|
"loss": 4.0649, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 0.8643042350907519, |
|
"grad_norm": 1.3660492897033691, |
|
"learning_rate": 1e-06, |
|
"loss": 4.037, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.8859118409680208, |
|
"grad_norm": 1.07527494430542, |
|
"learning_rate": 1e-06, |
|
"loss": 4.039, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 0.9075194468452895, |
|
"grad_norm": 1.5177347660064697, |
|
"learning_rate": 1e-06, |
|
"loss": 3.9457, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 0.9291270527225584, |
|
"grad_norm": 1.4206831455230713, |
|
"learning_rate": 1e-06, |
|
"loss": 4.0433, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 0.9507346585998271, |
|
"grad_norm": 1.1375796794891357, |
|
"learning_rate": 1e-06, |
|
"loss": 4.0525, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 0.9723422644770959, |
|
"grad_norm": 0.930182695388794, |
|
"learning_rate": 1e-06, |
|
"loss": 4.1834, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.9939498703543648, |
|
"grad_norm": 0.952041506767273, |
|
"learning_rate": 1e-06, |
|
"loss": 4.0379, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 1.0155574762316335, |
|
"grad_norm": 1.0867968797683716, |
|
"learning_rate": 1e-06, |
|
"loss": 4.0694, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 1.0371650821089022, |
|
"grad_norm": 1.4420335292816162, |
|
"learning_rate": 1e-06, |
|
"loss": 4.0624, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 1.0587726879861712, |
|
"grad_norm": 2.323235273361206, |
|
"learning_rate": 1e-06, |
|
"loss": 4.0579, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 1.08038029386344, |
|
"grad_norm": 1.2302740812301636, |
|
"learning_rate": 1e-06, |
|
"loss": 3.9257, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 1.1019878997407087, |
|
"grad_norm": 1.3144605159759521, |
|
"learning_rate": 1e-06, |
|
"loss": 4.1395, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 1.1235955056179776, |
|
"grad_norm": 1.1223151683807373, |
|
"learning_rate": 1e-06, |
|
"loss": 4.1545, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 1.1452031114952463, |
|
"grad_norm": 1.8404920101165771, |
|
"learning_rate": 1e-06, |
|
"loss": 4.0319, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 1.166810717372515, |
|
"grad_norm": 1.0687180757522583, |
|
"learning_rate": 1e-06, |
|
"loss": 4.032, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 1.1884183232497838, |
|
"grad_norm": 1.3874478340148926, |
|
"learning_rate": 1e-06, |
|
"loss": 3.9799, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 1.2100259291270528, |
|
"grad_norm": 1.4990215301513672, |
|
"learning_rate": 1e-06, |
|
"loss": 4.054, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 1.2316335350043215, |
|
"grad_norm": 1.4241687059402466, |
|
"learning_rate": 1e-06, |
|
"loss": 4.0389, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 1.2532411408815904, |
|
"grad_norm": 1.4653874635696411, |
|
"learning_rate": 1e-06, |
|
"loss": 3.9616, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 1.2748487467588592, |
|
"grad_norm": 1.548032522201538, |
|
"learning_rate": 1e-06, |
|
"loss": 3.9782, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 1.296456352636128, |
|
"grad_norm": 1.1580474376678467, |
|
"learning_rate": 1e-06, |
|
"loss": 4.14, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 1.3180639585133966, |
|
"grad_norm": 0.9656199216842651, |
|
"learning_rate": 1e-06, |
|
"loss": 4.0842, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 1.3396715643906656, |
|
"grad_norm": 1.5421985387802124, |
|
"learning_rate": 1e-06, |
|
"loss": 3.9982, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 1.3612791702679343, |
|
"grad_norm": 1.0267407894134521, |
|
"learning_rate": 1e-06, |
|
"loss": 4.1628, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 1.382886776145203, |
|
"grad_norm": 12.301884651184082, |
|
"learning_rate": 1e-06, |
|
"loss": 4.0779, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 1.404494382022472, |
|
"grad_norm": 2.006171941757202, |
|
"learning_rate": 1e-06, |
|
"loss": 3.9639, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 1.4261019878997407, |
|
"grad_norm": 1.1518200635910034, |
|
"learning_rate": 1e-06, |
|
"loss": 3.9573, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 1.4477095937770095, |
|
"grad_norm": 1.313721776008606, |
|
"learning_rate": 1e-06, |
|
"loss": 4.0322, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 1.4693171996542782, |
|
"grad_norm": 1.4003313779830933, |
|
"learning_rate": 1e-06, |
|
"loss": 4.0324, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 1.4909248055315472, |
|
"grad_norm": 1.6102566719055176, |
|
"learning_rate": 1e-06, |
|
"loss": 4.1216, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 1.512532411408816, |
|
"grad_norm": 1.2825450897216797, |
|
"learning_rate": 1e-06, |
|
"loss": 4.0533, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 1.5341400172860848, |
|
"grad_norm": 1.838724136352539, |
|
"learning_rate": 1e-06, |
|
"loss": 3.9472, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 1.5557476231633536, |
|
"grad_norm": 1.0912243127822876, |
|
"learning_rate": 1e-06, |
|
"loss": 4.032, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 1.5773552290406223, |
|
"grad_norm": 1.6895837783813477, |
|
"learning_rate": 1e-06, |
|
"loss": 4.0008, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 1.598962834917891, |
|
"grad_norm": 1.318581461906433, |
|
"learning_rate": 1e-06, |
|
"loss": 3.9793, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 1.6205704407951598, |
|
"grad_norm": 1.5134785175323486, |
|
"learning_rate": 1e-06, |
|
"loss": 3.8919, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 1.6421780466724287, |
|
"grad_norm": 1.3244537115097046, |
|
"learning_rate": 1e-06, |
|
"loss": 4.0801, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 1.6637856525496975, |
|
"grad_norm": 1.1500438451766968, |
|
"learning_rate": 1e-06, |
|
"loss": 3.9891, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 1.6853932584269664, |
|
"grad_norm": 1.626980185508728, |
|
"learning_rate": 1e-06, |
|
"loss": 3.9193, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 1.7070008643042351, |
|
"grad_norm": 1.4879544973373413, |
|
"learning_rate": 1e-06, |
|
"loss": 4.0747, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 1.7286084701815039, |
|
"grad_norm": 1.372431755065918, |
|
"learning_rate": 1e-06, |
|
"loss": 3.8365, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 1.7502160760587726, |
|
"grad_norm": 1.2123242616653442, |
|
"learning_rate": 1e-06, |
|
"loss": 4.0404, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 1.7718236819360413, |
|
"grad_norm": 1.8157856464385986, |
|
"learning_rate": 1e-06, |
|
"loss": 4.0526, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 1.7934312878133103, |
|
"grad_norm": 2.1040592193603516, |
|
"learning_rate": 1e-06, |
|
"loss": 3.922, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 1.8150388936905792, |
|
"grad_norm": 1.7988275289535522, |
|
"learning_rate": 1e-06, |
|
"loss": 3.9249, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 1.836646499567848, |
|
"grad_norm": 1.3246694803237915, |
|
"learning_rate": 1e-06, |
|
"loss": 4.0381, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 1.8582541054451167, |
|
"grad_norm": 1.6709383726119995, |
|
"learning_rate": 1e-06, |
|
"loss": 3.9925, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 1.8798617113223854, |
|
"grad_norm": 1.984506368637085, |
|
"learning_rate": 1e-06, |
|
"loss": 3.9312, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 1.9014693171996542, |
|
"grad_norm": 1.2894401550292969, |
|
"learning_rate": 1e-06, |
|
"loss": 4.0016, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 1.9230769230769231, |
|
"grad_norm": 1.7018927335739136, |
|
"learning_rate": 1e-06, |
|
"loss": 3.9602, |
|
"step": 17800 |
|
}, |
|
{ |
|
"epoch": 1.9446845289541919, |
|
"grad_norm": 1.5799185037612915, |
|
"learning_rate": 1e-06, |
|
"loss": 4.0009, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 1.9662921348314608, |
|
"grad_norm": 1.5885943174362183, |
|
"learning_rate": 1e-06, |
|
"loss": 4.1532, |
|
"step": 18200 |
|
}, |
|
{ |
|
"epoch": 1.9878997407087295, |
|
"grad_norm": 1.1633445024490356, |
|
"learning_rate": 1e-06, |
|
"loss": 4.0068, |
|
"step": 18400 |
|
} |
|
], |
|
"logging_steps": 200, |
|
"max_steps": 18512, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 400, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 8.143556685374423e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|