{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 18512, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000108038029386344, "grad_norm": null, "learning_rate": 1e-06, "loss": 12.3298, "step": 1 }, { "epoch": 0.021607605877268798, "grad_norm": 4.446423053741455, "learning_rate": 1e-06, "loss": 10.8951, "step": 200 }, { "epoch": 0.043215211754537596, "grad_norm": 6.528625965118408, "learning_rate": 1e-06, "loss": 8.5315, "step": 400 }, { "epoch": 0.06482281763180639, "grad_norm": 2.4619638919830322, "learning_rate": 1e-06, "loss": 5.6715, "step": 600 }, { "epoch": 0.08643042350907519, "grad_norm": 0.8605937361717224, "learning_rate": 1e-06, "loss": 4.7588, "step": 800 }, { "epoch": 0.10803802938634399, "grad_norm": 1.1726713180541992, "learning_rate": 1e-06, "loss": 4.4643, "step": 1000 }, { "epoch": 0.12964563526361278, "grad_norm": 1.2112282514572144, "learning_rate": 1e-06, "loss": 4.3869, "step": 1200 }, { "epoch": 0.1512532411408816, "grad_norm": 2.6818201541900635, "learning_rate": 1e-06, "loss": 4.2508, "step": 1400 }, { "epoch": 0.17286084701815038, "grad_norm": 1.3040348291397095, "learning_rate": 1e-06, "loss": 4.2472, "step": 1600 }, { "epoch": 0.1944684528954192, "grad_norm": 0.8280265927314758, "learning_rate": 1e-06, "loss": 4.1989, "step": 1800 }, { "epoch": 0.21607605877268798, "grad_norm": 0.9923717975616455, "learning_rate": 1e-06, "loss": 4.2074, "step": 2000 }, { "epoch": 0.23768366464995677, "grad_norm": 0.9223962426185608, "learning_rate": 1e-06, "loss": 4.1729, "step": 2200 }, { "epoch": 0.25929127052722556, "grad_norm": 1.61160409450531, "learning_rate": 1e-06, "loss": 4.109, "step": 2400 }, { "epoch": 0.2808988764044944, "grad_norm": 0.8436377644538879, "learning_rate": 1e-06, "loss": 4.0602, "step": 2600 }, { "epoch": 0.3025064822817632, "grad_norm": 1.9130096435546875, "learning_rate": 1e-06, "loss": 4.2987, 
"step": 2800 }, { "epoch": 0.324114088159032, "grad_norm": 0.8813604116439819, "learning_rate": 1e-06, "loss": 4.1198, "step": 3000 }, { "epoch": 0.34572169403630076, "grad_norm": 1.1001173257827759, "learning_rate": 1e-06, "loss": 4.1774, "step": 3200 }, { "epoch": 0.36732929991356955, "grad_norm": 2.015709638595581, "learning_rate": 1e-06, "loss": 4.214, "step": 3400 }, { "epoch": 0.3889369057908384, "grad_norm": 1.2437554597854614, "learning_rate": 1e-06, "loss": 4.0703, "step": 3600 }, { "epoch": 0.4105445116681072, "grad_norm": 2.0216290950775146, "learning_rate": 1e-06, "loss": 4.0848, "step": 3800 }, { "epoch": 0.43215211754537597, "grad_norm": 1.873824954032898, "learning_rate": 1e-06, "loss": 4.0322, "step": 4000 }, { "epoch": 0.45375972342264476, "grad_norm": 1.8265173435211182, "learning_rate": 1e-06, "loss": 4.1039, "step": 4200 }, { "epoch": 0.47536732929991354, "grad_norm": 0.9985150098800659, "learning_rate": 1e-06, "loss": 4.1367, "step": 4400 }, { "epoch": 0.4969749351771824, "grad_norm": 1.2112445831298828, "learning_rate": 1e-06, "loss": 4.1724, "step": 4600 }, { "epoch": 0.5185825410544511, "grad_norm": 1.2440255880355835, "learning_rate": 1e-06, "loss": 4.1264, "step": 4800 }, { "epoch": 0.54019014693172, "grad_norm": 2.387599468231201, "learning_rate": 1e-06, "loss": 4.0228, "step": 5000 }, { "epoch": 0.5617977528089888, "grad_norm": 1.3351703882217407, "learning_rate": 1e-06, "loss": 4.0719, "step": 5200 }, { "epoch": 0.5834053586862575, "grad_norm": 1.3799022436141968, "learning_rate": 1e-06, "loss": 4.0888, "step": 5400 }, { "epoch": 0.6050129645635264, "grad_norm": 0.9727766513824463, "learning_rate": 1e-06, "loss": 3.9904, "step": 5600 }, { "epoch": 0.6266205704407951, "grad_norm": 1.5948766469955444, "learning_rate": 1e-06, "loss": 4.0368, "step": 5800 }, { "epoch": 0.648228176318064, "grad_norm": 0.8475677967071533, "learning_rate": 1e-06, "loss": 4.1044, "step": 6000 }, { "epoch": 0.6698357821953328, "grad_norm": 2.8822379112243652, 
"learning_rate": 1e-06, "loss": 3.9987, "step": 6200 }, { "epoch": 0.6914433880726015, "grad_norm": 2.800276517868042, "learning_rate": 1e-06, "loss": 4.0029, "step": 6400 }, { "epoch": 0.7130509939498704, "grad_norm": 1.2083728313446045, "learning_rate": 1e-06, "loss": 4.0523, "step": 6600 }, { "epoch": 0.7346585998271391, "grad_norm": 1.1583646535873413, "learning_rate": 1e-06, "loss": 3.9988, "step": 6800 }, { "epoch": 0.756266205704408, "grad_norm": 1.3725389242172241, "learning_rate": 1e-06, "loss": 4.0924, "step": 7000 }, { "epoch": 0.7778738115816768, "grad_norm": 0.9432379603385925, "learning_rate": 1e-06, "loss": 3.9962, "step": 7200 }, { "epoch": 0.7994814174589455, "grad_norm": 1.1160651445388794, "learning_rate": 1e-06, "loss": 3.9588, "step": 7400 }, { "epoch": 0.8210890233362144, "grad_norm": 0.8223551511764526, "learning_rate": 1e-06, "loss": 4.0476, "step": 7600 }, { "epoch": 0.8426966292134831, "grad_norm": 1.2660095691680908, "learning_rate": 1e-06, "loss": 4.0649, "step": 7800 }, { "epoch": 0.8643042350907519, "grad_norm": 1.3660492897033691, "learning_rate": 1e-06, "loss": 4.037, "step": 8000 }, { "epoch": 0.8859118409680208, "grad_norm": 1.07527494430542, "learning_rate": 1e-06, "loss": 4.039, "step": 8200 }, { "epoch": 0.9075194468452895, "grad_norm": 1.5177347660064697, "learning_rate": 1e-06, "loss": 3.9457, "step": 8400 }, { "epoch": 0.9291270527225584, "grad_norm": 1.4206831455230713, "learning_rate": 1e-06, "loss": 4.0433, "step": 8600 }, { "epoch": 0.9507346585998271, "grad_norm": 1.1375796794891357, "learning_rate": 1e-06, "loss": 4.0525, "step": 8800 }, { "epoch": 0.9723422644770959, "grad_norm": 0.930182695388794, "learning_rate": 1e-06, "loss": 4.1834, "step": 9000 }, { "epoch": 0.9939498703543648, "grad_norm": 0.952041506767273, "learning_rate": 1e-06, "loss": 4.0379, "step": 9200 }, { "epoch": 1.0155574762316335, "grad_norm": 1.0867968797683716, "learning_rate": 1e-06, "loss": 4.0694, "step": 9400 }, { "epoch": 1.0371650821089022, 
"grad_norm": 1.4420335292816162, "learning_rate": 1e-06, "loss": 4.0624, "step": 9600 }, { "epoch": 1.0587726879861712, "grad_norm": 2.323235273361206, "learning_rate": 1e-06, "loss": 4.0579, "step": 9800 }, { "epoch": 1.08038029386344, "grad_norm": 1.2302740812301636, "learning_rate": 1e-06, "loss": 3.9257, "step": 10000 }, { "epoch": 1.1019878997407087, "grad_norm": 1.3144605159759521, "learning_rate": 1e-06, "loss": 4.1395, "step": 10200 }, { "epoch": 1.1235955056179776, "grad_norm": 1.1223151683807373, "learning_rate": 1e-06, "loss": 4.1545, "step": 10400 }, { "epoch": 1.1452031114952463, "grad_norm": 1.8404920101165771, "learning_rate": 1e-06, "loss": 4.0319, "step": 10600 }, { "epoch": 1.166810717372515, "grad_norm": 1.0687180757522583, "learning_rate": 1e-06, "loss": 4.032, "step": 10800 }, { "epoch": 1.1884183232497838, "grad_norm": 1.3874478340148926, "learning_rate": 1e-06, "loss": 3.9799, "step": 11000 }, { "epoch": 1.2100259291270528, "grad_norm": 1.4990215301513672, "learning_rate": 1e-06, "loss": 4.054, "step": 11200 }, { "epoch": 1.2316335350043215, "grad_norm": 1.4241687059402466, "learning_rate": 1e-06, "loss": 4.0389, "step": 11400 }, { "epoch": 1.2532411408815904, "grad_norm": 1.4653874635696411, "learning_rate": 1e-06, "loss": 3.9616, "step": 11600 }, { "epoch": 1.2748487467588592, "grad_norm": 1.548032522201538, "learning_rate": 1e-06, "loss": 3.9782, "step": 11800 }, { "epoch": 1.296456352636128, "grad_norm": 1.1580474376678467, "learning_rate": 1e-06, "loss": 4.14, "step": 12000 }, { "epoch": 1.3180639585133966, "grad_norm": 0.9656199216842651, "learning_rate": 1e-06, "loss": 4.0842, "step": 12200 }, { "epoch": 1.3396715643906656, "grad_norm": 1.5421985387802124, "learning_rate": 1e-06, "loss": 3.9982, "step": 12400 }, { "epoch": 1.3612791702679343, "grad_norm": 1.0267407894134521, "learning_rate": 1e-06, "loss": 4.1628, "step": 12600 }, { "epoch": 1.382886776145203, "grad_norm": 12.301884651184082, "learning_rate": 1e-06, "loss": 4.0779, 
"step": 12800 }, { "epoch": 1.404494382022472, "grad_norm": 2.006171941757202, "learning_rate": 1e-06, "loss": 3.9639, "step": 13000 }, { "epoch": 1.4261019878997407, "grad_norm": 1.1518200635910034, "learning_rate": 1e-06, "loss": 3.9573, "step": 13200 }, { "epoch": 1.4477095937770095, "grad_norm": 1.313721776008606, "learning_rate": 1e-06, "loss": 4.0322, "step": 13400 }, { "epoch": 1.4693171996542782, "grad_norm": 1.4003313779830933, "learning_rate": 1e-06, "loss": 4.0324, "step": 13600 }, { "epoch": 1.4909248055315472, "grad_norm": 1.6102566719055176, "learning_rate": 1e-06, "loss": 4.1216, "step": 13800 }, { "epoch": 1.512532411408816, "grad_norm": 1.2825450897216797, "learning_rate": 1e-06, "loss": 4.0533, "step": 14000 }, { "epoch": 1.5341400172860848, "grad_norm": 1.838724136352539, "learning_rate": 1e-06, "loss": 3.9472, "step": 14200 }, { "epoch": 1.5557476231633536, "grad_norm": 1.0912243127822876, "learning_rate": 1e-06, "loss": 4.032, "step": 14400 }, { "epoch": 1.5773552290406223, "grad_norm": 1.6895837783813477, "learning_rate": 1e-06, "loss": 4.0008, "step": 14600 }, { "epoch": 1.598962834917891, "grad_norm": 1.318581461906433, "learning_rate": 1e-06, "loss": 3.9793, "step": 14800 }, { "epoch": 1.6205704407951598, "grad_norm": 1.5134785175323486, "learning_rate": 1e-06, "loss": 3.8919, "step": 15000 }, { "epoch": 1.6421780466724287, "grad_norm": 1.3244537115097046, "learning_rate": 1e-06, "loss": 4.0801, "step": 15200 }, { "epoch": 1.6637856525496975, "grad_norm": 1.1500438451766968, "learning_rate": 1e-06, "loss": 3.9891, "step": 15400 }, { "epoch": 1.6853932584269664, "grad_norm": 1.626980185508728, "learning_rate": 1e-06, "loss": 3.9193, "step": 15600 }, { "epoch": 1.7070008643042351, "grad_norm": 1.4879544973373413, "learning_rate": 1e-06, "loss": 4.0747, "step": 15800 }, { "epoch": 1.7286084701815039, "grad_norm": 1.372431755065918, "learning_rate": 1e-06, "loss": 3.8365, "step": 16000 }, { "epoch": 1.7502160760587726, "grad_norm": 
1.2123242616653442, "learning_rate": 1e-06, "loss": 4.0404, "step": 16200 }, { "epoch": 1.7718236819360413, "grad_norm": 1.8157856464385986, "learning_rate": 1e-06, "loss": 4.0526, "step": 16400 }, { "epoch": 1.7934312878133103, "grad_norm": 2.1040592193603516, "learning_rate": 1e-06, "loss": 3.922, "step": 16600 }, { "epoch": 1.8150388936905792, "grad_norm": 1.7988275289535522, "learning_rate": 1e-06, "loss": 3.9249, "step": 16800 }, { "epoch": 1.836646499567848, "grad_norm": 1.3246694803237915, "learning_rate": 1e-06, "loss": 4.0381, "step": 17000 }, { "epoch": 1.8582541054451167, "grad_norm": 1.6709383726119995, "learning_rate": 1e-06, "loss": 3.9925, "step": 17200 }, { "epoch": 1.8798617113223854, "grad_norm": 1.984506368637085, "learning_rate": 1e-06, "loss": 3.9312, "step": 17400 }, { "epoch": 1.9014693171996542, "grad_norm": 1.2894401550292969, "learning_rate": 1e-06, "loss": 4.0016, "step": 17600 }, { "epoch": 1.9230769230769231, "grad_norm": 1.7018927335739136, "learning_rate": 1e-06, "loss": 3.9602, "step": 17800 }, { "epoch": 1.9446845289541919, "grad_norm": 1.5799185037612915, "learning_rate": 1e-06, "loss": 4.0009, "step": 18000 }, { "epoch": 1.9662921348314608, "grad_norm": 1.5885943174362183, "learning_rate": 1e-06, "loss": 4.1532, "step": 18200 }, { "epoch": 1.9878997407087295, "grad_norm": 1.1633445024490356, "learning_rate": 1e-06, "loss": 4.0068, "step": 18400 } ], "logging_steps": 200, "max_steps": 18512, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 400, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.143556685374423e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }