{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9948051948051948, "eval_steps": 500, "global_step": 312, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06393606393606394, "grad_norm": 3.3491440222437325, "learning_rate": 6.25e-06, "loss": 0.7822, "step": 10 }, { "epoch": 0.12787212787212787, "grad_norm": 1.3482958544910333, "learning_rate": 1.25e-05, "loss": 0.6792, "step": 20 }, { "epoch": 0.1918081918081918, "grad_norm": 1.2739067783847393, "learning_rate": 1.8750000000000002e-05, "loss": 0.6622, "step": 30 }, { "epoch": 0.25574425574425574, "grad_norm": 1.4126840004973826, "learning_rate": 1.9959742939952393e-05, "loss": 0.6878, "step": 40 }, { "epoch": 0.3196803196803197, "grad_norm": 1.47342628958793, "learning_rate": 1.9796753984232357e-05, "loss": 0.6892, "step": 50 }, { "epoch": 0.3836163836163836, "grad_norm": 1.1817516625418099, "learning_rate": 1.9510565162951538e-05, "loss": 0.6479, "step": 60 }, { "epoch": 0.44755244755244755, "grad_norm": 0.9607345989729658, "learning_rate": 1.9104775466588162e-05, "loss": 0.6489, "step": 70 }, { "epoch": 0.5114885114885115, "grad_norm": 0.9554376895753519, "learning_rate": 1.8584487936018663e-05, "loss": 0.6422, "step": 80 }, { "epoch": 0.5754245754245755, "grad_norm": 0.8588614657212066, "learning_rate": 1.795624548881781e-05, "loss": 0.6491, "step": 90 }, { "epoch": 0.6393606393606394, "grad_norm": 1.4520057403638673, "learning_rate": 1.7227948638273918e-05, "loss": 0.6619, "step": 100 }, { "epoch": 0.7032967032967034, "grad_norm": 1.0532745435041297, "learning_rate": 1.6408756139850243e-05, "loss": 0.6289, "step": 110 }, { "epoch": 0.7672327672327672, "grad_norm": 0.9433790775247752, "learning_rate": 1.5508969814521026e-05, "loss": 0.629, "step": 120 }, { "epoch": 0.8311688311688312, "grad_norm": 0.8339993692400886, "learning_rate": 1.4539904997395468e-05, "loss": 0.6141, "step": 130 }, { "epoch": 0.8951048951048951, "grad_norm": 0.7531686166026066, "learning_rate": 1.3513748240813429e-05, "loss": 0.6362, "step": 140 }, { "epoch": 0.9590409590409591, "grad_norm": 1.2639265367588755, "learning_rate": 1.2443404061378941e-05, "loss": 0.6476, "step": 150 }, { "epoch": 0.9974025974025974, "eval_loss": 0.4589642882347107, "eval_runtime": 14.7204, "eval_samples_per_second": 67.933, "eval_steps_per_second": 4.28, "step": 156 }, { "epoch": 1.022977022977023, "grad_norm": 0.679884478272212, "learning_rate": 1.1342332658176556e-05, "loss": 0.5553, "step": 160 }, { "epoch": 1.086913086913087, "grad_norm": 1.5094669264122595, "learning_rate": 1.0224380642958052e-05, "loss": 0.4756, "step": 170 }, { "epoch": 1.150849150849151, "grad_norm": 0.9718462479778894, "learning_rate": 9.103606910965666e-06, "loss": 0.4631, "step": 180 }, { "epoch": 1.2147852147852147, "grad_norm": 0.8605924611208216, "learning_rate": 7.994105842167274e-06, "loss": 0.4556, "step": 190 }, { "epoch": 1.2787212787212787, "grad_norm": 0.8216831927901586, "learning_rate": 6.909830056250527e-06, "loss": 0.4543, "step": 200 }, { "epoch": 1.3426573426573427, "grad_norm": 0.7371920955382064, "learning_rate": 5.864414950334796e-06, "loss": 0.4646, "step": 210 }, { "epoch": 1.4065934065934065, "grad_norm": 0.6686619113001689, "learning_rate": 4.87100722594094e-06, "loss": 0.4512, "step": 220 }, { "epoch": 1.4705294705294705, "grad_norm": 0.9735666328660604, "learning_rate": 3.942099561591802e-06, "loss": 0.4536, "step": 230 }, { "epoch": 1.5344655344655345, "grad_norm": 0.8730632761771147, "learning_rate": 3.089373510131354e-06, "loss": 0.4431, "step": 240 }, { "epoch": 1.5984015984015985, "grad_norm": 0.7626268236928374, "learning_rate": 2.323552596419889e-06, "loss": 0.4482, "step": 250 }, { "epoch": 1.6623376623376624, "grad_norm": 0.732159467773922, "learning_rate": 1.6542674627869738e-06, "loss": 0.447, "step": 260 }, { "epoch": 1.7262737262737264, "grad_norm": 0.6142439228810679, "learning_rate": 1.0899347581163222e-06, "loss": 0.454, "step": 270 }, { "epoch": 1.7902097902097902, "grad_norm": 0.887179653704021, "learning_rate": 6.37651293602628e-07, "loss": 0.4459, "step": 280 }, { "epoch": 1.8541458541458542, "grad_norm": 0.9443697905388456, "learning_rate": 3.0310479623313125e-07, "loss": 0.4389, "step": 290 }, { "epoch": 1.918081918081918, "grad_norm": 0.7646985408632654, "learning_rate": 9.0502382320653e-08, "loss": 0.4491, "step": 300 }, { "epoch": 1.982017982017982, "grad_norm": 0.8089131548200502, "learning_rate": 2.5176505749346937e-09, "loss": 0.4531, "step": 310 }, { "epoch": 1.9948051948051948, "eval_loss": 0.36759093403816223, "eval_runtime": 17.3353, "eval_samples_per_second": 57.686, "eval_steps_per_second": 3.634, "step": 312 }, { "epoch": 1.9948051948051948, "step": 312, "total_flos": 7897890299904.0, "train_loss": 0.556142369332986, "train_runtime": 8555.603, "train_samples_per_second": 4.68, "train_steps_per_second": 0.036 } ], "logging_steps": 10, "max_steps": 312, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7897890299904.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }