{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.011189437171310284, "eval_steps": 500, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005594718585655141, "grad_norm": 0.87109375, "learning_rate": 1.3986013986013987e-08, "loss": 1.4848, "step": 20 }, { "epoch": 0.0011189437171310282, "grad_norm": 1.0, "learning_rate": 2.7972027972027974e-08, "loss": 1.4825, "step": 40 }, { "epoch": 0.0016784155756965425, "grad_norm": 0.94921875, "learning_rate": 4.195804195804196e-08, "loss": 1.452, "step": 60 }, { "epoch": 0.0022378874342620565, "grad_norm": 0.7265625, "learning_rate": 5.594405594405595e-08, "loss": 1.5023, "step": 80 }, { "epoch": 0.002797359292827571, "grad_norm": 0.640625, "learning_rate": 6.993006993006993e-08, "loss": 1.5079, "step": 100 }, { "epoch": 0.003356831151393085, "grad_norm": 0.81640625, "learning_rate": 8.391608391608393e-08, "loss": 1.4874, "step": 120 }, { "epoch": 0.003916303009958599, "grad_norm": 0.75390625, "learning_rate": 9.790209790209792e-08, "loss": 1.4273, "step": 140 }, { "epoch": 0.004475774868524113, "grad_norm": 0.8828125, "learning_rate": 1.118881118881119e-07, "loss": 1.4738, "step": 160 }, { "epoch": 0.005035246727089627, "grad_norm": 0.83203125, "learning_rate": 1.258741258741259e-07, "loss": 1.4768, "step": 180 }, { "epoch": 0.005594718585655142, "grad_norm": 0.67578125, "learning_rate": 1.3986013986013987e-07, "loss": 1.4834, "step": 200 }, { "epoch": 0.006154190444220655, "grad_norm": 0.98828125, "learning_rate": 1.5384615384615387e-07, "loss": 1.5255, "step": 220 }, { "epoch": 0.00671366230278617, "grad_norm": 0.70703125, "learning_rate": 1.6783216783216785e-07, "loss": 1.4849, "step": 240 }, { "epoch": 0.007273134161351684, "grad_norm": 0.9140625, "learning_rate": 1.8181818181818183e-07, "loss": 1.5231, "step": 260 }, { "epoch": 0.007832606019917199, "grad_norm": 0.87109375, "learning_rate": 1.9580419580419583e-07, "loss": 1.4558, "step": 280 }, { "epoch": 0.008392077878482713, "grad_norm": 1.2890625, "learning_rate": 2.097902097902098e-07, "loss": 1.5004, "step": 300 }, { "epoch": 0.008951549737048226, "grad_norm": 0.765625, "learning_rate": 2.237762237762238e-07, "loss": 1.4348, "step": 320 }, { "epoch": 0.00951102159561374, "grad_norm": 0.76953125, "learning_rate": 2.3776223776223777e-07, "loss": 1.4752, "step": 340 }, { "epoch": 0.010070493454179255, "grad_norm": 1.015625, "learning_rate": 2.517482517482518e-07, "loss": 1.4899, "step": 360 }, { "epoch": 0.01062996531274477, "grad_norm": 0.78515625, "learning_rate": 2.6573426573426575e-07, "loss": 1.4307, "step": 380 }, { "epoch": 0.011189437171310284, "grad_norm": 0.71875, "learning_rate": 2.7972027972027973e-07, "loss": 1.4233, "step": 400 } ], "logging_steps": 20, "max_steps": 35748, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 7.36836603346944e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }