{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 5417, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.018460402436773122, "grad_norm": 72293.6015625, "learning_rate": 4.907697987816135e-05, "loss": 0.7984, "step": 100 }, { "epoch": 0.036920804873546244, "grad_norm": 86618.71875, "learning_rate": 4.815395975632269e-05, "loss": 0.7874, "step": 200 }, { "epoch": 0.05538120731031936, "grad_norm": 82291.1953125, "learning_rate": 4.723093963448403e-05, "loss": 0.7683, "step": 300 }, { "epoch": 0.07384160974709249, "grad_norm": 72406.7265625, "learning_rate": 4.6307919512645376e-05, "loss": 0.7721, "step": 400 }, { "epoch": 0.0923020121838656, "grad_norm": 82859.0, "learning_rate": 4.538489939080672e-05, "loss": 0.7672, "step": 500 }, { "epoch": 0.11076241462063872, "grad_norm": 72674.453125, "learning_rate": 4.4461879268968066e-05, "loss": 0.7741, "step": 600 }, { "epoch": 0.12922281705741184, "grad_norm": 74598.8828125, "learning_rate": 4.3538859147129405e-05, "loss": 0.7684, "step": 700 }, { "epoch": 0.14768321949418498, "grad_norm": 75325.09375, "learning_rate": 4.261583902529076e-05, "loss": 0.7706, "step": 800 }, { "epoch": 0.1661436219309581, "grad_norm": 80050.4921875, "learning_rate": 4.1692818903452095e-05, "loss": 0.771, "step": 900 }, { "epoch": 0.1846040243677312, "grad_norm": 71205.3359375, "learning_rate": 4.076979878161344e-05, "loss": 0.7458, "step": 1000 }, { "epoch": 0.20306442680450434, "grad_norm": 71098.4765625, "learning_rate": 3.9846778659774785e-05, "loss": 0.7632, "step": 1100 }, { "epoch": 0.22152482924127745, "grad_norm": 79350.5859375, "learning_rate": 3.892375853793613e-05, "loss": 0.7715, "step": 1200 }, { "epoch": 0.23998523167805058, "grad_norm": 78053.2578125, "learning_rate": 3.800073841609747e-05, "loss": 0.7582, "step": 1300 }, { "epoch": 0.2584456341148237, "grad_norm": 72609.0078125, "learning_rate": 3.707771829425882e-05, "loss": 0.7547, "step": 1400 }, { "epoch": 0.27690603655159685, "grad_norm": 75624.9453125, "learning_rate": 3.615469817242016e-05, "loss": 0.7432, "step": 1500 }, { "epoch": 0.29536643898836995, "grad_norm": 67118.828125, "learning_rate": 3.5231678050581504e-05, "loss": 0.7522, "step": 1600 }, { "epoch": 0.31382684142514305, "grad_norm": 77685.6171875, "learning_rate": 3.430865792874285e-05, "loss": 0.7529, "step": 1700 }, { "epoch": 0.3322872438619162, "grad_norm": 80151.4609375, "learning_rate": 3.338563780690419e-05, "loss": 0.7558, "step": 1800 }, { "epoch": 0.3507476462986893, "grad_norm": 68374.609375, "learning_rate": 3.246261768506554e-05, "loss": 0.7547, "step": 1900 }, { "epoch": 0.3692080487354624, "grad_norm": 69225.65625, "learning_rate": 3.153959756322688e-05, "loss": 0.7283, "step": 2000 }, { "epoch": 0.38766845117223553, "grad_norm": 93264.3984375, "learning_rate": 3.061657744138822e-05, "loss": 0.7368, "step": 2100 }, { "epoch": 0.4061288536090087, "grad_norm": 81487.3125, "learning_rate": 2.9693557319549568e-05, "loss": 0.7531, "step": 2200 }, { "epoch": 0.4245892560457818, "grad_norm": 72823.8125, "learning_rate": 2.8770537197710913e-05, "loss": 0.7216, "step": 2300 }, { "epoch": 0.4430496584825549, "grad_norm": 73118.8203125, "learning_rate": 2.7847517075872255e-05, "loss": 0.7356, "step": 2400 }, { "epoch": 0.46151006091932806, "grad_norm": 68678.484375, "learning_rate": 2.69244969540336e-05, "loss": 0.7308, "step": 2500 }, { "epoch": 0.47997046335610116, "grad_norm": 73918.640625, "learning_rate": 2.6001476832194942e-05, "loss": 0.7235, "step": 2600 }, { "epoch": 0.49843086579287427, "grad_norm": 66172.796875, "learning_rate": 2.5078456710356284e-05, "loss": 0.7283, "step": 2700 }, { "epoch": 0.5168912682296474, "grad_norm": 66544.1015625, "learning_rate": 2.415543658851763e-05, "loss": 0.7228, "step": 2800 }, { "epoch": 0.5353516706664205, "grad_norm": 66939.625, "learning_rate": 2.3232416466678974e-05, "loss": 0.7095, "step": 2900 }, { "epoch": 0.5538120731031937, "grad_norm": 67865.15625, "learning_rate": 2.230939634484032e-05, "loss": 0.7248, "step": 3000 }, { "epoch": 0.5722724755399667, "grad_norm": 63117.76953125, "learning_rate": 2.138637622300166e-05, "loss": 0.7147, "step": 3100 }, { "epoch": 0.5907328779767399, "grad_norm": 59398.1796875, "learning_rate": 2.0463356101163006e-05, "loss": 0.7031, "step": 3200 }, { "epoch": 0.6091932804135131, "grad_norm": 63816.08203125, "learning_rate": 1.954033597932435e-05, "loss": 0.7144, "step": 3300 }, { "epoch": 0.6276536828502861, "grad_norm": 70370.5625, "learning_rate": 1.8617315857485696e-05, "loss": 0.7141, "step": 3400 }, { "epoch": 0.6461140852870593, "grad_norm": 63610.734375, "learning_rate": 1.7694295735647038e-05, "loss": 0.694, "step": 3500 }, { "epoch": 0.6645744877238324, "grad_norm": 59664.05078125, "learning_rate": 1.6771275613808383e-05, "loss": 0.7086, "step": 3600 }, { "epoch": 0.6830348901606055, "grad_norm": 64207.88671875, "learning_rate": 1.5848255491969728e-05, "loss": 0.7138, "step": 3700 }, { "epoch": 0.7014952925973786, "grad_norm": 79748.2734375, "learning_rate": 1.4925235370131068e-05, "loss": 0.6985, "step": 3800 }, { "epoch": 0.7199556950341518, "grad_norm": 72216.671875, "learning_rate": 1.4002215248292413e-05, "loss": 0.7019, "step": 3900 }, { "epoch": 0.7384160974709248, "grad_norm": 74815.6328125, "learning_rate": 1.3079195126453758e-05, "loss": 0.687, "step": 4000 }, { "epoch": 0.756876499907698, "grad_norm": 61482.41796875, "learning_rate": 1.2156175004615102e-05, "loss": 0.6983, "step": 4100 }, { "epoch": 0.7753369023444711, "grad_norm": 73800.75, "learning_rate": 1.1233154882776445e-05, "loss": 0.7004, "step": 4200 }, { "epoch": 0.7937973047812442, "grad_norm": 75985.421875, "learning_rate": 1.0310134760937789e-05, "loss": 0.7097, "step": 4300 }, { "epoch": 0.8122577072180174, "grad_norm": 67175.03125, "learning_rate": 9.387114639099132e-06, "loss": 0.6781, "step": 4400 }, { "epoch": 0.8307181096547904, "grad_norm": 68520.875, "learning_rate": 8.464094517260476e-06, "loss": 0.6857, "step": 4500 }, { "epoch": 0.8491785120915636, "grad_norm": 65612.703125, "learning_rate": 7.541074395421821e-06, "loss": 0.6971, "step": 4600 }, { "epoch": 0.8676389145283367, "grad_norm": 63280.83203125, "learning_rate": 6.618054273583164e-06, "loss": 0.6942, "step": 4700 }, { "epoch": 0.8860993169651098, "grad_norm": 68478.2890625, "learning_rate": 5.6950341517445085e-06, "loss": 0.7028, "step": 4800 }, { "epoch": 0.904559719401883, "grad_norm": 68026.8125, "learning_rate": 4.772014029905853e-06, "loss": 0.6694, "step": 4900 }, { "epoch": 0.9230201218386561, "grad_norm": 64797.25390625, "learning_rate": 3.848993908067195e-06, "loss": 0.704, "step": 5000 }, { "epoch": 0.9414805242754292, "grad_norm": 71872.890625, "learning_rate": 2.9259737862285397e-06, "loss": 0.6946, "step": 5100 }, { "epoch": 0.9599409267122023, "grad_norm": 74217.3125, "learning_rate": 2.002953664389884e-06, "loss": 0.6796, "step": 5200 }, { "epoch": 0.9784013291489755, "grad_norm": 68286.15625, "learning_rate": 1.0799335425512278e-06, "loss": 0.6966, "step": 5300 }, { "epoch": 0.9968617315857485, "grad_norm": 137991.28125, "learning_rate": 1.5691342071257153e-07, "loss": 0.7014, "step": 5400 }, { "epoch": 1.0, "step": 5417, "total_flos": 8.2628573134848e+17, "train_loss": 0.7277259675255335, "train_runtime": 75113.2183, "train_samples_per_second": 0.865, "train_steps_per_second": 0.072 } ], "logging_steps": 100, "max_steps": 5417, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 5417, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.2628573134848e+17, "train_batch_size": 12, "trial_name": null, "trial_params": null }