|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 5417, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.018460402436773122, |
|
"grad_norm": 72293.6015625, |
|
"learning_rate": 4.907697987816135e-05, |
|
"loss": 0.7984, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.036920804873546244, |
|
"grad_norm": 86618.71875, |
|
"learning_rate": 4.815395975632269e-05, |
|
"loss": 0.7874, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.05538120731031936, |
|
"grad_norm": 82291.1953125, |
|
"learning_rate": 4.723093963448403e-05, |
|
"loss": 0.7683, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.07384160974709249, |
|
"grad_norm": 72406.7265625, |
|
"learning_rate": 4.6307919512645376e-05, |
|
"loss": 0.7721, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.0923020121838656, |
|
"grad_norm": 82859.0, |
|
"learning_rate": 4.538489939080672e-05, |
|
"loss": 0.7672, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.11076241462063872, |
|
"grad_norm": 72674.453125, |
|
"learning_rate": 4.4461879268968066e-05, |
|
"loss": 0.7741, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.12922281705741184, |
|
"grad_norm": 74598.8828125, |
|
"learning_rate": 4.3538859147129405e-05, |
|
"loss": 0.7684, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.14768321949418498, |
|
"grad_norm": 75325.09375, |
|
"learning_rate": 4.261583902529076e-05, |
|
"loss": 0.7706, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.1661436219309581, |
|
"grad_norm": 80050.4921875, |
|
"learning_rate": 4.1692818903452095e-05, |
|
"loss": 0.771, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.1846040243677312, |
|
"grad_norm": 71205.3359375, |
|
"learning_rate": 4.076979878161344e-05, |
|
"loss": 0.7458, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.20306442680450434, |
|
"grad_norm": 71098.4765625, |
|
"learning_rate": 3.9846778659774785e-05, |
|
"loss": 0.7632, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.22152482924127745, |
|
"grad_norm": 79350.5859375, |
|
"learning_rate": 3.892375853793613e-05, |
|
"loss": 0.7715, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.23998523167805058, |
|
"grad_norm": 78053.2578125, |
|
"learning_rate": 3.800073841609747e-05, |
|
"loss": 0.7582, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.2584456341148237, |
|
"grad_norm": 72609.0078125, |
|
"learning_rate": 3.707771829425882e-05, |
|
"loss": 0.7547, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.27690603655159685, |
|
"grad_norm": 75624.9453125, |
|
"learning_rate": 3.615469817242016e-05, |
|
"loss": 0.7432, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.29536643898836995, |
|
"grad_norm": 67118.828125, |
|
"learning_rate": 3.5231678050581504e-05, |
|
"loss": 0.7522, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.31382684142514305, |
|
"grad_norm": 77685.6171875, |
|
"learning_rate": 3.430865792874285e-05, |
|
"loss": 0.7529, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.3322872438619162, |
|
"grad_norm": 80151.4609375, |
|
"learning_rate": 3.338563780690419e-05, |
|
"loss": 0.7558, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.3507476462986893, |
|
"grad_norm": 68374.609375, |
|
"learning_rate": 3.246261768506554e-05, |
|
"loss": 0.7547, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.3692080487354624, |
|
"grad_norm": 69225.65625, |
|
"learning_rate": 3.153959756322688e-05, |
|
"loss": 0.7283, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.38766845117223553, |
|
"grad_norm": 93264.3984375, |
|
"learning_rate": 3.061657744138822e-05, |
|
"loss": 0.7368, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.4061288536090087, |
|
"grad_norm": 81487.3125, |
|
"learning_rate": 2.9693557319549568e-05, |
|
"loss": 0.7531, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.4245892560457818, |
|
"grad_norm": 72823.8125, |
|
"learning_rate": 2.8770537197710913e-05, |
|
"loss": 0.7216, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.4430496584825549, |
|
"grad_norm": 73118.8203125, |
|
"learning_rate": 2.7847517075872255e-05, |
|
"loss": 0.7356, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.46151006091932806, |
|
"grad_norm": 68678.484375, |
|
"learning_rate": 2.69244969540336e-05, |
|
"loss": 0.7308, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.47997046335610116, |
|
"grad_norm": 73918.640625, |
|
"learning_rate": 2.6001476832194942e-05, |
|
"loss": 0.7235, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.49843086579287427, |
|
"grad_norm": 66172.796875, |
|
"learning_rate": 2.5078456710356284e-05, |
|
"loss": 0.7283, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.5168912682296474, |
|
"grad_norm": 66544.1015625, |
|
"learning_rate": 2.415543658851763e-05, |
|
"loss": 0.7228, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.5353516706664205, |
|
"grad_norm": 66939.625, |
|
"learning_rate": 2.3232416466678974e-05, |
|
"loss": 0.7095, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.5538120731031937, |
|
"grad_norm": 67865.15625, |
|
"learning_rate": 2.230939634484032e-05, |
|
"loss": 0.7248, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.5722724755399667, |
|
"grad_norm": 63117.76953125, |
|
"learning_rate": 2.138637622300166e-05, |
|
"loss": 0.7147, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.5907328779767399, |
|
"grad_norm": 59398.1796875, |
|
"learning_rate": 2.0463356101163006e-05, |
|
"loss": 0.7031, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.6091932804135131, |
|
"grad_norm": 63816.08203125, |
|
"learning_rate": 1.954033597932435e-05, |
|
"loss": 0.7144, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.6276536828502861, |
|
"grad_norm": 70370.5625, |
|
"learning_rate": 1.8617315857485696e-05, |
|
"loss": 0.7141, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.6461140852870593, |
|
"grad_norm": 63610.734375, |
|
"learning_rate": 1.7694295735647038e-05, |
|
"loss": 0.694, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.6645744877238324, |
|
"grad_norm": 59664.05078125, |
|
"learning_rate": 1.6771275613808383e-05, |
|
"loss": 0.7086, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.6830348901606055, |
|
"grad_norm": 64207.88671875, |
|
"learning_rate": 1.5848255491969728e-05, |
|
"loss": 0.7138, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.7014952925973786, |
|
"grad_norm": 79748.2734375, |
|
"learning_rate": 1.4925235370131068e-05, |
|
"loss": 0.6985, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.7199556950341518, |
|
"grad_norm": 72216.671875, |
|
"learning_rate": 1.4002215248292413e-05, |
|
"loss": 0.7019, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.7384160974709248, |
|
"grad_norm": 74815.6328125, |
|
"learning_rate": 1.3079195126453758e-05, |
|
"loss": 0.687, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.756876499907698, |
|
"grad_norm": 61482.41796875, |
|
"learning_rate": 1.2156175004615102e-05, |
|
"loss": 0.6983, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.7753369023444711, |
|
"grad_norm": 73800.75, |
|
"learning_rate": 1.1233154882776445e-05, |
|
"loss": 0.7004, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.7937973047812442, |
|
"grad_norm": 75985.421875, |
|
"learning_rate": 1.0310134760937789e-05, |
|
"loss": 0.7097, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.8122577072180174, |
|
"grad_norm": 67175.03125, |
|
"learning_rate": 9.387114639099132e-06, |
|
"loss": 0.6781, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.8307181096547904, |
|
"grad_norm": 68520.875, |
|
"learning_rate": 8.464094517260476e-06, |
|
"loss": 0.6857, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.8491785120915636, |
|
"grad_norm": 65612.703125, |
|
"learning_rate": 7.541074395421821e-06, |
|
"loss": 0.6971, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.8676389145283367, |
|
"grad_norm": 63280.83203125, |
|
"learning_rate": 6.618054273583164e-06, |
|
"loss": 0.6942, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.8860993169651098, |
|
"grad_norm": 68478.2890625, |
|
"learning_rate": 5.6950341517445085e-06, |
|
"loss": 0.7028, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.904559719401883, |
|
"grad_norm": 68026.8125, |
|
"learning_rate": 4.772014029905853e-06, |
|
"loss": 0.6694, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.9230201218386561, |
|
"grad_norm": 64797.25390625, |
|
"learning_rate": 3.848993908067195e-06, |
|
"loss": 0.704, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.9414805242754292, |
|
"grad_norm": 71872.890625, |
|
"learning_rate": 2.9259737862285397e-06, |
|
"loss": 0.6946, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.9599409267122023, |
|
"grad_norm": 74217.3125, |
|
"learning_rate": 2.002953664389884e-06, |
|
"loss": 0.6796, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.9784013291489755, |
|
"grad_norm": 68286.15625, |
|
"learning_rate": 1.0799335425512278e-06, |
|
"loss": 0.6966, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.9968617315857485, |
|
"grad_norm": 137991.28125, |
|
"learning_rate": 1.5691342071257153e-07, |
|
"loss": 0.7014, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 5417, |
|
"total_flos": 8.2628573134848e+17, |
|
"train_loss": 0.7277259675255335, |
|
"train_runtime": 75113.2183, |
|
"train_samples_per_second": 0.865, |
|
"train_steps_per_second": 0.072 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 5417, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 5417, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 8.2628573134848e+17, |
|
"train_batch_size": 12, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|