|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 50.0, |
|
"eval_steps": 500, |
|
"global_step": 104000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.975465714931488, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9891, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 1.3330873250961304, |
|
"learning_rate": 4e-05, |
|
"loss": 0.8643, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 1.202699899673462, |
|
"learning_rate": 6e-05, |
|
"loss": 0.8188, |
|
"step": 6240 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 1.5446971654891968, |
|
"learning_rate": 8e-05, |
|
"loss": 0.7587, |
|
"step": 8320 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 1.8479889631271362, |
|
"learning_rate": 0.0001, |
|
"loss": 0.6784, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 1.271431803703308, |
|
"learning_rate": 0.00012, |
|
"loss": 0.6332, |
|
"step": 12480 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"grad_norm": 0.828289806842804, |
|
"learning_rate": 0.00014, |
|
"loss": 0.6135, |
|
"step": 14560 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 0.5532117486000061, |
|
"learning_rate": 0.00016, |
|
"loss": 0.6021, |
|
"step": 16640 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"grad_norm": 0.7734587788581848, |
|
"learning_rate": 0.00018, |
|
"loss": 0.5932, |
|
"step": 18720 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.9202972650527954, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5841, |
|
"step": 20800 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"grad_norm": 1.1167834997177124, |
|
"learning_rate": 0.0001996917333733128, |
|
"loss": 0.5765, |
|
"step": 22880 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"grad_norm": 0.7493125796318054, |
|
"learning_rate": 0.00019876883405951377, |
|
"loss": 0.5694, |
|
"step": 24960 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"grad_norm": 0.9302449226379395, |
|
"learning_rate": 0.00019723699203976766, |
|
"loss": 0.5617, |
|
"step": 27040 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"grad_norm": 0.9727580547332764, |
|
"learning_rate": 0.00019510565162951537, |
|
"loss": 0.5553, |
|
"step": 29120 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"grad_norm": 0.6252313852310181, |
|
"learning_rate": 0.0001923879532511287, |
|
"loss": 0.5488, |
|
"step": 31200 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"grad_norm": 0.6910933256149292, |
|
"learning_rate": 0.0001891006524188368, |
|
"loss": 0.5406, |
|
"step": 33280 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"grad_norm": 0.6691188812255859, |
|
"learning_rate": 0.00018526401643540922, |
|
"loss": 0.5347, |
|
"step": 35360 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"grad_norm": 0.656794548034668, |
|
"learning_rate": 0.00018090169943749476, |
|
"loss": 0.5267, |
|
"step": 37440 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"grad_norm": 0.528489887714386, |
|
"learning_rate": 0.0001760405965600031, |
|
"loss": 0.5184, |
|
"step": 39520 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"grad_norm": 0.7208832502365112, |
|
"learning_rate": 0.00017071067811865476, |
|
"loss": 0.5087, |
|
"step": 41600 |
|
}, |
|
{ |
|
"epoch": 21.0, |
|
"grad_norm": 0.6855819225311279, |
|
"learning_rate": 0.00016494480483301836, |
|
"loss": 0.4984, |
|
"step": 43680 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"grad_norm": 0.9891296625137329, |
|
"learning_rate": 0.00015877852522924732, |
|
"loss": 0.4858, |
|
"step": 45760 |
|
}, |
|
{ |
|
"epoch": 23.0, |
|
"grad_norm": 1.0229650735855103, |
|
"learning_rate": 0.0001522498564715949, |
|
"loss": 0.4719, |
|
"step": 47840 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"grad_norm": 0.9607130289077759, |
|
"learning_rate": 0.00014539904997395468, |
|
"loss": 0.4558, |
|
"step": 49920 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"grad_norm": 1.1026415824890137, |
|
"learning_rate": 0.000138268343236509, |
|
"loss": 0.4375, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"grad_norm": 0.9125514030456543, |
|
"learning_rate": 0.00013090169943749476, |
|
"loss": 0.4175, |
|
"step": 54080 |
|
}, |
|
{ |
|
"epoch": 27.0, |
|
"grad_norm": 0.9571768045425415, |
|
"learning_rate": 0.00012334453638559057, |
|
"loss": 0.3942, |
|
"step": 56160 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"grad_norm": 1.0941965579986572, |
|
"learning_rate": 0.0001156434465040231, |
|
"loss": 0.3696, |
|
"step": 58240 |
|
}, |
|
{ |
|
"epoch": 29.0, |
|
"grad_norm": 1.6244746446609497, |
|
"learning_rate": 0.0001078459095727845, |
|
"loss": 0.3424, |
|
"step": 60320 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"grad_norm": 1.409400463104248, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3138, |
|
"step": 62400 |
|
}, |
|
{ |
|
"epoch": 31.0, |
|
"grad_norm": 1.7202110290527344, |
|
"learning_rate": 9.215409042721552e-05, |
|
"loss": 0.2836, |
|
"step": 64480 |
|
}, |
|
{ |
|
"epoch": 32.0, |
|
"grad_norm": 1.6299169063568115, |
|
"learning_rate": 8.435655349597689e-05, |
|
"loss": 0.2525, |
|
"step": 66560 |
|
}, |
|
{ |
|
"epoch": 33.0, |
|
"grad_norm": 1.7053184509277344, |
|
"learning_rate": 7.66554636144095e-05, |
|
"loss": 0.2217, |
|
"step": 68640 |
|
}, |
|
{ |
|
"epoch": 34.0, |
|
"grad_norm": 1.7824922800064087, |
|
"learning_rate": 6.909830056250527e-05, |
|
"loss": 0.192, |
|
"step": 70720 |
|
}, |
|
{ |
|
"epoch": 35.0, |
|
"grad_norm": 1.8197373151779175, |
|
"learning_rate": 6.173165676349103e-05, |
|
"loss": 0.1645, |
|
"step": 72800 |
|
}, |
|
{ |
|
"epoch": 36.0, |
|
"grad_norm": 1.625858187675476, |
|
"learning_rate": 5.4600950026045326e-05, |
|
"loss": 0.1394, |
|
"step": 74880 |
|
}, |
|
{ |
|
"epoch": 37.0, |
|
"grad_norm": 1.8438661098480225, |
|
"learning_rate": 4.7750143528405126e-05, |
|
"loss": 0.1171, |
|
"step": 76960 |
|
}, |
|
{ |
|
"epoch": 38.0, |
|
"grad_norm": 1.7667992115020752, |
|
"learning_rate": 4.12214747707527e-05, |
|
"loss": 0.0976, |
|
"step": 79040 |
|
}, |
|
{ |
|
"epoch": 39.0, |
|
"grad_norm": 1.7099578380584717, |
|
"learning_rate": 3.5055195166981645e-05, |
|
"loss": 0.0814, |
|
"step": 81120 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"grad_norm": 1.9202967882156372, |
|
"learning_rate": 2.9289321881345254e-05, |
|
"loss": 0.0679, |
|
"step": 83200 |
|
}, |
|
{ |
|
"epoch": 41.0, |
|
"grad_norm": 1.7177019119262695, |
|
"learning_rate": 2.3959403439996907e-05, |
|
"loss": 0.0569, |
|
"step": 85280 |
|
}, |
|
{ |
|
"epoch": 42.0, |
|
"grad_norm": 1.6319146156311035, |
|
"learning_rate": 1.9098300562505266e-05, |
|
"loss": 0.0482, |
|
"step": 87360 |
|
}, |
|
{ |
|
"epoch": 43.0, |
|
"grad_norm": 1.3095797300338745, |
|
"learning_rate": 1.4735983564590783e-05, |
|
"loss": 0.0412, |
|
"step": 89440 |
|
}, |
|
{ |
|
"epoch": 44.0, |
|
"grad_norm": 1.5144089460372925, |
|
"learning_rate": 1.0899347581163221e-05, |
|
"loss": 0.0362, |
|
"step": 91520 |
|
}, |
|
{ |
|
"epoch": 45.0, |
|
"grad_norm": 1.2941925525665283, |
|
"learning_rate": 7.612046748871327e-06, |
|
"loss": 0.0321, |
|
"step": 93600 |
|
}, |
|
{ |
|
"epoch": 46.0, |
|
"grad_norm": 1.1236457824707031, |
|
"learning_rate": 4.8943483704846475e-06, |
|
"loss": 0.0293, |
|
"step": 95680 |
|
}, |
|
{ |
|
"epoch": 47.0, |
|
"grad_norm": 1.4110456705093384, |
|
"learning_rate": 2.7630079602323442e-06, |
|
"loss": 0.027, |
|
"step": 97760 |
|
}, |
|
{ |
|
"epoch": 48.0, |
|
"grad_norm": 1.5882657766342163, |
|
"learning_rate": 1.231165940486234e-06, |
|
"loss": 0.0258, |
|
"step": 99840 |
|
}, |
|
{ |
|
"epoch": 49.0, |
|
"grad_norm": 1.0106526613235474, |
|
"learning_rate": 3.0826662668720364e-07, |
|
"loss": 0.0249, |
|
"step": 101920 |
|
}, |
|
{ |
|
"epoch": 50.0, |
|
"grad_norm": 0.8798067569732666, |
|
"learning_rate": 0.0, |
|
"loss": 0.0246, |
|
"step": 104000 |
|
}, |
|
{ |
|
"epoch": 50.0, |
|
"step": 104000, |
|
"total_flos": 4.3571154598634004e+18, |
|
"train_loss": 0.37453963272388163, |
|
"train_runtime": 280880.5286, |
|
"train_samples_per_second": 0.741, |
|
"train_steps_per_second": 0.37 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 104000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 50, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.3571154598634004e+18, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|