|
{ |
|
"best_metric": 1.2635489702224731, |
|
"best_model_checkpoint": "data/Llama-31-8B_task-1_120-samples_config-4/checkpoint-137", |
|
"epoch": 32.0, |
|
"eval_steps": 500, |
|
"global_step": 176, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.18181818181818182, |
|
"grad_norm": 1.874562382698059, |
|
"learning_rate": 1.3333333333333336e-07, |
|
"loss": 2.2898, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.36363636363636365, |
|
"grad_norm": 1.8042479753494263, |
|
"learning_rate": 2.666666666666667e-07, |
|
"loss": 2.0811, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.7272727272727273, |
|
"grad_norm": 1.760155200958252, |
|
"learning_rate": 5.333333333333335e-07, |
|
"loss": 2.121, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.9090909090909091, |
|
"eval_loss": 2.101999044418335, |
|
"eval_runtime": 9.6381, |
|
"eval_samples_per_second": 2.49, |
|
"eval_steps_per_second": 2.49, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 1.0909090909090908, |
|
"grad_norm": 1.5958633422851562, |
|
"learning_rate": 8.000000000000001e-07, |
|
"loss": 1.9468, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 1.4545454545454546, |
|
"grad_norm": 1.8069952726364136, |
|
"learning_rate": 1.066666666666667e-06, |
|
"loss": 2.1471, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 1.8181818181818183, |
|
"grad_norm": 1.736191749572754, |
|
"learning_rate": 1.3333333333333334e-06, |
|
"loss": 2.0709, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 2.0930726528167725, |
|
"eval_runtime": 9.6259, |
|
"eval_samples_per_second": 2.493, |
|
"eval_steps_per_second": 2.493, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 2.1818181818181817, |
|
"grad_norm": 1.6809439659118652, |
|
"learning_rate": 1.6000000000000001e-06, |
|
"loss": 2.1306, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 2.5454545454545454, |
|
"grad_norm": 1.747848391532898, |
|
"learning_rate": 1.8666666666666669e-06, |
|
"loss": 2.0776, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 2.909090909090909, |
|
"grad_norm": 1.8399686813354492, |
|
"learning_rate": 2.133333333333334e-06, |
|
"loss": 2.0454, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 2.909090909090909, |
|
"eval_loss": 2.075482130050659, |
|
"eval_runtime": 9.6366, |
|
"eval_samples_per_second": 2.491, |
|
"eval_steps_per_second": 2.491, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 3.2727272727272725, |
|
"grad_norm": 1.8816026449203491, |
|
"learning_rate": 2.4000000000000003e-06, |
|
"loss": 2.1284, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 3.6363636363636362, |
|
"grad_norm": 1.5757534503936768, |
|
"learning_rate": 2.666666666666667e-06, |
|
"loss": 2.03, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 1.9466415643692017, |
|
"learning_rate": 2.9333333333333338e-06, |
|
"loss": 2.0502, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 2.0472075939178467, |
|
"eval_runtime": 9.6277, |
|
"eval_samples_per_second": 2.493, |
|
"eval_steps_per_second": 2.493, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 4.363636363636363, |
|
"grad_norm": 1.6615264415740967, |
|
"learning_rate": 3.2000000000000003e-06, |
|
"loss": 2.0197, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 4.7272727272727275, |
|
"grad_norm": 1.7619231939315796, |
|
"learning_rate": 3.4666666666666672e-06, |
|
"loss": 2.0511, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 4.909090909090909, |
|
"eval_loss": 2.0100386142730713, |
|
"eval_runtime": 9.6313, |
|
"eval_samples_per_second": 2.492, |
|
"eval_steps_per_second": 2.492, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 5.090909090909091, |
|
"grad_norm": 1.8135912418365479, |
|
"learning_rate": 3.7333333333333337e-06, |
|
"loss": 1.9759, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 5.454545454545454, |
|
"grad_norm": 1.8354995250701904, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 2.0128, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 5.818181818181818, |
|
"grad_norm": 1.8015000820159912, |
|
"learning_rate": 4.266666666666668e-06, |
|
"loss": 1.9554, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 1.9471648931503296, |
|
"eval_runtime": 9.6325, |
|
"eval_samples_per_second": 2.492, |
|
"eval_steps_per_second": 2.492, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 6.181818181818182, |
|
"grad_norm": 1.7802411317825317, |
|
"learning_rate": 4.533333333333334e-06, |
|
"loss": 1.9607, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 6.545454545454545, |
|
"grad_norm": 1.5615451335906982, |
|
"learning_rate": 4.800000000000001e-06, |
|
"loss": 1.9032, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 6.909090909090909, |
|
"grad_norm": 1.8741137981414795, |
|
"learning_rate": 5.0666666666666676e-06, |
|
"loss": 1.8921, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 6.909090909090909, |
|
"eval_loss": 1.8795281648635864, |
|
"eval_runtime": 9.6403, |
|
"eval_samples_per_second": 2.49, |
|
"eval_steps_per_second": 2.49, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 7.2727272727272725, |
|
"grad_norm": 1.7604111433029175, |
|
"learning_rate": 5.333333333333334e-06, |
|
"loss": 1.8681, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 7.636363636363637, |
|
"grad_norm": 1.6821084022521973, |
|
"learning_rate": 5.600000000000001e-06, |
|
"loss": 1.769, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 1.665964126586914, |
|
"learning_rate": 5.8666666666666675e-06, |
|
"loss": 1.8104, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 1.7812843322753906, |
|
"eval_runtime": 9.6236, |
|
"eval_samples_per_second": 2.494, |
|
"eval_steps_per_second": 2.494, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 8.363636363636363, |
|
"grad_norm": 1.5216890573501587, |
|
"learning_rate": 6.133333333333334e-06, |
|
"loss": 1.7145, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 8.727272727272727, |
|
"grad_norm": 1.4722410440444946, |
|
"learning_rate": 6.4000000000000006e-06, |
|
"loss": 1.7636, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 8.909090909090908, |
|
"eval_loss": 1.6937414407730103, |
|
"eval_runtime": 9.6259, |
|
"eval_samples_per_second": 2.493, |
|
"eval_steps_per_second": 2.493, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 9.090909090909092, |
|
"grad_norm": 1.2136281728744507, |
|
"learning_rate": 6.666666666666667e-06, |
|
"loss": 1.659, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 9.454545454545455, |
|
"grad_norm": 1.0023685693740845, |
|
"learning_rate": 6.9333333333333344e-06, |
|
"loss": 1.6509, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 9.818181818181818, |
|
"grad_norm": 1.0440162420272827, |
|
"learning_rate": 7.2000000000000005e-06, |
|
"loss": 1.6011, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_loss": 1.6141911745071411, |
|
"eval_runtime": 9.6302, |
|
"eval_samples_per_second": 2.492, |
|
"eval_steps_per_second": 2.492, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 10.181818181818182, |
|
"grad_norm": 0.7877157926559448, |
|
"learning_rate": 7.4666666666666675e-06, |
|
"loss": 1.5814, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 10.545454545454545, |
|
"grad_norm": 0.6534942984580994, |
|
"learning_rate": 7.733333333333334e-06, |
|
"loss": 1.5824, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 10.909090909090908, |
|
"grad_norm": 0.6240991950035095, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 1.5128, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 10.909090909090908, |
|
"eval_loss": 1.5751093626022339, |
|
"eval_runtime": 9.6475, |
|
"eval_samples_per_second": 2.488, |
|
"eval_steps_per_second": 2.488, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 11.272727272727273, |
|
"grad_norm": 0.6224139928817749, |
|
"learning_rate": 8.266666666666667e-06, |
|
"loss": 1.5444, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 11.636363636363637, |
|
"grad_norm": 0.6345284581184387, |
|
"learning_rate": 8.533333333333335e-06, |
|
"loss": 1.5682, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"grad_norm": 0.5680299997329712, |
|
"learning_rate": 8.8e-06, |
|
"loss": 1.4277, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_loss": 1.5352739095687866, |
|
"eval_runtime": 9.6312, |
|
"eval_samples_per_second": 2.492, |
|
"eval_steps_per_second": 2.492, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 12.363636363636363, |
|
"grad_norm": 0.5991209745407104, |
|
"learning_rate": 9.066666666666667e-06, |
|
"loss": 1.4703, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 12.727272727272727, |
|
"grad_norm": 0.5993205308914185, |
|
"learning_rate": 9.333333333333334e-06, |
|
"loss": 1.4998, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 12.909090909090908, |
|
"eval_loss": 1.5001062154769897, |
|
"eval_runtime": 9.6248, |
|
"eval_samples_per_second": 2.494, |
|
"eval_steps_per_second": 2.494, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 13.090909090909092, |
|
"grad_norm": 0.5633314251899719, |
|
"learning_rate": 9.600000000000001e-06, |
|
"loss": 1.445, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 13.454545454545455, |
|
"grad_norm": 0.5419648885726929, |
|
"learning_rate": 9.866666666666668e-06, |
|
"loss": 1.4256, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 13.818181818181818, |
|
"grad_norm": 0.5384172201156616, |
|
"learning_rate": 9.999945845889795e-06, |
|
"loss": 1.4154, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_loss": 1.4582971334457397, |
|
"eval_runtime": 9.6218, |
|
"eval_samples_per_second": 2.494, |
|
"eval_steps_per_second": 2.494, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 14.181818181818182, |
|
"grad_norm": 0.6161755323410034, |
|
"learning_rate": 9.999512620046523e-06, |
|
"loss": 1.4459, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 14.545454545454545, |
|
"grad_norm": 0.5570430159568787, |
|
"learning_rate": 9.99864620589731e-06, |
|
"loss": 1.3661, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 14.909090909090908, |
|
"grad_norm": 0.5637471675872803, |
|
"learning_rate": 9.99734667851357e-06, |
|
"loss": 1.4201, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 14.909090909090908, |
|
"eval_loss": 1.4252301454544067, |
|
"eval_runtime": 9.6319, |
|
"eval_samples_per_second": 2.492, |
|
"eval_steps_per_second": 2.492, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 15.272727272727273, |
|
"grad_norm": 0.5539014935493469, |
|
"learning_rate": 9.995614150494293e-06, |
|
"loss": 1.3497, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 15.636363636363637, |
|
"grad_norm": 0.5583813786506653, |
|
"learning_rate": 9.993448771956285e-06, |
|
"loss": 1.3512, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"grad_norm": 0.5377728939056396, |
|
"learning_rate": 9.99085073052117e-06, |
|
"loss": 1.3364, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_loss": 1.3921159505844116, |
|
"eval_runtime": 9.635, |
|
"eval_samples_per_second": 2.491, |
|
"eval_steps_per_second": 2.491, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 16.363636363636363, |
|
"grad_norm": 0.5390649437904358, |
|
"learning_rate": 9.987820251299121e-06, |
|
"loss": 1.3614, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 16.727272727272727, |
|
"grad_norm": 0.5126790404319763, |
|
"learning_rate": 9.984357596869369e-06, |
|
"loss": 1.2762, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 16.90909090909091, |
|
"eval_loss": 1.3691315650939941, |
|
"eval_runtime": 9.6319, |
|
"eval_samples_per_second": 2.492, |
|
"eval_steps_per_second": 2.492, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 17.09090909090909, |
|
"grad_norm": 0.5642189383506775, |
|
"learning_rate": 9.980463067257437e-06, |
|
"loss": 1.2961, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 17.454545454545453, |
|
"grad_norm": 0.5290245413780212, |
|
"learning_rate": 9.976136999909156e-06, |
|
"loss": 1.1987, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 17.818181818181817, |
|
"grad_norm": 0.5963008403778076, |
|
"learning_rate": 9.971379769661422e-06, |
|
"loss": 1.2851, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_loss": 1.3436861038208008, |
|
"eval_runtime": 9.6214, |
|
"eval_samples_per_second": 2.494, |
|
"eval_steps_per_second": 2.494, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 18.181818181818183, |
|
"grad_norm": 0.5820615291595459, |
|
"learning_rate": 9.966191788709716e-06, |
|
"loss": 1.329, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 18.545454545454547, |
|
"grad_norm": 0.5619508624076843, |
|
"learning_rate": 9.960573506572391e-06, |
|
"loss": 1.2428, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 18.90909090909091, |
|
"grad_norm": 0.5272645950317383, |
|
"learning_rate": 9.95452541005172e-06, |
|
"loss": 1.2239, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 18.90909090909091, |
|
"eval_loss": 1.3261139392852783, |
|
"eval_runtime": 9.6475, |
|
"eval_samples_per_second": 2.488, |
|
"eval_steps_per_second": 2.488, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 19.272727272727273, |
|
"grad_norm": 0.5720901489257812, |
|
"learning_rate": 9.948048023191728e-06, |
|
"loss": 1.1753, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 19.636363636363637, |
|
"grad_norm": 0.5877869725227356, |
|
"learning_rate": 9.941141907232766e-06, |
|
"loss": 1.2334, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"grad_norm": 0.5674625039100647, |
|
"learning_rate": 9.933807660562898e-06, |
|
"loss": 1.221, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_loss": 1.308407187461853, |
|
"eval_runtime": 9.6226, |
|
"eval_samples_per_second": 2.494, |
|
"eval_steps_per_second": 2.494, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 20.363636363636363, |
|
"grad_norm": 0.5934170484542847, |
|
"learning_rate": 9.926045918666045e-06, |
|
"loss": 1.1685, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 20.727272727272727, |
|
"grad_norm": 0.6199212670326233, |
|
"learning_rate": 9.91785735406693e-06, |
|
"loss": 1.2011, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 20.90909090909091, |
|
"eval_loss": 1.2950953245162964, |
|
"eval_runtime": 9.6285, |
|
"eval_samples_per_second": 2.493, |
|
"eval_steps_per_second": 2.493, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 21.09090909090909, |
|
"grad_norm": 0.5995011329650879, |
|
"learning_rate": 9.909242676272797e-06, |
|
"loss": 1.1717, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 21.454545454545453, |
|
"grad_norm": 0.6024748682975769, |
|
"learning_rate": 9.90020263171194e-06, |
|
"loss": 1.1654, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 21.818181818181817, |
|
"grad_norm": 0.6147428750991821, |
|
"learning_rate": 9.890738003669029e-06, |
|
"loss": 1.1433, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"eval_loss": 1.2823587656021118, |
|
"eval_runtime": 9.6228, |
|
"eval_samples_per_second": 2.494, |
|
"eval_steps_per_second": 2.494, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 22.181818181818183, |
|
"grad_norm": 0.612140953540802, |
|
"learning_rate": 9.880849612217238e-06, |
|
"loss": 1.0765, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 22.545454545454547, |
|
"grad_norm": 0.647298276424408, |
|
"learning_rate": 9.870538314147194e-06, |
|
"loss": 1.1183, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 22.90909090909091, |
|
"grad_norm": 0.6705971360206604, |
|
"learning_rate": 9.859805002892733e-06, |
|
"loss": 1.1579, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 22.90909090909091, |
|
"eval_loss": 1.2746213674545288, |
|
"eval_runtime": 9.6328, |
|
"eval_samples_per_second": 2.491, |
|
"eval_steps_per_second": 2.491, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 23.272727272727273, |
|
"grad_norm": 0.670023500919342, |
|
"learning_rate": 9.84865060845349e-06, |
|
"loss": 1.0965, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 23.636363636363637, |
|
"grad_norm": 0.6824691891670227, |
|
"learning_rate": 9.83707609731432e-06, |
|
"loss": 1.061, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"grad_norm": 0.6598721146583557, |
|
"learning_rate": 9.825082472361558e-06, |
|
"loss": 1.0871, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"eval_loss": 1.268039345741272, |
|
"eval_runtime": 9.6219, |
|
"eval_samples_per_second": 2.494, |
|
"eval_steps_per_second": 2.494, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 24.363636363636363, |
|
"grad_norm": 0.6824683547019958, |
|
"learning_rate": 9.812670772796113e-06, |
|
"loss": 1.0733, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 24.727272727272727, |
|
"grad_norm": 0.7309969663619995, |
|
"learning_rate": 9.799842074043438e-06, |
|
"loss": 1.0745, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 24.90909090909091, |
|
"eval_loss": 1.2635489702224731, |
|
"eval_runtime": 9.6334, |
|
"eval_samples_per_second": 2.491, |
|
"eval_steps_per_second": 2.491, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 25.09090909090909, |
|
"grad_norm": 0.8717047572135925, |
|
"learning_rate": 9.786597487660336e-06, |
|
"loss": 1.0049, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 25.454545454545453, |
|
"grad_norm": 0.7290262579917908, |
|
"learning_rate": 9.77293816123866e-06, |
|
"loss": 1.0355, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 25.818181818181817, |
|
"grad_norm": 0.8125291466712952, |
|
"learning_rate": 9.75886527830587e-06, |
|
"loss": 1.0006, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"eval_loss": 1.2674241065979004, |
|
"eval_runtime": 9.6242, |
|
"eval_samples_per_second": 2.494, |
|
"eval_steps_per_second": 2.494, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 26.181818181818183, |
|
"grad_norm": 0.777037501335144, |
|
"learning_rate": 9.744380058222483e-06, |
|
"loss": 1.0235, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 26.545454545454547, |
|
"grad_norm": 0.7910988330841064, |
|
"learning_rate": 9.729483756076436e-06, |
|
"loss": 1.0119, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 26.90909090909091, |
|
"grad_norm": 0.8250744342803955, |
|
"learning_rate": 9.714177662574316e-06, |
|
"loss": 0.9628, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 26.90909090909091, |
|
"eval_loss": 1.2688733339309692, |
|
"eval_runtime": 9.6388, |
|
"eval_samples_per_second": 2.49, |
|
"eval_steps_per_second": 2.49, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 27.272727272727273, |
|
"grad_norm": 0.9542063474655151, |
|
"learning_rate": 9.698463103929542e-06, |
|
"loss": 0.9176, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 27.636363636363637, |
|
"grad_norm": 0.8577086925506592, |
|
"learning_rate": 9.682341441747446e-06, |
|
"loss": 0.9908, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"grad_norm": 0.8569504022598267, |
|
"learning_rate": 9.665814072907293e-06, |
|
"loss": 0.9237, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"eval_loss": 1.2716994285583496, |
|
"eval_runtime": 9.6271, |
|
"eval_samples_per_second": 2.493, |
|
"eval_steps_per_second": 2.493, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 28.363636363636363, |
|
"grad_norm": 0.933702826499939, |
|
"learning_rate": 9.648882429441258e-06, |
|
"loss": 0.9053, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 28.727272727272727, |
|
"grad_norm": 1.002100944519043, |
|
"learning_rate": 9.63154797841033e-06, |
|
"loss": 0.8824, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 28.90909090909091, |
|
"eval_loss": 1.2879880666732788, |
|
"eval_runtime": 9.6501, |
|
"eval_samples_per_second": 2.487, |
|
"eval_steps_per_second": 2.487, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 29.09090909090909, |
|
"grad_norm": 0.9883065819740295, |
|
"learning_rate": 9.613812221777212e-06, |
|
"loss": 0.9089, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 29.454545454545453, |
|
"grad_norm": 1.0561895370483398, |
|
"learning_rate": 9.595676696276173e-06, |
|
"loss": 0.9285, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 29.818181818181817, |
|
"grad_norm": 1.1776067018508911, |
|
"learning_rate": 9.577142973279896e-06, |
|
"loss": 0.8706, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"eval_loss": 1.296054482460022, |
|
"eval_runtime": 9.6279, |
|
"eval_samples_per_second": 2.493, |
|
"eval_steps_per_second": 2.493, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 30.181818181818183, |
|
"grad_norm": 1.0879513025283813, |
|
"learning_rate": 9.55821265866333e-06, |
|
"loss": 0.7961, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 30.545454545454547, |
|
"grad_norm": 1.1668307781219482, |
|
"learning_rate": 9.538887392664544e-06, |
|
"loss": 0.7865, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 30.90909090909091, |
|
"grad_norm": 1.065364956855774, |
|
"learning_rate": 9.519168849742603e-06, |
|
"loss": 0.8328, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 30.90909090909091, |
|
"eval_loss": 1.326621174812317, |
|
"eval_runtime": 9.6286, |
|
"eval_samples_per_second": 2.493, |
|
"eval_steps_per_second": 2.493, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 31.272727272727273, |
|
"grad_norm": 1.143373727798462, |
|
"learning_rate": 9.499058738432492e-06, |
|
"loss": 0.8381, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 31.636363636363637, |
|
"grad_norm": 1.1452257633209229, |
|
"learning_rate": 9.478558801197065e-06, |
|
"loss": 0.7725, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 32.0, |
|
"grad_norm": 1.2163513898849487, |
|
"learning_rate": 9.457670814276083e-06, |
|
"loss": 0.7667, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 32.0, |
|
"eval_loss": 1.344734787940979, |
|
"eval_runtime": 9.6252, |
|
"eval_samples_per_second": 2.493, |
|
"eval_steps_per_second": 2.493, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 32.0, |
|
"step": 176, |
|
"total_flos": 4.322507713465549e+16, |
|
"train_loss": 1.4077397015961735, |
|
"train_runtime": 3298.1261, |
|
"train_samples_per_second": 4.002, |
|
"train_steps_per_second": 0.227 |
|
} |
|
], |
|
"logging_steps": 2, |
|
"max_steps": 750, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 150, |
|
"save_steps": 25, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 7, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.322507713465549e+16, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|