{ "best_metric": 1.159261703491211, "best_model_checkpoint": "data/tinyllama_moe_sft_ultrachat200k_v2/checkpoint-1100", "epoch": 0.9995619798510732, "eval_steps": 100, "global_step": 1141, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.7391304347826088e-07, "loss": 2.7477, "step": 1 }, { "epoch": 0.0, "learning_rate": 8.695652173913044e-07, "loss": 2.6989, "step": 5 }, { "epoch": 0.01, "learning_rate": 1.7391304347826088e-06, "loss": 2.7026, "step": 10 }, { "epoch": 0.01, "learning_rate": 2.6086956521739132e-06, "loss": 2.5994, "step": 15 }, { "epoch": 0.02, "learning_rate": 3.4782608695652175e-06, "loss": 2.4353, "step": 20 }, { "epoch": 0.02, "learning_rate": 4.347826086956522e-06, "loss": 2.0773, "step": 25 }, { "epoch": 0.03, "learning_rate": 5.2173913043478265e-06, "loss": 1.8706, "step": 30 }, { "epoch": 0.03, "learning_rate": 6.086956521739132e-06, "loss": 1.7352, "step": 35 }, { "epoch": 0.04, "learning_rate": 6.956521739130435e-06, "loss": 1.6574, "step": 40 }, { "epoch": 0.04, "learning_rate": 7.82608695652174e-06, "loss": 1.5879, "step": 45 }, { "epoch": 0.04, "learning_rate": 8.695652173913044e-06, "loss": 1.5429, "step": 50 }, { "epoch": 0.05, "learning_rate": 9.565217391304349e-06, "loss": 1.4804, "step": 55 }, { "epoch": 0.05, "learning_rate": 1.0434782608695653e-05, "loss": 1.4633, "step": 60 }, { "epoch": 0.06, "learning_rate": 1.1304347826086957e-05, "loss": 1.4199, "step": 65 }, { "epoch": 0.06, "learning_rate": 1.2173913043478263e-05, "loss": 1.4117, "step": 70 }, { "epoch": 0.07, "learning_rate": 1.3043478260869566e-05, "loss": 1.3667, "step": 75 }, { "epoch": 0.07, "learning_rate": 1.391304347826087e-05, "loss": 1.359, "step": 80 }, { "epoch": 0.07, "learning_rate": 1.4782608695652174e-05, "loss": 1.3563, "step": 85 }, { "epoch": 0.08, "learning_rate": 1.565217391304348e-05, "loss": 1.3425, "step": 90 }, { "epoch": 0.08, "learning_rate": 1.6521739130434785e-05, "loss": 1.3117, "step": 95 }, { "epoch": 0.09, "learning_rate": 1.739130434782609e-05, "loss": 1.336, "step": 100 }, { "epoch": 0.09, "eval_loss": 1.3140063285827637, "eval_runtime": 441.4184, "eval_samples_per_second": 36.623, "eval_steps_per_second": 1.146, "step": 100 }, { "epoch": 0.09, "learning_rate": 1.8260869565217393e-05, "loss": 1.3087, "step": 105 }, { "epoch": 0.1, "learning_rate": 1.9130434782608697e-05, "loss": 1.2993, "step": 110 }, { "epoch": 0.1, "learning_rate": 2e-05, "loss": 1.2998, "step": 115 }, { "epoch": 0.11, "learning_rate": 1.99988280568259e-05, "loss": 1.2738, "step": 120 }, { "epoch": 0.11, "learning_rate": 1.9995312501993765e-05, "loss": 1.277, "step": 125 }, { "epoch": 0.11, "learning_rate": 1.998945415950969e-05, "loss": 1.255, "step": 130 }, { "epoch": 0.12, "learning_rate": 1.9981254402502568e-05, "loss": 1.271, "step": 135 }, { "epoch": 0.12, "learning_rate": 1.9970715152902257e-05, "loss": 1.282, "step": 140 }, { "epoch": 0.13, "learning_rate": 1.9957838880989076e-05, "loss": 1.2598, "step": 145 }, { "epoch": 0.13, "learning_rate": 1.9942628604814827e-05, "loss": 1.2623, "step": 150 }, { "epoch": 0.14, "learning_rate": 1.9925087889495374e-05, "loss": 1.2542, "step": 155 }, { "epoch": 0.14, "learning_rate": 1.990522084637503e-05, "loss": 1.2481, "step": 160 }, { "epoch": 0.14, "learning_rate": 1.9883032132062926e-05, "loss": 1.2445, "step": 165 }, { "epoch": 0.15, "learning_rate": 1.98585269473415e-05, "loss": 1.2483, "step": 170 }, { "epoch": 0.15, "learning_rate": 1.9831711035947552e-05, "loss": 1.2575, "step": 175 }, { "epoch": 0.16, "learning_rate": 1.9802590683225945e-05, "loss": 1.2308, "step": 180 }, { "epoch": 0.16, "learning_rate": 1.97711727146564e-05, "loss": 1.2328, "step": 185 }, { "epoch": 0.17, "learning_rate": 1.973746449425368e-05, "loss": 1.2423, "step": 190 }, { "epoch": 0.17, "learning_rate": 1.970147392284154e-05, "loss": 1.2272, "step": 195 }, { "epoch": 0.18, "learning_rate": 1.9663209436200887e-05, "loss": 1.2426, "step": 200 }, { "epoch": 0.18, "eval_loss": 1.2375621795654297, "eval_runtime": 439.5821, "eval_samples_per_second": 36.776, "eval_steps_per_second": 1.151, "step": 200 }, { "epoch": 0.18, "learning_rate": 1.9622680003092503e-05, "loss": 1.2404, "step": 205 }, { "epoch": 0.18, "learning_rate": 1.957989512315489e-05, "loss": 1.2137, "step": 210 }, { "epoch": 0.19, "learning_rate": 1.953486482467764e-05, "loss": 1.2462, "step": 215 }, { "epoch": 0.19, "learning_rate": 1.9487599662250945e-05, "loss": 1.2199, "step": 220 }, { "epoch": 0.2, "learning_rate": 1.9438110714291697e-05, "loss": 1.2038, "step": 225 }, { "epoch": 0.2, "learning_rate": 1.9386409580446846e-05, "loss": 1.2223, "step": 230 }, { "epoch": 0.21, "learning_rate": 1.933250837887457e-05, "loss": 1.2325, "step": 235 }, { "epoch": 0.21, "learning_rate": 1.9276419743403934e-05, "loss": 1.2049, "step": 240 }, { "epoch": 0.21, "learning_rate": 1.9218156820573618e-05, "loss": 1.2341, "step": 245 }, { "epoch": 0.22, "learning_rate": 1.9157733266550577e-05, "loss": 1.2086, "step": 250 }, { "epoch": 0.22, "learning_rate": 1.9095163243929143e-05, "loss": 1.2172, "step": 255 }, { "epoch": 0.23, "learning_rate": 1.9030461418411498e-05, "loss": 1.2212, "step": 260 }, { "epoch": 0.23, "learning_rate": 1.8963642955370203e-05, "loss": 1.2019, "step": 265 }, { "epoch": 0.24, "learning_rate": 1.889472351629358e-05, "loss": 1.2031, "step": 270 }, { "epoch": 0.24, "learning_rate": 1.882371925511488e-05, "loss": 1.2174, "step": 275 }, { "epoch": 0.25, "learning_rate": 1.875064681442594e-05, "loss": 1.1925, "step": 280 }, { "epoch": 0.25, "learning_rate": 1.867552332157637e-05, "loss": 1.2244, "step": 285 }, { "epoch": 0.25, "learning_rate": 1.8598366384659113e-05, "loss": 1.1928, "step": 290 }, { "epoch": 0.26, "learning_rate": 1.851919408838327e-05, "loss": 1.2125, "step": 295 }, { "epoch": 0.26, "learning_rate": 1.843802498983529e-05, "loss": 1.2083, "step": 300 }, { "epoch": 0.26, "eval_loss": 1.2099547386169434, "eval_runtime": 440.0729, "eval_samples_per_second": 36.735, "eval_steps_per_second": 1.15, "step": 300 }, { "epoch": 0.27, "learning_rate": 1.8354878114129368e-05, "loss": 1.2108, "step": 305 }, { "epoch": 0.27, "learning_rate": 1.8269772949948185e-05, "loss": 1.1779, "step": 310 }, { "epoch": 0.28, "learning_rate": 1.8182729444974993e-05, "loss": 1.2009, "step": 315 }, { "epoch": 0.28, "learning_rate": 1.8093768001218096e-05, "loss": 1.1952, "step": 320 }, { "epoch": 0.28, "learning_rate": 1.800290947022884e-05, "loss": 1.1917, "step": 325 }, { "epoch": 0.29, "learning_rate": 1.7910175148214274e-05, "loss": 1.1865, "step": 330 }, { "epoch": 0.29, "learning_rate": 1.7815586771045535e-05, "loss": 1.238, "step": 335 }, { "epoch": 0.3, "learning_rate": 1.771916650916321e-05, "loss": 1.1902, "step": 340 }, { "epoch": 0.3, "learning_rate": 1.762093696238086e-05, "loss": 1.1978, "step": 345 }, { "epoch": 0.31, "learning_rate": 1.752092115458784e-05, "loss": 1.1917, "step": 350 }, { "epoch": 0.31, "learning_rate": 1.7419142528352815e-05, "loss": 1.1908, "step": 355 }, { "epoch": 0.32, "learning_rate": 1.731562493942904e-05, "loss": 1.1964, "step": 360 }, { "epoch": 0.32, "learning_rate": 1.721039265116285e-05, "loss": 1.1929, "step": 365 }, { "epoch": 0.32, "learning_rate": 1.710347032880664e-05, "loss": 1.1871, "step": 370 }, { "epoch": 0.33, "learning_rate": 1.6994883033737582e-05, "loss": 1.1787, "step": 375 }, { "epoch": 0.33, "learning_rate": 1.688465621758352e-05, "loss": 1.1942, "step": 380 }, { "epoch": 0.34, "learning_rate": 1.6772815716257414e-05, "loss": 1.1841, "step": 385 }, { "epoch": 0.34, "learning_rate": 1.6659387743901688e-05, "loss": 1.1773, "step": 390 }, { "epoch": 0.35, "learning_rate": 1.6544398886743934e-05, "loss": 1.1789, "step": 395 }, { "epoch": 0.35, "learning_rate": 1.6427876096865394e-05, "loss": 1.1862, "step": 400 }, { "epoch": 0.35, "eval_loss": 1.1934226751327515, "eval_runtime": 441.3165, "eval_samples_per_second": 36.631, "eval_steps_per_second": 1.147, "step": 400 }, { "epoch": 0.35, "learning_rate": 1.6309846685883726e-05, "loss": 1.2072, "step": 405 }, { "epoch": 0.36, "learning_rate": 1.6190338318551426e-05, "loss": 1.1989, "step": 410 }, { "epoch": 0.36, "learning_rate": 1.606937900627157e-05, "loss": 1.1853, "step": 415 }, { "epoch": 0.37, "learning_rate": 1.594699710053223e-05, "loss": 1.1869, "step": 420 }, { "epoch": 0.37, "learning_rate": 1.5823221286261217e-05, "loss": 1.1782, "step": 425 }, { "epoch": 0.38, "learning_rate": 1.5698080575102662e-05, "loss": 1.1856, "step": 430 }, { "epoch": 0.38, "learning_rate": 1.557160429861702e-05, "loss": 1.1724, "step": 435 }, { "epoch": 0.39, "learning_rate": 1.5443822101406066e-05, "loss": 1.1868, "step": 440 }, { "epoch": 0.39, "learning_rate": 1.531476393416456e-05, "loss": 1.1764, "step": 445 }, { "epoch": 0.39, "learning_rate": 1.5184460046660139e-05, "loss": 1.2012, "step": 450 }, { "epoch": 0.4, "learning_rate": 1.50529409806431e-05, "loss": 1.1718, "step": 455 }, { "epoch": 0.4, "learning_rate": 1.4920237562687784e-05, "loss": 1.1708, "step": 460 }, { "epoch": 0.41, "learning_rate": 1.478638089696716e-05, "loss": 1.1811, "step": 465 }, { "epoch": 0.41, "learning_rate": 1.4651402357962368e-05, "loss": 1.1852, "step": 470 }, { "epoch": 0.42, "learning_rate": 1.4515333583108896e-05, "loss": 1.1792, "step": 475 }, { "epoch": 0.42, "learning_rate": 1.4378206465381122e-05, "loss": 1.1866, "step": 480 }, { "epoch": 0.42, "learning_rate": 1.4240053145816968e-05, "loss": 1.1529, "step": 485 }, { "epoch": 0.43, "learning_rate": 1.4100906005984404e-05, "loss": 1.1858, "step": 490 }, { "epoch": 0.43, "learning_rate": 1.396079766039157e-05, "loss": 1.1694, "step": 495 }, { "epoch": 0.44, "learning_rate": 1.381976094884232e-05, "loss": 1.1567, "step": 500 }, { "epoch": 0.44, "eval_loss": 1.181981086730957, "eval_runtime": 440.5529, "eval_samples_per_second": 36.695, "eval_steps_per_second": 1.149, "step": 500 }, { "epoch": 0.44, "learning_rate": 1.3677828928738934e-05, "loss": 1.1706, "step": 505 }, { "epoch": 0.45, "learning_rate": 1.3535034867333838e-05, "loss": 1.1738, "step": 510 }, { "epoch": 0.45, "learning_rate": 1.3391412233932148e-05, "loss": 1.1816, "step": 515 }, { "epoch": 0.46, "learning_rate": 1.3246994692046837e-05, "loss": 1.1519, "step": 520 }, { "epoch": 0.46, "learning_rate": 1.3101816091508389e-05, "loss": 1.1715, "step": 525 }, { "epoch": 0.46, "learning_rate": 1.2955910460530787e-05, "loss": 1.162, "step": 530 }, { "epoch": 0.47, "learning_rate": 1.2809311997735697e-05, "loss": 1.1583, "step": 535 }, { "epoch": 0.47, "learning_rate": 1.266205506413667e-05, "loss": 1.173, "step": 540 }, { "epoch": 0.48, "learning_rate": 1.2514174175085346e-05, "loss": 1.1622, "step": 545 }, { "epoch": 0.48, "learning_rate": 1.2365703992181425e-05, "loss": 1.1742, "step": 550 }, { "epoch": 0.49, "learning_rate": 1.2216679315148388e-05, "loss": 1.1752, "step": 555 }, { "epoch": 0.49, "learning_rate": 1.2067135073676841e-05, "loss": 1.1819, "step": 560 }, { "epoch": 0.49, "learning_rate": 1.1917106319237386e-05, "loss": 1.179, "step": 565 }, { "epoch": 0.5, "learning_rate": 1.1766628216864961e-05, "loss": 1.1706, "step": 570 }, { "epoch": 0.5, "learning_rate": 1.161573603691655e-05, "loss": 1.1717, "step": 575 }, { "epoch": 0.51, "learning_rate": 1.1464465146804218e-05, "loss": 1.1563, "step": 580 }, { "epoch": 0.51, "learning_rate": 1.1312851002705383e-05, "loss": 1.1721, "step": 585 }, { "epoch": 0.52, "learning_rate": 1.1160929141252303e-05, "loss": 1.1635, "step": 590 }, { "epoch": 0.52, "learning_rate": 1.1008735171202685e-05, "loss": 1.1883, "step": 595 }, { "epoch": 0.53, "learning_rate": 1.0856304765093391e-05, "loss": 1.1777, "step": 600 }, { "epoch": 0.53, "eval_loss": 1.1737236976623535, "eval_runtime": 439.8017, "eval_samples_per_second": 36.757, "eval_steps_per_second": 1.151, "step": 600 }, { "epoch": 0.53, "learning_rate": 1.0703673650879219e-05, "loss": 1.1483, "step": 605 }, { "epoch": 0.53, "learning_rate": 1.0550877603558656e-05, "loss": 1.1606, "step": 610 }, { "epoch": 0.54, "learning_rate": 1.0397952436788643e-05, "loss": 1.1813, "step": 615 }, { "epoch": 0.54, "learning_rate": 1.024493399449025e-05, "loss": 1.1574, "step": 620 }, { "epoch": 0.55, "learning_rate": 1.0091858142447266e-05, "loss": 1.1614, "step": 625 }, { "epoch": 0.55, "learning_rate": 9.938760759899674e-06, "loss": 1.1617, "step": 630 }, { "epoch": 0.56, "learning_rate": 9.785677731133972e-06, "loss": 1.1814, "step": 635 }, { "epoch": 0.56, "learning_rate": 9.632644937072277e-06, "loss": 1.1725, "step": 640 }, { "epoch": 0.57, "learning_rate": 9.479698246862277e-06, "loss": 1.1692, "step": 645 }, { "epoch": 0.57, "learning_rate": 9.326873509469887e-06, "loss": 1.1591, "step": 650 }, { "epoch": 0.57, "learning_rate": 9.174206545276678e-06, "loss": 1.1652, "step": 655 }, { "epoch": 0.58, "learning_rate": 9.021733137683963e-06, "loss": 1.1701, "step": 660 }, { "epoch": 0.58, "learning_rate": 8.869489024725595e-06, "loss": 1.1608, "step": 665 }, { "epoch": 0.59, "learning_rate": 8.717509890691369e-06, "loss": 1.1485, "step": 670 }, { "epoch": 0.59, "learning_rate": 8.565831357763039e-06, "loss": 1.1717, "step": 675 }, { "epoch": 0.6, "learning_rate": 8.414488977664858e-06, "loss": 1.1788, "step": 680 }, { "epoch": 0.6, "learning_rate": 8.263518223330698e-06, "loss": 1.1595, "step": 685 }, { "epoch": 0.6, "learning_rate": 8.112954480589558e-06, "loss": 1.1612, "step": 690 }, { "epoch": 0.61, "learning_rate": 7.962833039871562e-06, "loss": 1.1542, "step": 695 }, { "epoch": 0.61, "learning_rate": 7.813189087936243e-06, "loss": 1.1666, "step": 700 }, { "epoch": 0.61, "eval_loss": 1.167747974395752, "eval_runtime": 441.5379, "eval_samples_per_second": 36.613, "eval_steps_per_second": 1.146, "step": 700 }, { "epoch": 0.62, "learning_rate": 7.664057699625215e-06, "loss": 1.1644, "step": 705 }, { "epoch": 0.62, "learning_rate": 7.515473829640987e-06, "loss": 1.1369, "step": 710 }, { "epoch": 0.63, "learning_rate": 7.367472304354011e-06, "loss": 1.1445, "step": 715 }, { "epoch": 0.63, "learning_rate": 7.2200878136397355e-06, "loss": 1.1747, "step": 720 }, { "epoch": 0.64, "learning_rate": 7.073354902747742e-06, "loss": 1.1431, "step": 725 }, { "epoch": 0.64, "learning_rate": 6.927307964204695e-06, "loss": 1.1846, "step": 730 }, { "epoch": 0.64, "learning_rate": 6.781981229753145e-06, "loss": 1.1693, "step": 735 }, { "epoch": 0.65, "learning_rate": 6.637408762327972e-06, "loss": 1.1575, "step": 740 }, { "epoch": 0.65, "learning_rate": 6.4936244480724575e-06, "loss": 1.1708, "step": 745 }, { "epoch": 0.66, "learning_rate": 6.350661988395723e-06, "loss": 1.156, "step": 750 }, { "epoch": 0.66, "learning_rate": 6.208554892073528e-06, "loss": 1.1704, "step": 755 }, { "epoch": 0.67, "learning_rate": 6.067336467394169e-06, "loss": 1.1596, "step": 760 }, { "epoch": 0.67, "learning_rate": 5.927039814351426e-06, "loss": 1.1438, "step": 765 }, { "epoch": 0.67, "learning_rate": 5.787697816886273e-06, "loss": 1.1443, "step": 770 }, { "epoch": 0.68, "learning_rate": 5.649343135179271e-06, "loss": 1.1509, "step": 775 }, { "epoch": 0.68, "learning_rate": 5.512008197995379e-06, "loss": 1.1432, "step": 780 }, { "epoch": 0.69, "learning_rate": 5.375725195083046e-06, "loss": 1.1613, "step": 785 }, { "epoch": 0.69, "learning_rate": 5.240526069629265e-06, "loss": 1.1385, "step": 790 }, { "epoch": 0.7, "learning_rate": 5.106442510772489e-06, "loss": 1.1672, "step": 795 }, { "epoch": 0.7, "learning_rate": 4.97350594617502e-06, "loss": 1.1531, "step": 800 }, { "epoch": 0.7, "eval_loss": 1.1635608673095703, "eval_runtime": 440.8581, "eval_samples_per_second": 36.669, "eval_steps_per_second": 1.148, "step": 800 }, { "epoch": 0.71, "learning_rate": 4.8417475346567635e-06, "loss": 1.138, "step": 805 }, { "epoch": 0.71, "learning_rate": 4.711198158891909e-06, "loss": 1.1437, "step": 810 }, { "epoch": 0.71, "learning_rate": 4.581888418170429e-06, "loss": 1.141, "step": 815 }, { "epoch": 0.72, "learning_rate": 4.453848621225913e-06, "loss": 1.1341, "step": 820 }, { "epoch": 0.72, "learning_rate": 4.327108779131573e-06, "loss": 1.1565, "step": 825 }, { "epoch": 0.73, "learning_rate": 4.201698598265973e-06, "loss": 1.1516, "step": 830 }, { "epoch": 0.73, "learning_rate": 4.077647473350201e-06, "loss": 1.1764, "step": 835 }, { "epoch": 0.74, "learning_rate": 3.954984480558071e-06, "loss": 1.1538, "step": 840 }, { "epoch": 0.74, "learning_rate": 3.83373837070101e-06, "loss": 1.1691, "step": 845 }, { "epoch": 0.74, "learning_rate": 3.7139375624891795e-06, "loss": 1.1479, "step": 850 }, { "epoch": 0.75, "learning_rate": 3.595610135870472e-06, "loss": 1.1609, "step": 855 }, { "epoch": 0.75, "learning_rate": 3.478783825448869e-06, "loss": 1.1556, "step": 860 }, { "epoch": 0.76, "learning_rate": 3.3634860139837877e-06, "loss": 1.1711, "step": 865 }, { "epoch": 0.76, "learning_rate": 3.249743725971849e-06, "loss": 1.1623, "step": 870 }, { "epoch": 0.77, "learning_rate": 3.1375836213126653e-06, "loss": 1.144, "step": 875 }, { "epoch": 0.77, "learning_rate": 3.0270319890600465e-06, "loss": 1.1572, "step": 880 }, { "epoch": 0.78, "learning_rate": 2.918114741260156e-06, "loss": 1.1427, "step": 885 }, { "epoch": 0.78, "learning_rate": 2.8108574068780093e-06, "loss": 1.1427, "step": 890 }, { "epoch": 0.78, "learning_rate": 2.7052851258137936e-06, "loss": 1.1488, "step": 895 }, { "epoch": 0.79, "learning_rate": 2.601422643010335e-06, "loss": 1.1525, "step": 900 }, { "epoch": 0.79, "eval_loss": 1.1609553098678589, "eval_runtime": 441.3737, "eval_samples_per_second": 36.627, "eval_steps_per_second": 1.146, "step": 900 }, { "epoch": 0.79, "learning_rate": 2.4992943026531935e-06, "loss": 1.1582, "step": 905 }, { "epoch": 0.8, "learning_rate": 2.3989240424646355e-06, "loss": 1.1551, "step": 910 }, { "epoch": 0.8, "learning_rate": 2.300335388092929e-06, "loss": 1.1683, "step": 915 }, { "epoch": 0.81, "learning_rate": 2.2035514475981756e-06, "loss": 1.1447, "step": 920 }, { "epoch": 0.81, "learning_rate": 2.1085949060360654e-06, "loss": 1.1543, "step": 925 }, { "epoch": 0.81, "learning_rate": 2.015488020140737e-06, "loss": 1.1448, "step": 930 }, { "epoch": 0.82, "learning_rate": 1.924252613108073e-06, "loss": 1.1677, "step": 935 }, { "epoch": 0.82, "learning_rate": 1.8349100694805711e-06, "loss": 1.1586, "step": 940 }, { "epoch": 0.83, "learning_rate": 1.7474813301350668e-06, "loss": 1.1592, "step": 945 }, { "epoch": 0.83, "learning_rate": 1.661986887374415e-06, "loss": 1.1702, "step": 950 }, { "epoch": 0.84, "learning_rate": 1.578446780124344e-06, "loss": 1.1667, "step": 955 }, { "epoch": 0.84, "learning_rate": 1.49688058923654e-06, "loss": 1.1489, "step": 960 }, { "epoch": 0.85, "learning_rate": 1.4173074328991376e-06, "loss": 1.1448, "step": 965 }, { "epoch": 0.85, "learning_rate": 1.339745962155613e-06, "loss": 1.1531, "step": 970 }, { "epoch": 0.85, "learning_rate": 1.2642143565332154e-06, "loss": 1.1668, "step": 975 }, { "epoch": 0.86, "learning_rate": 1.1907303197818665e-06, "loss": 1.1618, "step": 980 }, { "epoch": 0.86, "learning_rate": 1.1193110757246251e-06, "loss": 1.1456, "step": 985 }, { "epoch": 0.87, "learning_rate": 1.0499733642206034e-06, "loss": 1.1493, "step": 990 }, { "epoch": 0.87, "learning_rate": 9.827334372413444e-07, "loss": 1.1657, "step": 995 }, { "epoch": 0.88, "learning_rate": 9.176070550615379e-07, "loss": 1.1396, "step": 1000 }, { "epoch": 0.88, "eval_loss": 1.1596382856369019, "eval_runtime": 440.1747, "eval_samples_per_second": 36.726, "eval_steps_per_second": 1.15, "step": 1000 }, { "epoch": 0.88, "learning_rate": 8.546094825649909e-07, "loss": 1.1409, "step": 1005 }, { "epoch": 0.88, "learning_rate": 7.937554856667196e-07, "loss": 1.1476, "step": 1010 }, { "epoch": 0.89, "learning_rate": 7.350593278519824e-07, "loss": 1.1424, "step": 1015 }, { "epoch": 0.89, "learning_rate": 6.785347668330777e-07, "loss": 1.1394, "step": 1020 }, { "epoch": 0.9, "learning_rate": 6.241950513246931e-07, "loss": 1.159, "step": 1025 }, { "epoch": 0.9, "learning_rate": 5.720529179385659e-07, "loss": 1.139, "step": 1030 }, { "epoch": 0.91, "learning_rate": 5.221205881981594e-07, "loss": 1.15, "step": 1035 }, { "epoch": 0.91, "learning_rate": 4.7440976567407096e-07, "loss": 1.1592, "step": 1040 }, { "epoch": 0.92, "learning_rate": 4.2893163324085886e-07, "loss": 1.1574, "step": 1045 }, { "epoch": 0.92, "learning_rate": 3.856968504558989e-07, "loss": 1.1491, "step": 1050 }, { "epoch": 0.92, "learning_rate": 3.4471555106090573e-07, "loss": 1.1452, "step": 1055 }, { "epoch": 0.93, "learning_rate": 3.059973406066963e-07, "loss": 1.1886, "step": 1060 }, { "epoch": 0.93, "learning_rate": 2.6955129420176193e-07, "loss": 1.1573, "step": 1065 }, { "epoch": 0.94, "learning_rate": 2.3538595438516442e-07, "loss": 1.1489, "step": 1070 }, { "epoch": 0.94, "learning_rate": 2.035093291242607e-07, "loss": 1.1433, "step": 1075 }, { "epoch": 0.95, "learning_rate": 1.7392888993773005e-07, "loss": 1.1521, "step": 1080 }, { "epoch": 0.95, "learning_rate": 1.466515701443294e-07, "loss": 1.1386, "step": 1085 }, { "epoch": 0.95, "learning_rate": 1.2168376323780652e-07, "loss": 1.1377, "step": 1090 }, { "epoch": 0.96, "learning_rate": 9.90313213883376e-08, "loss": 1.1554, "step": 1095 }, { "epoch": 0.96, "learning_rate": 7.86995540708424e-08, "loss": 1.1681, "step": 1100 }, { "epoch": 0.96, "eval_loss": 1.159261703491211, "eval_runtime": 441.2771, "eval_samples_per_second": 36.635, "eval_steps_per_second": 1.147, "step": 1100 }, { "epoch": 0.97, "learning_rate": 6.069322682050516e-08, "loss": 1.1621, "step": 1105 }, { "epoch": 0.97, "learning_rate": 4.501656011579037e-08, "loss": 1.135, "step": 1110 }, { "epoch": 0.98, "learning_rate": 3.167322838920406e-08, "loss": 1.1301, "step": 1115 }, { "epoch": 0.98, "learning_rate": 2.066635916605386e-08, "loss": 1.1462, "step": 1120 }, { "epoch": 0.99, "learning_rate": 1.1998532331389812e-08, "loss": 1.1398, "step": 1125 }, { "epoch": 0.99, "learning_rate": 5.671779525311394e-09, "loss": 1.1599, "step": 1130 }, { "epoch": 0.99, "learning_rate": 1.6875836667729073e-09, "loss": 1.1463, "step": 1135 }, { "epoch": 1.0, "learning_rate": 4.687860599927874e-11, "loss": 1.1576, "step": 1140 }, { "epoch": 1.0, "step": 1141, "total_flos": 1.1420601686146679e+19, "train_loss": 1.2278979039839963, "train_runtime": 17901.7657, "train_samples_per_second": 8.159, "train_steps_per_second": 0.064 } ], "logging_steps": 5, "max_steps": 1141, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 1.1420601686146679e+19, "train_batch_size": 16, "trial_name": null, "trial_params": null }