{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 199, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01507537688442211, "grad_norm": 1.6901472806930542, "learning_rate": 4e-05, "loss": 2.1346, "step": 1 }, { "epoch": 0.03015075376884422, "grad_norm": 1.9821981191635132, "learning_rate": 8e-05, "loss": 2.2738, "step": 2 }, { "epoch": 0.04522613065326633, "grad_norm": 2.084845781326294, "learning_rate": 0.00012, "loss": 2.2929, "step": 3 }, { "epoch": 0.06030150753768844, "grad_norm": 1.9646457433700562, "learning_rate": 0.00016, "loss": 2.209, "step": 4 }, { "epoch": 0.07537688442211055, "grad_norm": 1.6424840688705444, "learning_rate": 0.0002, "loss": 1.9858, "step": 5 }, { "epoch": 0.09045226130653267, "grad_norm": 1.2806246280670166, "learning_rate": 0.0001999881995299069, "loss": 1.7872, "step": 6 }, { "epoch": 0.10552763819095477, "grad_norm": 1.2541472911834717, "learning_rate": 0.00019995280121409636, "loss": 1.5485, "step": 7 }, { "epoch": 0.12060301507537688, "grad_norm": 1.6472628116607666, "learning_rate": 0.00019989381433516316, "loss": 1.4097, "step": 8 }, { "epoch": 0.135678391959799, "grad_norm": 0.9935224056243896, "learning_rate": 0.00019981125436139405, "loss": 1.218, "step": 9 }, { "epoch": 0.1507537688442211, "grad_norm": 1.0851980447769165, "learning_rate": 0.00019970514294271124, "loss": 1.1863, "step": 10 }, { "epoch": 0.1658291457286432, "grad_norm": 0.9265928268432617, "learning_rate": 0.00019957550790499526, "loss": 1.0748, "step": 11 }, { "epoch": 0.18090452261306533, "grad_norm": 0.8681296110153198, "learning_rate": 0.00019942238324278803, "loss": 0.9753, "step": 12 }, { "epoch": 0.19597989949748743, "grad_norm": 1.4610599279403687, "learning_rate": 0.00019924580911037827, "loss": 1.2652, "step": 13 }, { "epoch": 0.21105527638190955, "grad_norm": 2.1508164405822754, "learning_rate": 0.00019904583181127206, "loss": 1.4455, "step": 14 }, { "epoch": 0.22613065326633167, "grad_norm": 1.1126750707626343, "learning_rate": 0.00019882250378605015, "loss": 1.3346, "step": 15 }, { "epoch": 0.24120603015075376, "grad_norm": 0.8268455266952515, "learning_rate": 0.0001985758835986167, "loss": 1.2101, "step": 16 }, { "epoch": 0.2562814070351759, "grad_norm": 1.060780644416809, "learning_rate": 0.0001983060359208415, "loss": 1.1943, "step": 17 }, { "epoch": 0.271356783919598, "grad_norm": 0.8756381273269653, "learning_rate": 0.00019801303151560138, "loss": 1.1907, "step": 18 }, { "epoch": 0.2864321608040201, "grad_norm": 0.548784077167511, "learning_rate": 0.00019769694721822337, "loss": 1.0515, "step": 19 }, { "epoch": 0.3015075376884422, "grad_norm": 0.5907925367355347, "learning_rate": 0.00019735786591633633, "loss": 1.0562, "step": 20 }, { "epoch": 0.3165829145728643, "grad_norm": 0.6034897565841675, "learning_rate": 0.00019699587652813503, "loss": 1.0364, "step": 21 }, { "epoch": 0.3316582914572864, "grad_norm": 0.5467134714126587, "learning_rate": 0.00019661107397906275, "loss": 1.0261, "step": 22 }, { "epoch": 0.34673366834170855, "grad_norm": 0.5134853720664978, "learning_rate": 0.00019620355917691884, "loss": 1.028, "step": 23 }, { "epoch": 0.36180904522613067, "grad_norm": 0.46228307485580444, "learning_rate": 0.00019577343898539748, "loss": 0.8807, "step": 24 }, { "epoch": 0.3768844221105528, "grad_norm": 0.7203683257102966, "learning_rate": 0.00019532082619606436, "loss": 0.8448, "step": 25 }, { "epoch": 0.39195979899497485, "grad_norm": 0.8550315499305725, "learning_rate": 0.00019484583949877908, "loss": 1.316, "step": 26 }, { "epoch": 0.40703517587939697, "grad_norm": 0.7188591957092285, "learning_rate": 0.00019434860345057096, "loss": 1.1819, "step": 27 }, { "epoch": 0.4221105527638191, "grad_norm": 0.4791303873062134, "learning_rate": 0.00019382924844297582, "loss": 1.1297, "step": 28 }, { "epoch": 0.4371859296482412, "grad_norm": 0.4127240478992462, "learning_rate": 0.0001932879106678434, "loss": 1.1469, "step": 29 }, { "epoch": 0.45226130653266333, "grad_norm": 0.5261316895484924, "learning_rate": 0.00019272473208162313, "loss": 1.0941, "step": 30 }, { "epoch": 0.46733668341708545, "grad_norm": 0.5941463708877563, "learning_rate": 0.00019213986036813863, "loss": 1.0389, "step": 31 }, { "epoch": 0.4824120603015075, "grad_norm": 0.6233500838279724, "learning_rate": 0.00019153344889986023, "loss": 1.0168, "step": 32 }, { "epoch": 0.49748743718592964, "grad_norm": 0.5181835889816284, "learning_rate": 0.0001909056566976856, "loss": 1.0081, "step": 33 }, { "epoch": 0.5125628140703518, "grad_norm": 0.4656156301498413, "learning_rate": 0.0001902566483892393, "loss": 0.9737, "step": 34 }, { "epoch": 0.5276381909547738, "grad_norm": 0.4048329293727875, "learning_rate": 0.00018958659416570212, "loss": 0.9317, "step": 35 }, { "epoch": 0.542713567839196, "grad_norm": 0.44168567657470703, "learning_rate": 0.0001888956697371813, "loss": 0.9283, "step": 36 }, { "epoch": 0.5577889447236181, "grad_norm": 0.4204486906528473, "learning_rate": 0.0001881840562866336, "loss": 0.8511, "step": 37 }, { "epoch": 0.5728643216080402, "grad_norm": 0.5346744656562805, "learning_rate": 0.0001874519404223533, "loss": 1.046, "step": 38 }, { "epoch": 0.5879396984924623, "grad_norm": 0.8491763472557068, "learning_rate": 0.00018669951412903725, "loss": 1.2271, "step": 39 }, { "epoch": 0.6030150753768844, "grad_norm": 0.7040531039237976, "learning_rate": 0.0001859269747174404, "loss": 1.1974, "step": 40 }, { "epoch": 0.6180904522613065, "grad_norm": 0.5016492605209351, "learning_rate": 0.0001851345247726344, "loss": 1.1492, "step": 41 }, { "epoch": 0.6331658291457286, "grad_norm": 0.37369680404663086, "learning_rate": 0.00018432237210088307, "loss": 1.0994, "step": 42 }, { "epoch": 0.6482412060301508, "grad_norm": 0.44978925585746765, "learning_rate": 0.00018349072967514896, "loss": 1.0548, "step": 43 }, { "epoch": 0.6633165829145728, "grad_norm": 0.6317250728607178, "learning_rate": 0.00018263981557924483, "loss": 1.0422, "step": 44 }, { "epoch": 0.678391959798995, "grad_norm": 0.6987444162368774, "learning_rate": 0.00018176985295064487, "loss": 1.0336, "step": 45 }, { "epoch": 0.6934673366834171, "grad_norm": 0.6445143818855286, "learning_rate": 0.00018088106992197091, "loss": 0.9867, "step": 46 }, { "epoch": 0.7085427135678392, "grad_norm": 0.5495292544364929, "learning_rate": 0.00017997369956116845, "loss": 0.9443, "step": 47 }, { "epoch": 0.7236180904522613, "grad_norm": 0.4450797438621521, "learning_rate": 0.00017904797981038874, "loss": 0.9513, "step": 48 }, { "epoch": 0.7386934673366834, "grad_norm": 0.43298906087875366, "learning_rate": 0.00017810415342359257, "loss": 0.8488, "step": 49 }, { "epoch": 0.7537688442211056, "grad_norm": 0.4602642059326172, "learning_rate": 0.00017714246790289214, "loss": 0.7555, "step": 50 }, { "epoch": 0.7688442211055276, "grad_norm": 0.7265316247940063, "learning_rate": 0.00017616317543364804, "loss": 1.2305, "step": 51 }, { "epoch": 0.7839195979899497, "grad_norm": 0.749794065952301, "learning_rate": 0.00017516653281833794, "loss": 1.1962, "step": 52 }, { "epoch": 0.7989949748743719, "grad_norm": 0.688567042350769, "learning_rate": 0.00017415280140921463, "loss": 1.1589, "step": 53 }, { "epoch": 0.8140703517587939, "grad_norm": 0.5382574200630188, "learning_rate": 0.00017312224703977094, "loss": 1.115, "step": 54 }, { "epoch": 0.8291457286432161, "grad_norm": 0.44251611828804016, "learning_rate": 0.00017207513995502939, "loss": 1.0856, "step": 55 }, { "epoch": 0.8442211055276382, "grad_norm": 0.3398284614086151, "learning_rate": 0.0001710117547406753, "loss": 1.0265, "step": 56 }, { "epoch": 0.8592964824120602, "grad_norm": 0.34842661023139954, "learning_rate": 0.0001699323702510513, "loss": 0.9424, "step": 57 }, { "epoch": 0.8743718592964824, "grad_norm": 0.45725005865097046, "learning_rate": 0.00016883726953603273, "loss": 0.9276, "step": 58 }, { "epoch": 0.8894472361809045, "grad_norm": 0.5306225419044495, "learning_rate": 0.0001677267397668026, "loss": 0.8902, "step": 59 }, { "epoch": 0.9045226130653267, "grad_norm": 0.6097261309623718, "learning_rate": 0.000166601072160546, "loss": 0.9818, "step": 60 }, { "epoch": 0.9195979899497487, "grad_norm": 0.6303573846817017, "learning_rate": 0.0001654605619040835, "loss": 0.9683, "step": 61 }, { "epoch": 0.9346733668341709, "grad_norm": 0.5870427489280701, "learning_rate": 0.00016430550807646323, "loss": 0.8378, "step": 62 }, { "epoch": 0.949748743718593, "grad_norm": 0.4092012941837311, "learning_rate": 0.00016313621357053306, "loss": 0.9721, "step": 63 }, { "epoch": 0.964824120603015, "grad_norm": 0.3502945899963379, "learning_rate": 0.00016195298501351177, "loss": 1.087, "step": 64 }, { "epoch": 0.9798994974874372, "grad_norm": 0.4699198305606842, "learning_rate": 0.00016075613268658157, "loss": 0.9937, "step": 65 }, { "epoch": 0.9949748743718593, "grad_norm": 0.469721257686615, "learning_rate": 0.00015954597044352234, "loss": 0.8848, "step": 66 }, { "epoch": 1.0100502512562815, "grad_norm": 0.39075490832328796, "learning_rate": 0.00015832281562840856, "loss": 1.0625, "step": 67 }, { "epoch": 1.0251256281407035, "grad_norm": 0.42702537775039673, "learning_rate": 0.00015708698899239172, "loss": 1.1133, "step": 68 }, { "epoch": 1.0402010050251256, "grad_norm": 0.4092056155204773, "learning_rate": 0.00015583881460958868, "loss": 1.0902, "step": 69 }, { "epoch": 1.0552763819095476, "grad_norm": 0.37982651591300964, "learning_rate": 0.0001545786197920989, "loss": 1.0461, "step": 70 }, { "epoch": 1.07035175879397, "grad_norm": 0.3565632402896881, "learning_rate": 0.0001533067350041725, "loss": 1.0392, "step": 71 }, { "epoch": 1.085427135678392, "grad_norm": 0.309384822845459, "learning_rate": 0.00015202349377555166, "loss": 0.9599, "step": 72 }, { "epoch": 1.100502512562814, "grad_norm": 0.3414682149887085, "learning_rate": 0.0001507292326140085, "loss": 0.9381, "step": 73 }, { "epoch": 1.1155778894472361, "grad_norm": 0.36480242013931274, "learning_rate": 0.00014942429091710141, "loss": 0.8947, "step": 74 }, { "epoch": 1.1306532663316582, "grad_norm": 0.42747026681900024, "learning_rate": 0.00014810901088317414, "loss": 0.8695, "step": 75 }, { "epoch": 1.1457286432160805, "grad_norm": 0.46534058451652527, "learning_rate": 0.00014678373742162007, "loss": 0.9033, "step": 76 }, { "epoch": 1.1608040201005025, "grad_norm": 0.48176416754722595, "learning_rate": 0.00014544881806243583, "loss": 0.8566, "step": 77 }, { "epoch": 1.1758793969849246, "grad_norm": 0.4470821022987366, "learning_rate": 0.00014410460286508762, "loss": 0.81, "step": 78 }, { "epoch": 1.1909547738693467, "grad_norm": 0.4553522765636444, "learning_rate": 0.0001427514443267139, "loss": 0.8016, "step": 79 }, { "epoch": 1.2060301507537687, "grad_norm": 0.37652871012687683, "learning_rate": 0.0001413896972896894, "loss": 1.1188, "step": 80 }, { "epoch": 1.221105527638191, "grad_norm": 0.36291682720184326, "learning_rate": 0.0001400197188485739, "loss": 1.109, "step": 81 }, { "epoch": 1.236180904522613, "grad_norm": 0.43163785338401794, "learning_rate": 0.00013864186825646995, "loss": 1.0842, "step": 82 }, { "epoch": 1.2512562814070352, "grad_norm": 0.4752475619316101, "learning_rate": 0.00013725650683081556, "loss": 1.0216, "step": 83 }, { "epoch": 1.2663316582914572, "grad_norm": 0.47904953360557556, "learning_rate": 0.00013586399785863454, "loss": 0.9856, "step": 84 }, { "epoch": 1.2814070351758793, "grad_norm": 0.4482368230819702, "learning_rate": 0.0001344647065012709, "loss": 0.962, "step": 85 }, { "epoch": 1.2964824120603016, "grad_norm": 0.40822041034698486, "learning_rate": 0.0001330589996986315, "loss": 0.8775, "step": 86 }, { "epoch": 1.3115577889447236, "grad_norm": 0.39425739645957947, "learning_rate": 0.00013164724607296285, "loss": 0.8851, "step": 87 }, { "epoch": 1.3266331658291457, "grad_norm": 0.40144336223602295, "learning_rate": 0.00013022981583218565, "loss": 0.8532, "step": 88 }, { "epoch": 1.3417085427135678, "grad_norm": 0.4694520831108093, "learning_rate": 0.00012880708067281477, "loss": 0.8961, "step": 89 }, { "epoch": 1.3567839195979898, "grad_norm": 0.467843234539032, "learning_rate": 0.00012737941368248792, "loss": 0.7872, "step": 90 }, { "epoch": 1.3718592964824121, "grad_norm": 0.5175732374191284, "learning_rate": 0.00012594718924213008, "loss": 0.7416, "step": 91 }, { "epoch": 1.3869346733668342, "grad_norm": 0.44360604882240295, "learning_rate": 0.00012451078292777837, "loss": 1.0284, "step": 92 }, { "epoch": 1.4020100502512562, "grad_norm": 0.4175772964954376, "learning_rate": 0.00012307057141209415, "loss": 1.1058, "step": 93 }, { "epoch": 1.4170854271356783, "grad_norm": 0.36412638425827026, "learning_rate": 0.00012162693236558658, "loss": 1.071, "step": 94 }, { "epoch": 1.4321608040201004, "grad_norm": 0.35895222425460815, "learning_rate": 0.0001201802443575756, "loss": 1.0445, "step": 95 }, { "epoch": 1.4472361809045227, "grad_norm": 0.41202953457832336, "learning_rate": 0.00011873088675691835, "loss": 0.9895, "step": 96 }, { "epoch": 1.4623115577889447, "grad_norm": 0.49337854981422424, "learning_rate": 0.0001172792396325264, "loss": 1.0112, "step": 97 }, { "epoch": 1.4773869346733668, "grad_norm": 0.45489662885665894, "learning_rate": 0.00011582568365369924, "loss": 0.9251, "step": 98 }, { "epoch": 1.492462311557789, "grad_norm": 0.476253479719162, "learning_rate": 0.00011437059999030035, "loss": 0.9277, "step": 99 }, { "epoch": 1.507537688442211, "grad_norm": 0.37511464953422546, "learning_rate": 0.00011291437021280205, "loss": 0.8723, "step": 100 }, { "epoch": 1.5226130653266332, "grad_norm": 0.4103985130786896, "learning_rate": 0.00011145737619222516, "loss": 0.8183, "step": 101 }, { "epoch": 1.5376884422110553, "grad_norm": 0.4598234295845032, "learning_rate": 0.00011000000000000002, "loss": 0.8668, "step": 102 }, { "epoch": 1.5527638190954773, "grad_norm": 0.45140963792800903, "learning_rate": 0.00010854262380777486, "loss": 0.7755, "step": 103 }, { "epoch": 1.5678391959798996, "grad_norm": 0.5093435645103455, "learning_rate": 0.000107085629787198, "loss": 0.7673, "step": 104 }, { "epoch": 1.5829145728643215, "grad_norm": 0.360904723405838, "learning_rate": 0.0001056294000096997, "loss": 1.1205, "step": 105 }, { "epoch": 1.5979899497487438, "grad_norm": 0.377037912607193, "learning_rate": 0.0001041743163463008, "loss": 1.0803, "step": 106 }, { "epoch": 1.6130653266331658, "grad_norm": 0.35898640751838684, "learning_rate": 0.00010272076036747365, "loss": 1.0416, "step": 107 }, { "epoch": 1.6281407035175879, "grad_norm": 0.36311081051826477, "learning_rate": 0.00010126911324308168, "loss": 0.9846, "step": 108 }, { "epoch": 1.6432160804020102, "grad_norm": 0.3633027970790863, "learning_rate": 9.981975564242443e-05, "loss": 0.9981, "step": 109 }, { "epoch": 1.658291457286432, "grad_norm": 0.3827294409275055, "learning_rate": 9.837306763441345e-05, "loss": 0.9411, "step": 110 }, { "epoch": 1.6733668341708543, "grad_norm": 0.39575091004371643, "learning_rate": 9.692942858790591e-05, "loss": 0.9055, "step": 111 }, { "epoch": 1.6884422110552764, "grad_norm": 0.43744927644729614, "learning_rate": 9.548921707222163e-05, "loss": 0.8801, "step": 112 }, { "epoch": 1.7035175879396984, "grad_norm": 0.4338492453098297, "learning_rate": 9.405281075786995e-05, "loss": 0.8653, "step": 113 }, { "epoch": 1.7185929648241207, "grad_norm": 0.39823731780052185, "learning_rate": 9.26205863175121e-05, "loss": 0.8548, "step": 114 }, { "epoch": 1.7336683417085426, "grad_norm": 0.42383068799972534, "learning_rate": 9.119291932718525e-05, "loss": 0.7412, "step": 115 }, { "epoch": 1.7487437185929648, "grad_norm": 0.46130019426345825, "learning_rate": 8.97701841678144e-05, "loss": 0.6798, "step": 116 }, { "epoch": 1.763819095477387, "grad_norm": 0.3968804180622101, "learning_rate": 8.835275392703721e-05, "loss": 0.9816, "step": 117 }, { "epoch": 1.778894472361809, "grad_norm": 0.37212514877319336, "learning_rate": 8.694100030136849e-05, "loss": 1.0961, "step": 118 }, { "epoch": 1.7939698492462313, "grad_norm": 0.35753509402275085, "learning_rate": 8.553529349872916e-05, "loss": 1.0364, "step": 119 }, { "epoch": 1.809045226130653, "grad_norm": 0.3995687961578369, "learning_rate": 8.413600214136548e-05, "loss": 1.0853, "step": 120 }, { "epoch": 1.8241206030150754, "grad_norm": 0.35571298003196716, "learning_rate": 8.274349316918446e-05, "loss": 0.9598, "step": 121 }, { "epoch": 1.8391959798994975, "grad_norm": 0.36723199486732483, "learning_rate": 8.135813174353008e-05, "loss": 0.9179, "step": 122 }, { "epoch": 1.8542713567839195, "grad_norm": 0.3806018531322479, "learning_rate": 7.998028115142617e-05, "loss": 0.9019, "step": 123 }, { "epoch": 1.8693467336683418, "grad_norm": 0.413316547870636, "learning_rate": 7.86103027103106e-05, "loss": 0.8889, "step": 124 }, { "epoch": 1.8844221105527639, "grad_norm": 0.4195359945297241, "learning_rate": 7.724855567328613e-05, "loss": 0.8677, "step": 125 }, { "epoch": 1.899497487437186, "grad_norm": 0.42447102069854736, "learning_rate": 7.58953971349124e-05, "loss": 0.8344, "step": 126 }, { "epoch": 1.914572864321608, "grad_norm": 0.42873820662498474, "learning_rate": 7.455118193756419e-05, "loss": 0.8165, "step": 127 }, { "epoch": 1.92964824120603, "grad_norm": 0.43426066637039185, "learning_rate": 7.321626257837996e-05, "loss": 0.7341, "step": 128 }, { "epoch": 1.9447236180904524, "grad_norm": 0.4637242257595062, "learning_rate": 7.189098911682592e-05, "loss": 0.7844, "step": 129 }, { "epoch": 1.9597989949748744, "grad_norm": 0.3749074637889862, "learning_rate": 7.05757090828986e-05, "loss": 1.0112, "step": 130 }, { "epoch": 1.9748743718592965, "grad_norm": 0.3993157744407654, "learning_rate": 6.927076738599152e-05, "loss": 0.9289, "step": 131 }, { "epoch": 1.9899497487437185, "grad_norm": 0.44096246361732483, "learning_rate": 6.797650622444836e-05, "loss": 0.8243, "step": 132 }, { "epoch": 2.0050251256281406, "grad_norm": 0.4166644215583801, "learning_rate": 6.669326499582755e-05, "loss": 0.9168, "step": 133 }, { "epoch": 2.020100502512563, "grad_norm": 0.33699873089790344, "learning_rate": 6.542138020790116e-05, "loss": 1.0537, "step": 134 }, { "epoch": 2.0351758793969847, "grad_norm": 0.34573739767074585, "learning_rate": 6.416118539041135e-05, "loss": 1.0553, "step": 135 }, { "epoch": 2.050251256281407, "grad_norm": 0.3347012400627136, "learning_rate": 6.291301100760829e-05, "loss": 0.9871, "step": 136 }, { "epoch": 2.0653266331658293, "grad_norm": 0.3667552173137665, "learning_rate": 6.167718437159147e-05, "loss": 0.9605, "step": 137 }, { "epoch": 2.080402010050251, "grad_norm": 0.3843131363391876, "learning_rate": 6.045402955647769e-05, "loss": 0.9183, "step": 138 }, { "epoch": 2.0954773869346734, "grad_norm": 0.417459636926651, "learning_rate": 5.924386731341842e-05, "loss": 0.8738, "step": 139 }, { "epoch": 2.1105527638190953, "grad_norm": 0.41590002179145813, "learning_rate": 5.804701498648828e-05, "loss": 0.872, "step": 140 }, { "epoch": 2.1256281407035176, "grad_norm": 0.38869982957839966, "learning_rate": 5.686378642946699e-05, "loss": 0.793, "step": 141 }, { "epoch": 2.14070351758794, "grad_norm": 0.4300014078617096, "learning_rate": 5.569449192353678e-05, "loss": 0.7924, "step": 142 }, { "epoch": 2.1557788944723617, "grad_norm": 0.4311830997467041, "learning_rate": 5.453943809591654e-05, "loss": 0.7704, "step": 143 }, { "epoch": 2.170854271356784, "grad_norm": 0.44092732667922974, "learning_rate": 5.3398927839453996e-05, "loss": 0.7094, "step": 144 }, { "epoch": 2.185929648241206, "grad_norm": 0.47999846935272217, "learning_rate": 5.227326023319743e-05, "loss": 0.6027, "step": 145 }, { "epoch": 2.201005025125628, "grad_norm": 0.39457985758781433, "learning_rate": 5.1162730463967304e-05, "loss": 1.0607, "step": 146 }, { "epoch": 2.2160804020100504, "grad_norm": 0.4095008075237274, "learning_rate": 5.006762974894872e-05, "loss": 1.0481, "step": 147 }, { "epoch": 2.2311557788944723, "grad_norm": 0.4314271807670593, "learning_rate": 4.898824525932471e-05, "loss": 1.052, "step": 148 }, { "epoch": 2.2462311557788945, "grad_norm": 0.393932580947876, "learning_rate": 4.7924860044970615e-05, "loss": 0.9618, "step": 149 }, { "epoch": 2.2613065326633164, "grad_norm": 0.4005722403526306, "learning_rate": 4.687775296022908e-05, "loss": 0.9511, "step": 150 }, { "epoch": 2.2763819095477387, "grad_norm": 0.4252696633338928, "learning_rate": 4.5847198590785394e-05, "loss": 0.9415, "step": 151 }, { "epoch": 2.291457286432161, "grad_norm": 0.40069398283958435, "learning_rate": 4.4833467181662086e-05, "loss": 0.8626, "step": 152 }, { "epoch": 2.306532663316583, "grad_norm": 0.43201375007629395, "learning_rate": 4.383682456635199e-05, "loss": 0.8382, "step": 153 }, { "epoch": 2.321608040201005, "grad_norm": 0.40895000100135803, "learning_rate": 4.285753209710786e-05, "loss": 0.789, "step": 154 }, { "epoch": 2.3366834170854274, "grad_norm": 0.4883180558681488, "learning_rate": 4.1895846576407424e-05, "loss": 0.8078, "step": 155 }, { "epoch": 2.351758793969849, "grad_norm": 0.4921092689037323, "learning_rate": 4.095202018961125e-05, "loss": 0.7998, "step": 156 }, { "epoch": 2.3668341708542715, "grad_norm": 0.4970828592777252, "learning_rate": 4.002630043883159e-05, "loss": 0.6696, "step": 157 }, { "epoch": 2.3819095477386933, "grad_norm": 0.45936843752861023, "learning_rate": 3.911893007802913e-05, "loss": 0.8207, "step": 158 }, { "epoch": 2.3969849246231156, "grad_norm": 0.3807404637336731, "learning_rate": 3.8230147049355147e-05, "loss": 1.069, "step": 159 }, { "epoch": 2.4120603015075375, "grad_norm": 0.39869654178619385, "learning_rate": 3.7360184420755165e-05, "loss": 0.9586, "step": 160 }, { "epoch": 2.4271356783919598, "grad_norm": 0.39717385172843933, "learning_rate": 3.650927032485101e-05, "loss": 0.997, "step": 161 }, { "epoch": 2.442211055276382, "grad_norm": 0.41050049662590027, "learning_rate": 3.567762789911693e-05, "loss": 0.9516, "step": 162 }, { "epoch": 2.457286432160804, "grad_norm": 0.4008517265319824, "learning_rate": 3.486547522736562e-05, "loss": 0.888, "step": 163 }, { "epoch": 2.472361809045226, "grad_norm": 0.4284372627735138, "learning_rate": 3.407302528255961e-05, "loss": 0.9073, "step": 164 }, { "epoch": 2.4874371859296485, "grad_norm": 0.44693753123283386, "learning_rate": 3.3300485870962776e-05, "loss": 0.8223, "step": 165 }, { "epoch": 2.5025125628140703, "grad_norm": 0.4429689347743988, "learning_rate": 3.254805957764673e-05, "loss": 0.786, "step": 166 }, { "epoch": 2.5175879396984926, "grad_norm": 0.47324490547180176, "learning_rate": 3.1815943713366404e-05, "loss": 0.8126, "step": 167 }, { "epoch": 2.5326633165829144, "grad_norm": 0.4703764021396637, "learning_rate": 3.110433026281872e-05, "loss": 0.8371, "step": 168 }, { "epoch": 2.5477386934673367, "grad_norm": 0.49376046657562256, "learning_rate": 3.041340583429789e-05, "loss": 0.7322, "step": 169 }, { "epoch": 2.5628140703517586, "grad_norm": 0.5173198580741882, "learning_rate": 2.9743351610760716e-05, "loss": 0.6607, "step": 170 }, { "epoch": 2.577889447236181, "grad_norm": 0.4035499691963196, "learning_rate": 2.9094343302314432e-05, "loss": 1.0153, "step": 171 }, { "epoch": 2.592964824120603, "grad_norm": 0.3791813850402832, "learning_rate": 2.846655110013978e-05, "loss": 1.0356, "step": 172 }, { "epoch": 2.608040201005025, "grad_norm": 0.41843312978744507, "learning_rate": 2.78601396318614e-05, "loss": 0.9886, "step": 173 }, { "epoch": 2.6231155778894473, "grad_norm": 0.4282153248786926, "learning_rate": 2.7275267918376912e-05, "loss": 0.9719, "step": 174 }, { "epoch": 2.6381909547738696, "grad_norm": 0.42072588205337524, "learning_rate": 2.6712089332156633e-05, "loss": 0.9334, "step": 175 }, { "epoch": 2.6532663316582914, "grad_norm": 0.44376295804977417, "learning_rate": 2.6170751557024197e-05, "loss": 0.896, "step": 176 }, { "epoch": 2.6683417085427137, "grad_norm": 0.4388138949871063, "learning_rate": 2.5651396549429086e-05, "loss": 0.8283, "step": 177 }, { "epoch": 2.6834170854271355, "grad_norm": 0.44835880398750305, "learning_rate": 2.515416050122092e-05, "loss": 0.861, "step": 178 }, { "epoch": 2.698492462311558, "grad_norm": 0.4445788860321045, "learning_rate": 2.4679173803935662e-05, "loss": 0.7559, "step": 179 }, { "epoch": 2.7135678391959797, "grad_norm": 0.45511695742607117, "learning_rate": 2.4226561014602522e-05, "loss": 0.7843, "step": 180 }, { "epoch": 2.728643216080402, "grad_norm": 0.46726593375205994, "learning_rate": 2.3796440823081167e-05, "loss": 0.7236, "step": 181 }, { "epoch": 2.7437185929648242, "grad_norm": 0.49476441740989685, "learning_rate": 2.3388926020937286e-05, "loss": 0.6451, "step": 182 }, { "epoch": 2.758793969849246, "grad_norm": 0.49135738611221313, "learning_rate": 2.3004123471865e-05, "loss": 0.7884, "step": 183 }, { "epoch": 2.7738693467336684, "grad_norm": 0.3582770526409149, "learning_rate": 2.2642134083663678e-05, "loss": 1.0252, "step": 184 }, { "epoch": 2.7889447236180906, "grad_norm": 0.40019041299819946, "learning_rate": 2.2303052781776664e-05, "loss": 1.0195, "step": 185 }, { "epoch": 2.8040201005025125, "grad_norm": 0.3975183069705963, "learning_rate": 2.198696848439865e-05, "loss": 0.9664, "step": 186 }, { "epoch": 2.819095477386935, "grad_norm": 0.43884652853012085, "learning_rate": 2.169396407915849e-05, "loss": 0.9815, "step": 187 }, { "epoch": 2.8341708542713566, "grad_norm": 0.44053277373313904, "learning_rate": 2.142411640138332e-05, "loss": 0.9296, "step": 188 }, { "epoch": 2.849246231155779, "grad_norm": 0.4180326759815216, "learning_rate": 2.1177496213949837e-05, "loss": 0.8335, "step": 189 }, { "epoch": 2.8643216080402008, "grad_norm": 0.4276379346847534, "learning_rate": 2.0954168188727962e-05, "loss": 0.8277, "step": 190 }, { "epoch": 2.879396984924623, "grad_norm": 0.45003947615623474, "learning_rate": 2.0754190889621745e-05, "loss": 0.8232, "step": 191 }, { "epoch": 2.8944723618090453, "grad_norm": 0.4623722434043884, "learning_rate": 2.0577616757212016e-05, "loss": 0.7848, "step": 192 }, { "epoch": 2.909547738693467, "grad_norm": 0.49609658122062683, "learning_rate": 2.0424492095004746e-05, "loss": 0.7932, "step": 193 }, { "epoch": 2.9246231155778895, "grad_norm": 0.4570601284503937, "learning_rate": 2.029485705728876e-05, "loss": 0.7536, "step": 194 }, { "epoch": 2.9396984924623117, "grad_norm": 0.5050197839736938, "learning_rate": 2.0188745638605954e-05, "loss": 0.6292, "step": 195 }, { "epoch": 2.9547738693467336, "grad_norm": 0.40025895833969116, "learning_rate": 2.010618566483684e-05, "loss": 0.9652, "step": 196 }, { "epoch": 2.969849246231156, "grad_norm": 0.42418885231018066, "learning_rate": 2.0047198785903658e-05, "loss": 0.9192, "step": 197 }, { "epoch": 2.984924623115578, "grad_norm": 0.4549340307712555, "learning_rate": 2.0011800470093105e-05, "loss": 0.7822, "step": 198 }, { "epoch": 3.0, "grad_norm": 0.4662438631057739, "learning_rate": 2e-05, "loss": 0.7239, "step": 199 } ], "logging_steps": 1, "max_steps": 199, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.1995732340637696e+17, "train_batch_size": 18, "trial_name": null, "trial_params": null }