{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9990224828934506, "eval_steps": 500, "global_step": 511, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0019550342130987292, "grad_norm": 28.436507384573893, "learning_rate": 5.769230769230769e-06, "loss": 1.9755, "step": 1 }, { "epoch": 0.009775171065493646, "grad_norm": 16.326267191478145, "learning_rate": 2.8846153846153845e-05, "loss": 1.7228, "step": 5 }, { "epoch": 0.019550342130987292, "grad_norm": 7.77688805789614, "learning_rate": 5.769230769230769e-05, "loss": 1.1824, "step": 10 }, { "epoch": 0.02932551319648094, "grad_norm": 23.182562040558295, "learning_rate": 8.653846153846152e-05, "loss": 1.2132, "step": 15 }, { "epoch": 0.039100684261974585, "grad_norm": 3.8046090746822716, "learning_rate": 0.00011538461538461538, "loss": 1.0978, "step": 20 }, { "epoch": 0.04887585532746823, "grad_norm": 6.364935901579011, "learning_rate": 0.00014423076923076922, "loss": 1.2143, "step": 25 }, { "epoch": 0.05865102639296188, "grad_norm": 4.553589249592707, "learning_rate": 0.00017307692307692304, "loss": 1.1357, "step": 30 }, { "epoch": 0.06842619745845552, "grad_norm": 5.819131618520497, "learning_rate": 0.00020192307692307691, "loss": 1.2003, "step": 35 }, { "epoch": 0.07820136852394917, "grad_norm": 8.631833955662083, "learning_rate": 0.00023076923076923076, "loss": 1.2049, "step": 40 }, { "epoch": 0.08797653958944282, "grad_norm": 27.119527389294134, "learning_rate": 0.0002596153846153846, "loss": 2.0482, "step": 45 }, { "epoch": 0.09775171065493646, "grad_norm": 32.33385091368177, "learning_rate": 0.00028846153846153843, "loss": 1.6455, "step": 50 }, { "epoch": 0.10752688172043011, "grad_norm": 45.94902359405063, "learning_rate": 0.0002999683799255387, "loss": 1.9763, "step": 55 }, { "epoch": 0.11730205278592376, "grad_norm": 128.59363629595714, "learning_rate": 0.0002997751944121241, "loss": 1.5422, "step": 60 }, { "epoch": 0.1270772238514174, "grad_norm": 10.71651832916556, "learning_rate": 0.0002994066160471166, "loss": 1.7548, "step": 65 }, { "epoch": 0.13685239491691104, "grad_norm": 4.189410908699583, "learning_rate": 0.0002988630764507904, "loss": 1.3404, "step": 70 }, { "epoch": 0.1466275659824047, "grad_norm": 33.78267011030123, "learning_rate": 0.00029814521213014585, "loss": 1.4341, "step": 75 }, { "epoch": 0.15640273704789834, "grad_norm": 4.447241036402298, "learning_rate": 0.00029725386373353455, "loss": 1.4355, "step": 80 }, { "epoch": 0.16617790811339198, "grad_norm": 19.324376604180756, "learning_rate": 0.00029619007506622504, "loss": 1.4037, "step": 85 }, { "epoch": 0.17595307917888564, "grad_norm": 56.398325865658165, "learning_rate": 0.00029495509186806487, "loss": 2.1883, "step": 90 }, { "epoch": 0.18572825024437928, "grad_norm": 117.39758059322921, "learning_rate": 0.0002935503603546683, "loss": 2.0507, "step": 95 }, { "epoch": 0.19550342130987292, "grad_norm": 7.6884925370685995, "learning_rate": 0.00029197752552383914, "loss": 1.5932, "step": 100 }, { "epoch": 0.20527859237536658, "grad_norm": 963.0390862127407, "learning_rate": 0.000290238429229211, "loss": 5.223, "step": 105 }, { "epoch": 0.21505376344086022, "grad_norm": 258.2156128276988, "learning_rate": 0.00028833510802336203, "loss": 4.5213, "step": 110 }, { "epoch": 0.22482893450635386, "grad_norm": 264.762100287562, "learning_rate": 0.0002862697907729285, "loss": 2.0849, "step": 115 }, { "epoch": 0.23460410557184752, "grad_norm": 18.694604887076736, "learning_rate": 
0.0002840448960485118, "loss": 1.8192, "step": 120 }, { "epoch": 0.24437927663734116, "grad_norm": 67.0812503804155, "learning_rate": 0.00028166302929243326, "loss": 1.3915, "step": 125 }, { "epoch": 0.2541544477028348, "grad_norm": 21.92554603766507, "learning_rate": 0.0002791269797676551, "loss": 1.5317, "step": 130 }, { "epoch": 0.26392961876832843, "grad_norm": 7.078011892011075, "learning_rate": 0.00027643971729144056, "loss": 1.4673, "step": 135 }, { "epoch": 0.27370478983382207, "grad_norm": 9.631691414307214, "learning_rate": 0.0002736043887575761, "loss": 1.3131, "step": 140 }, { "epoch": 0.28347996089931576, "grad_norm": 23.096760351073254, "learning_rate": 0.00027062431445123124, "loss": 1.572, "step": 145 }, { "epoch": 0.2932551319648094, "grad_norm": 3.17350344043114, "learning_rate": 0.0002675029841607691, "loss": 1.3668, "step": 150 }, { "epoch": 0.30303030303030304, "grad_norm": 6.678371795997961, "learning_rate": 0.00026424405309106216, "loss": 1.3082, "step": 155 }, { "epoch": 0.3128054740957967, "grad_norm": 4.718771263163467, "learning_rate": 0.00026085133758309883, "loss": 1.3286, "step": 160 }, { "epoch": 0.3225806451612903, "grad_norm": 8.790382644648172, "learning_rate": 0.00025732881064489233, "loss": 1.3241, "step": 165 }, { "epoch": 0.33235581622678395, "grad_norm": 4.416208770248871, "learning_rate": 0.0002536805972989267, "loss": 1.3357, "step": 170 }, { "epoch": 0.3421309872922776, "grad_norm": 3.1002678359027285, "learning_rate": 0.0002499109697515875, "loss": 1.4037, "step": 175 }, { "epoch": 0.3519061583577713, "grad_norm": 7.124469693986668, "learning_rate": 0.0002460243423902342, "loss": 1.625, "step": 180 }, { "epoch": 0.3616813294232649, "grad_norm": 51.006744297270856, "learning_rate": 0.00024202526661377277, "loss": 1.6499, "step": 185 }, { "epoch": 0.37145650048875856, "grad_norm": 14.662023479458805, "learning_rate": 0.00023791842550278217, "loss": 1.8342, "step": 190 }, { "epoch": 0.3812316715542522, "grad_norm": 4.973242239066626, "learning_rate": 0.00023370862833543648, "loss": 1.6823, "step": 195 }, { "epoch": 0.39100684261974583, "grad_norm": 95.12537291599145, "learning_rate": 0.0002294008049556441, "loss": 1.5268, "step": 200 }, { "epoch": 0.40078201368523947, "grad_norm": 3.394429345078054, "learning_rate": 0.000225, "loss": 1.45, "step": 205 }, { "epoch": 0.41055718475073316, "grad_norm": 3.5902349674560456, "learning_rate": 0.00022051136699031057, "loss": 1.2502, "step": 210 }, { "epoch": 0.4203323558162268, "grad_norm": 260.2217345574012, "learning_rate": 0.00021594016229861007, "loss": 1.4486, "step": 215 }, { "epoch": 0.43010752688172044, "grad_norm": 4.8647475236367725, "learning_rate": 0.0002112917389917347, "loss": 1.486, "step": 220 }, { "epoch": 0.4398826979472141, "grad_norm": 1.9024756731821182, "learning_rate": 0.0002065715405626634, "loss": 1.2628, "step": 225 }, { "epoch": 0.4496578690127077, "grad_norm": 10.37907922884646, "learning_rate": 0.00020178509455596596, "loss": 1.2518, "step": 230 }, { "epoch": 0.45943304007820135, "grad_norm": 1.9405506982628546, "learning_rate": 0.00019693800609482315, "loss": 1.2849, "step": 235 }, { "epoch": 0.46920821114369504, "grad_norm": 2.6160283264932462, "learning_rate": 0.00019203595131719932, "loss": 1.2548, "step": 240 }, { "epoch": 0.4789833822091887, "grad_norm": 2.1695347772705373, "learning_rate": 0.00018708467072885382, "loss": 1.3377, "step": 245 }, { "epoch": 0.4887585532746823, "grad_norm": 2.2144620011763374, "learning_rate": 0.00018208996248097458, "loss": 1.3093, "step": 250 
}, { "epoch": 0.49853372434017595, "grad_norm": 2.1880901805448403, "learning_rate": 0.00017705767558030754, "loss": 1.245, "step": 255 }, { "epoch": 0.5083088954056696, "grad_norm": 2.991429961153096, "learning_rate": 0.0001719937030397311, "loss": 1.2559, "step": 260 }, { "epoch": 0.5180840664711632, "grad_norm": 23.064953881729117, "learning_rate": 0.00016690397497729818, "loss": 1.288, "step": 265 }, { "epoch": 0.5278592375366569, "grad_norm": 1.3455036549144204, "learning_rate": 0.00016179445167182677, "loss": 1.2717, "step": 270 }, { "epoch": 0.5376344086021505, "grad_norm": 1.1846238387921606, "learning_rate": 0.00015667111658317054, "loss": 1.2394, "step": 275 }, { "epoch": 0.5474095796676441, "grad_norm": 137.50996798714343, "learning_rate": 0.00015153996934534348, "loss": 1.3296, "step": 280 }, { "epoch": 0.5571847507331378, "grad_norm": 1.1488279921928382, "learning_rate": 0.00014640701874070455, "loss": 1.2874, "step": 285 }, { "epoch": 0.5669599217986315, "grad_norm": 1.2689991198054311, "learning_rate": 0.00014127827566342863, "loss": 1.2561, "step": 290 }, { "epoch": 0.5767350928641252, "grad_norm": 2.140511295392104, "learning_rate": 0.0001361597460805047, "loss": 1.2205, "step": 295 }, { "epoch": 0.5865102639296188, "grad_norm": 12.286388401977892, "learning_rate": 0.000131057423998504, "loss": 1.252, "step": 300 }, { "epoch": 0.5962854349951124, "grad_norm": 2.5541457654289395, "learning_rate": 0.00012597728444435418, "loss": 1.215, "step": 305 }, { "epoch": 0.6060606060606061, "grad_norm": 1.1732833800621696, "learning_rate": 0.00012092527646833949, "loss": 1.2053, "step": 310 }, { "epoch": 0.6158357771260997, "grad_norm": 1.4481798374657, "learning_rate": 0.00011590731617752066, "loss": 1.2061, "step": 315 }, { "epoch": 0.6256109481915934, "grad_norm": 0.9912604590459435, "learning_rate": 0.00011092927980773267, "loss": 1.1604, "step": 320 }, { "epoch": 0.635386119257087, "grad_norm": 1.0322599469502478, "learning_rate": 0.00010599699684227311, "loss": 1.1369, "step": 325 }, { "epoch": 0.6451612903225806, "grad_norm": 1.5437893851108073, "learning_rate": 0.00010111624318534006, "loss": 1.1721, "step": 330 }, { "epoch": 0.6549364613880743, "grad_norm": 1.383693940282455, "learning_rate": 9.629273439821313e-05, "loss": 1.1094, "step": 335 }, { "epoch": 0.6647116324535679, "grad_norm": 2.598856235735631, "learning_rate": 9.15321190060981e-05, "loss": 1.1251, "step": 340 }, { "epoch": 0.6744868035190615, "grad_norm": 1.0440965009342513, "learning_rate": 8.683997188347435e-05, "loss": 1.0953, "step": 345 }, { "epoch": 0.6842619745845552, "grad_norm": 1.2172823916521676, "learning_rate": 8.222178772568959e-05, "loss": 1.0839, "step": 350 }, { "epoch": 0.6940371456500489, "grad_norm": 0.9291103607651107, "learning_rate": 7.768297461444765e-05, "loss": 1.0786, "step": 355 }, { "epoch": 0.7038123167155426, "grad_norm": 36.656353739168324, "learning_rate": 7.32288476847252e-05, "loss": 1.1001, "step": 360 }, { "epoch": 0.7135874877810362, "grad_norm": 0.7923067460462172, "learning_rate": 6.886462290053158e-05, "loss": 1.0793, "step": 365 }, { "epoch": 0.7233626588465298, "grad_norm": 0.8419278909431203, "learning_rate": 6.4595410946803e-05, "loss": 1.0869, "step": 370 }, { "epoch": 0.7331378299120235, "grad_norm": 1.0339571657214093, "learning_rate": 6.04262112445821e-05, "loss": 1.0128, "step": 375 }, { "epoch": 0.7429130009775171, "grad_norm": 0.7333799573780848, "learning_rate": 5.636190609649249e-05, "loss": 1.0101, "step": 380 }, { "epoch": 0.7526881720430108, "grad_norm": 
0.8595033406539786, "learning_rate": 5.240725496936372e-05, "loss": 1.0224, "step": 385 }, { "epoch": 0.7624633431085044, "grad_norm": 0.7112388580251547, "learning_rate": 4.8566888920701196e-05, "loss": 1.0016, "step": 390 }, { "epoch": 0.772238514173998, "grad_norm": 0.8167634152987004, "learning_rate": 4.48453051755301e-05, "loss": 0.9793, "step": 395 }, { "epoch": 0.7820136852394917, "grad_norm": 0.6682157986400304, "learning_rate": 4.12468618599611e-05, "loss": 1.0015, "step": 400 }, { "epoch": 0.7917888563049853, "grad_norm": 0.7797058649722416, "learning_rate": 3.777577289764752e-05, "loss": 0.9784, "step": 405 }, { "epoch": 0.8015640273704789, "grad_norm": 0.6768885831261553, "learning_rate": 3.443610307510907e-05, "loss": 0.9605, "step": 410 }, { "epoch": 0.8113391984359726, "grad_norm": 0.6416162504789994, "learning_rate": 3.1231763281701305e-05, "loss": 0.971, "step": 415 }, { "epoch": 0.8211143695014663, "grad_norm": 0.6784950124595185, "learning_rate": 2.816650592980495e-05, "loss": 0.9553, "step": 420 }, { "epoch": 0.83088954056696, "grad_norm": 0.689853020596839, "learning_rate": 2.5243920560598184e-05, "loss": 0.9351, "step": 425 }, { "epoch": 0.8406647116324536, "grad_norm": 0.6460996639595076, "learning_rate": 2.24674296405579e-05, "loss": 0.9313, "step": 430 }, { "epoch": 0.8504398826979472, "grad_norm": 0.6636695831614443, "learning_rate": 1.98402845536117e-05, "loss": 0.9266, "step": 435 }, { "epoch": 0.8602150537634409, "grad_norm": 0.6119153060968766, "learning_rate": 1.736556179363543e-05, "loss": 0.9134, "step": 440 }, { "epoch": 0.8699902248289345, "grad_norm": 0.5955119176542452, "learning_rate": 1.5046159361753224e-05, "loss": 0.9198, "step": 445 }, { "epoch": 0.8797653958944281, "grad_norm": 0.6324852203366718, "learning_rate": 1.2884793372660207e-05, "loss": 0.9051, "step": 450 }, { "epoch": 0.8895405669599218, "grad_norm": 0.6803072103352026, "learning_rate": 1.0883994873941815e-05, "loss": 0.8923, "step": 455 }, { "epoch": 0.8993157380254154, "grad_norm": 2.125025596226289, "learning_rate": 9.046106882113751e-06, "loss": 0.9315, "step": 460 }, { "epoch": 0.9090909090909091, "grad_norm": 0.5812394213670566, "learning_rate": 7.373281638854328e-06, "loss": 0.9131, "step": 465 }, { "epoch": 0.9188660801564027, "grad_norm": 0.6819326019234143, "learning_rate": 5.867478090641892e-06, "loss": 0.9521, "step": 470 }, { "epoch": 0.9286412512218963, "grad_norm": 0.6009777818830081, "learning_rate": 4.530459594748592e-06, "loss": 0.8585, "step": 475 }, { "epoch": 0.9384164222873901, "grad_norm": 0.5996940109383458, "learning_rate": 3.363791854277348e-06, "loss": 0.8938, "step": 480 }, { "epoch": 0.9481915933528837, "grad_norm": 0.6175664551394122, "learning_rate": 2.3688410846596282e-06, "loss": 0.8891, "step": 485 }, { "epoch": 0.9579667644183774, "grad_norm": 2.0911762607681768, "learning_rate": 1.5467724137617043e-06, "loss": 0.924, "step": 490 }, { "epoch": 0.967741935483871, "grad_norm": 0.6696352577624144, "learning_rate": 8.985485174722973e-07, "loss": 0.9077, "step": 495 }, { "epoch": 0.9775171065493646, "grad_norm": 0.5766498486118373, "learning_rate": 4.249284923700358e-07, "loss": 0.9012, "step": 500 }, { "epoch": 0.9872922776148583, "grad_norm": 0.6254945643083392, "learning_rate": 1.2646696679042833e-07, "loss": 0.9035, "step": 505 }, { "epoch": 0.9970674486803519, "grad_norm": 0.6202670688713676, "learning_rate": 3.5134513334200697e-09, "loss": 0.9303, "step": 510 }, { "epoch": 0.9990224828934506, "eval_loss": 3.596351385116577, "eval_runtime": 2.2495, 
"eval_samples_per_second": 2.667, "eval_steps_per_second": 0.445, "step": 511 }, { "epoch": 0.9990224828934506, "step": 511, "total_flos": 26722078556160.0, "train_loss": 1.3314139091805235, "train_runtime": 8881.3456, "train_samples_per_second": 1.842, "train_steps_per_second": 0.058 } ], "logging_steps": 5, "max_steps": 511, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 26722078556160.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }