{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.944622991347343, "eval_steps": 5000, "global_step": 20001, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00024721878862793575, "grad_norm": 1448.0, "learning_rate": 2.0000000000000002e-07, "loss": 13.9716, "step": 1 }, { "epoch": 0.006180469715698393, "grad_norm": 334.0, "learning_rate": 5e-06, "loss": 9.5244, "step": 25 }, { "epoch": 0.012360939431396786, "grad_norm": 78.5, "learning_rate": 1e-05, "loss": 7.5535, "step": 50 }, { "epoch": 0.018541409147095178, "grad_norm": 34.5, "learning_rate": 1.5e-05, "loss": 7.0022, "step": 75 }, { "epoch": 0.024721878862793572, "grad_norm": 18.75, "learning_rate": 2e-05, "loss": 6.4562, "step": 100 }, { "epoch": 0.030902348578491966, "grad_norm": 63.75, "learning_rate": 2.5e-05, "loss": 5.6999, "step": 125 }, { "epoch": 0.037082818294190356, "grad_norm": 29.125, "learning_rate": 3e-05, "loss": 5.2536, "step": 150 }, { "epoch": 0.04326328800988875, "grad_norm": 29.5, "learning_rate": 3.5e-05, "loss": 4.6767, "step": 175 }, { "epoch": 0.049443757725587144, "grad_norm": 20.25, "learning_rate": 4e-05, "loss": 4.2913, "step": 200 }, { "epoch": 0.05562422744128554, "grad_norm": 16.625, "learning_rate": 4.5e-05, "loss": 3.7505, "step": 225 }, { "epoch": 0.06180469715698393, "grad_norm": 17.375, "learning_rate": 5e-05, "loss": 3.3317, "step": 250 }, { "epoch": 0.06798516687268233, "grad_norm": 112.0, "learning_rate": 5.500000000000001e-05, "loss": 4.1318, "step": 275 }, { "epoch": 0.07416563658838071, "grad_norm": 31.75, "learning_rate": 6e-05, "loss": 3.558, "step": 300 }, { "epoch": 0.08034610630407911, "grad_norm": 13.3125, "learning_rate": 6.500000000000001e-05, "loss": 2.9825, "step": 325 }, { "epoch": 0.0865265760197775, "grad_norm": 12.875, "learning_rate": 7e-05, "loss": 2.6931, "step": 350 }, { "epoch": 0.09270704573547589, "grad_norm": 12.8125, "learning_rate": 7.500000000000001e-05, "loss": 2.4328, "step": 375 }, { "epoch": 0.09888751545117429, "grad_norm": 11.0625, "learning_rate": 8e-05, "loss": 2.285, "step": 400 }, { "epoch": 0.10506798516687268, "grad_norm": 6.28125, "learning_rate": 8.5e-05, "loss": 2.1667, "step": 425 }, { "epoch": 0.11124845488257108, "grad_norm": 4.5625, "learning_rate": 9e-05, "loss": 2.0534, "step": 450 }, { "epoch": 0.11742892459826947, "grad_norm": 3.90625, "learning_rate": 9.5e-05, "loss": 1.9744, "step": 475 }, { "epoch": 0.12360939431396786, "grad_norm": 4.78125, "learning_rate": 0.0001, "loss": 1.9209, "step": 500 }, { "epoch": 0.12978986402966625, "grad_norm": 3.09375, "learning_rate": 9.987179487179488e-05, "loss": 1.8793, "step": 525 }, { "epoch": 0.13597033374536466, "grad_norm": 2.953125, "learning_rate": 9.974358974358975e-05, "loss": 1.8301, "step": 550 }, { "epoch": 0.14215080346106304, "grad_norm": 2.25, "learning_rate": 9.961538461538463e-05, "loss": 1.7935, "step": 575 }, { "epoch": 0.14833127317676142, "grad_norm": 2.3125, "learning_rate": 9.948717948717949e-05, "loss": 1.7651, "step": 600 }, { "epoch": 0.15451174289245984, "grad_norm": 2.21875, "learning_rate": 9.935897435897437e-05, "loss": 1.737, "step": 625 }, { "epoch": 0.16069221260815822, "grad_norm": 2.28125, "learning_rate": 9.923076923076923e-05, "loss": 1.7098, "step": 650 }, { "epoch": 0.1668726823238566, "grad_norm": 1.71875, "learning_rate": 9.910256410256411e-05, "loss": 1.69, "step": 675 }, { "epoch": 0.173053152039555, "grad_norm": 1.671875, "learning_rate": 9.897435897435898e-05, "loss": 1.6622, "step": 700 }, { "epoch": 0.1792336217552534, "grad_norm": 1.8125, "learning_rate": 9.884615384615386e-05, "loss": 1.6502, "step": 725 }, { "epoch": 0.18541409147095178, "grad_norm": 1.4765625, "learning_rate": 9.871794871794872e-05, "loss": 1.629, "step": 750 }, { "epoch": 0.1915945611866502, "grad_norm": 1.3984375, "learning_rate": 9.85897435897436e-05, "loss": 1.6102, "step": 775 }, { "epoch": 0.19777503090234858, "grad_norm": 1.4296875, "learning_rate": 9.846153846153848e-05, "loss": 1.5991, "step": 800 }, { "epoch": 0.20395550061804696, "grad_norm": 1.5234375, "learning_rate": 9.833333333333333e-05, "loss": 1.5884, "step": 825 }, { "epoch": 0.21013597033374537, "grad_norm": 1.5, "learning_rate": 9.820512820512821e-05, "loss": 1.5704, "step": 850 }, { "epoch": 0.21631644004944375, "grad_norm": 1.453125, "learning_rate": 9.807692307692307e-05, "loss": 1.5573, "step": 875 }, { "epoch": 0.22249690976514216, "grad_norm": 1.4453125, "learning_rate": 9.794871794871795e-05, "loss": 1.5416, "step": 900 }, { "epoch": 0.22867737948084055, "grad_norm": 1.25, "learning_rate": 9.782051282051282e-05, "loss": 1.5409, "step": 925 }, { "epoch": 0.23485784919653893, "grad_norm": 1.2421875, "learning_rate": 9.76923076923077e-05, "loss": 1.5344, "step": 950 }, { "epoch": 0.24103831891223734, "grad_norm": 1.5, "learning_rate": 9.756410256410257e-05, "loss": 1.5195, "step": 975 }, { "epoch": 0.24721878862793573, "grad_norm": 1.2421875, "learning_rate": 9.743589743589744e-05, "loss": 1.5051, "step": 1000 }, { "epoch": 0.25339925834363414, "grad_norm": 1.1953125, "learning_rate": 9.730769230769232e-05, "loss": 1.505, "step": 1025 }, { "epoch": 0.2595797280593325, "grad_norm": 1.109375, "learning_rate": 9.717948717948718e-05, "loss": 1.4932, "step": 1050 }, { "epoch": 0.2657601977750309, "grad_norm": 1.046875, "learning_rate": 9.705128205128206e-05, "loss": 1.4901, "step": 1075 }, { "epoch": 0.2719406674907293, "grad_norm": 1.0546875, "learning_rate": 9.692307692307692e-05, "loss": 1.4806, "step": 1100 }, { "epoch": 0.27812113720642767, "grad_norm": 1.265625, "learning_rate": 9.67948717948718e-05, "loss": 1.4745, "step": 1125 }, { "epoch": 0.2843016069221261, "grad_norm": 1.3125, "learning_rate": 9.666666666666667e-05, "loss": 1.4604, "step": 1150 }, { "epoch": 0.2904820766378245, "grad_norm": 1.1640625, "learning_rate": 9.653846153846155e-05, "loss": 1.4584, "step": 1175 }, { "epoch": 0.29666254635352285, "grad_norm": 1.5234375, "learning_rate": 9.641025641025641e-05, "loss": 1.4489, "step": 1200 }, { "epoch": 0.30284301606922126, "grad_norm": 1.0546875, "learning_rate": 9.628205128205129e-05, "loss": 1.4439, "step": 1225 }, { "epoch": 0.30902348578491967, "grad_norm": 1.0234375, "learning_rate": 9.615384615384617e-05, "loss": 1.4367, "step": 1250 }, { "epoch": 0.315203955500618, "grad_norm": 1.15625, "learning_rate": 9.602564102564103e-05, "loss": 1.4392, "step": 1275 }, { "epoch": 0.32138442521631644, "grad_norm": 0.91015625, "learning_rate": 9.589743589743591e-05, "loss": 1.4281, "step": 1300 }, { "epoch": 0.32756489493201485, "grad_norm": 1.0234375, "learning_rate": 9.576923076923078e-05, "loss": 1.4251, "step": 1325 }, { "epoch": 0.3337453646477132, "grad_norm": 0.9921875, "learning_rate": 9.564102564102565e-05, "loss": 1.4222, "step": 1350 }, { "epoch": 0.3399258343634116, "grad_norm": 1.0859375, "learning_rate": 9.551282051282052e-05, "loss": 1.4259, "step": 1375 }, { "epoch": 0.34610630407911, "grad_norm": 0.91796875, "learning_rate": 9.53846153846154e-05, "loss": 1.4154, "step": 1400 }, { "epoch": 0.3522867737948084, "grad_norm": 1.0, "learning_rate": 9.525641025641026e-05, "loss": 1.4074, "step": 1425 }, { "epoch": 0.3584672435105068, "grad_norm": 0.92578125, "learning_rate": 9.512820512820513e-05, "loss": 1.4051, "step": 1450 }, { "epoch": 0.3646477132262052, "grad_norm": 1.0703125, "learning_rate": 9.5e-05, "loss": 1.4007, "step": 1475 }, { "epoch": 0.37082818294190356, "grad_norm": 0.90234375, "learning_rate": 9.487179487179487e-05, "loss": 1.389, "step": 1500 }, { "epoch": 0.377008652657602, "grad_norm": 0.9375, "learning_rate": 9.474358974358975e-05, "loss": 1.3924, "step": 1525 }, { "epoch": 0.3831891223733004, "grad_norm": 0.890625, "learning_rate": 9.461538461538461e-05, "loss": 1.3868, "step": 1550 }, { "epoch": 0.38936959208899874, "grad_norm": 0.9609375, "learning_rate": 9.448717948717949e-05, "loss": 1.3889, "step": 1575 }, { "epoch": 0.39555006180469715, "grad_norm": 0.9296875, "learning_rate": 9.435897435897436e-05, "loss": 1.3825, "step": 1600 }, { "epoch": 0.40173053152039556, "grad_norm": 1.0390625, "learning_rate": 9.423076923076924e-05, "loss": 1.3709, "step": 1625 }, { "epoch": 0.4079110012360939, "grad_norm": 0.9140625, "learning_rate": 9.41025641025641e-05, "loss": 1.3719, "step": 1650 }, { "epoch": 0.41409147095179233, "grad_norm": 0.953125, "learning_rate": 9.397435897435898e-05, "loss": 1.3718, "step": 1675 }, { "epoch": 0.42027194066749074, "grad_norm": 1.0078125, "learning_rate": 9.384615384615386e-05, "loss": 1.3673, "step": 1700 }, { "epoch": 0.4264524103831891, "grad_norm": 1.171875, "learning_rate": 9.371794871794872e-05, "loss": 1.3629, "step": 1725 }, { "epoch": 0.4326328800988875, "grad_norm": 0.8125, "learning_rate": 9.35897435897436e-05, "loss": 1.3616, "step": 1750 }, { "epoch": 0.4388133498145859, "grad_norm": 0.89453125, "learning_rate": 9.346153846153846e-05, "loss": 1.3599, "step": 1775 }, { "epoch": 0.44499381953028433, "grad_norm": 0.88671875, "learning_rate": 9.333333333333334e-05, "loss": 1.3516, "step": 1800 }, { "epoch": 0.4511742892459827, "grad_norm": 0.796875, "learning_rate": 9.320512820512821e-05, "loss": 1.3552, "step": 1825 }, { "epoch": 0.4573547589616811, "grad_norm": 0.89453125, "learning_rate": 9.307692307692309e-05, "loss": 1.3448, "step": 1850 }, { "epoch": 0.4635352286773795, "grad_norm": 0.94140625, "learning_rate": 9.294871794871795e-05, "loss": 1.3492, "step": 1875 }, { "epoch": 0.46971569839307786, "grad_norm": 0.85546875, "learning_rate": 9.282051282051283e-05, "loss": 1.34, "step": 1900 }, { "epoch": 0.4758961681087763, "grad_norm": 0.7734375, "learning_rate": 9.26923076923077e-05, "loss": 1.3406, "step": 1925 }, { "epoch": 0.4820766378244747, "grad_norm": 0.83203125, "learning_rate": 9.256410256410257e-05, "loss": 1.3369, "step": 1950 }, { "epoch": 0.48825710754017304, "grad_norm": 0.87109375, "learning_rate": 9.243589743589745e-05, "loss": 1.3297, "step": 1975 }, { "epoch": 0.49443757725587145, "grad_norm": 0.84375, "learning_rate": 9.230769230769232e-05, "loss": 1.3347, "step": 2000 }, { "epoch": 0.5006180469715699, "grad_norm": 0.81640625, "learning_rate": 9.217948717948718e-05, "loss": 1.3269, "step": 2025 }, { "epoch": 0.5067985166872683, "grad_norm": 0.96484375, "learning_rate": 9.205128205128205e-05, "loss": 1.3316, "step": 2050 }, { "epoch": 0.5129789864029666, "grad_norm": 1.078125, "learning_rate": 9.192307692307692e-05, "loss": 1.3212, "step": 2075 }, { "epoch": 0.519159456118665, "grad_norm": 0.8203125, "learning_rate": 9.179487179487179e-05, "loss": 1.3177, "step": 2100 }, { "epoch": 0.5253399258343634, "grad_norm": 0.8515625, "learning_rate": 9.166666666666667e-05, "loss": 1.3155, "step": 2125 }, { "epoch": 0.5315203955500618, "grad_norm": 0.90234375, "learning_rate": 9.153846153846155e-05, "loss": 1.3206, "step": 2150 }, { "epoch": 0.5377008652657602, "grad_norm": 0.796875, "learning_rate": 9.141025641025641e-05, "loss": 1.3131, "step": 2175 }, { "epoch": 0.5438813349814586, "grad_norm": 0.91796875, "learning_rate": 9.128205128205129e-05, "loss": 1.3143, "step": 2200 }, { "epoch": 0.5500618046971569, "grad_norm": 0.98046875, "learning_rate": 9.115384615384615e-05, "loss": 1.3099, "step": 2225 }, { "epoch": 0.5562422744128553, "grad_norm": 0.77734375, "learning_rate": 9.102564102564103e-05, "loss": 1.3104, "step": 2250 }, { "epoch": 0.5624227441285538, "grad_norm": 0.828125, "learning_rate": 9.08974358974359e-05, "loss": 1.3026, "step": 2275 }, { "epoch": 0.5686032138442522, "grad_norm": 0.75390625, "learning_rate": 9.076923076923078e-05, "loss": 1.2985, "step": 2300 }, { "epoch": 0.5747836835599506, "grad_norm": 0.80078125, "learning_rate": 9.064102564102564e-05, "loss": 1.3109, "step": 2325 }, { "epoch": 0.580964153275649, "grad_norm": 0.859375, "learning_rate": 9.051282051282052e-05, "loss": 1.3061, "step": 2350 }, { "epoch": 0.5871446229913473, "grad_norm": 0.77734375, "learning_rate": 9.038461538461538e-05, "loss": 1.3022, "step": 2375 }, { "epoch": 0.5933250927070457, "grad_norm": 0.91015625, "learning_rate": 9.025641025641026e-05, "loss": 1.3006, "step": 2400 }, { "epoch": 0.5995055624227441, "grad_norm": 0.84375, "learning_rate": 9.012820512820514e-05, "loss": 1.2938, "step": 2425 }, { "epoch": 0.6056860321384425, "grad_norm": 0.93359375, "learning_rate": 9e-05, "loss": 1.2968, "step": 2450 }, { "epoch": 0.6118665018541409, "grad_norm": 0.8046875, "learning_rate": 8.987179487179488e-05, "loss": 1.2872, "step": 2475 }, { "epoch": 0.6180469715698393, "grad_norm": 0.828125, "learning_rate": 8.974358974358975e-05, "loss": 1.2902, "step": 2500 }, { "epoch": 0.6242274412855378, "grad_norm": 0.88671875, "learning_rate": 8.961538461538463e-05, "loss": 1.2848, "step": 2525 }, { "epoch": 0.630407911001236, "grad_norm": 0.76171875, "learning_rate": 8.948717948717949e-05, "loss": 1.2902, "step": 2550 }, { "epoch": 0.6365883807169345, "grad_norm": 0.859375, "learning_rate": 8.935897435897437e-05, "loss": 1.2917, "step": 2575 }, { "epoch": 0.6427688504326329, "grad_norm": 0.76953125, "learning_rate": 8.923076923076924e-05, "loss": 1.2882, "step": 2600 }, { "epoch": 0.6489493201483313, "grad_norm": 0.84375, "learning_rate": 8.910256410256411e-05, "loss": 1.277, "step": 2625 }, { "epoch": 0.6551297898640297, "grad_norm": 0.71875, "learning_rate": 8.897435897435898e-05, "loss": 1.2832, "step": 2650 }, { "epoch": 0.6613102595797281, "grad_norm": 0.7578125, "learning_rate": 8.884615384615384e-05, "loss": 1.2822, "step": 2675 }, { "epoch": 0.6674907292954264, "grad_norm": 0.7578125, "learning_rate": 8.871794871794872e-05, "loss": 1.2809, "step": 2700 }, { "epoch": 0.6736711990111248, "grad_norm": 0.859375, "learning_rate": 8.858974358974359e-05, "loss": 1.2696, "step": 2725 }, { "epoch": 0.6798516687268232, "grad_norm": 0.79296875, "learning_rate": 8.846153846153847e-05, "loss": 1.2768, "step": 2750 }, { "epoch": 0.6860321384425216, "grad_norm": 0.8515625, "learning_rate": 8.833333333333333e-05, "loss": 1.2735, "step": 2775 }, { "epoch": 0.69221260815822, "grad_norm": 0.7734375, "learning_rate": 8.820512820512821e-05, "loss": 1.2743, "step": 2800 }, { "epoch": 0.6983930778739185, "grad_norm": 0.796875, "learning_rate": 8.807692307692307e-05, "loss": 1.2766, "step": 2825 }, { "epoch": 0.7045735475896168, "grad_norm": 0.81640625, "learning_rate": 8.794871794871795e-05, "loss": 1.2693, "step": 2850 }, { "epoch": 0.7107540173053152, "grad_norm": 0.83203125, "learning_rate": 8.782051282051283e-05, "loss": 1.263, "step": 2875 }, { "epoch": 0.7169344870210136, "grad_norm": 0.80859375, "learning_rate": 8.76923076923077e-05, "loss": 1.2618, "step": 2900 }, { "epoch": 0.723114956736712, "grad_norm": 0.77734375, "learning_rate": 8.756410256410257e-05, "loss": 1.2626, "step": 2925 }, { "epoch": 0.7292954264524104, "grad_norm": 0.8203125, "learning_rate": 8.743589743589744e-05, "loss": 1.2644, "step": 2950 }, { "epoch": 0.7354758961681088, "grad_norm": 0.7734375, "learning_rate": 8.730769230769232e-05, "loss": 1.2623, "step": 2975 }, { "epoch": 0.7416563658838071, "grad_norm": 0.77734375, "learning_rate": 8.717948717948718e-05, "loss": 1.2633, "step": 3000 }, { "epoch": 0.7478368355995055, "grad_norm": 0.75, "learning_rate": 8.705128205128206e-05, "loss": 1.261, "step": 3025 }, { "epoch": 0.754017305315204, "grad_norm": 0.7890625, "learning_rate": 8.692307692307692e-05, "loss": 1.259, "step": 3050 }, { "epoch": 0.7601977750309024, "grad_norm": 0.78515625, "learning_rate": 8.67948717948718e-05, "loss": 1.2546, "step": 3075 }, { "epoch": 0.7663782447466008, "grad_norm": 0.71875, "learning_rate": 8.666666666666667e-05, "loss": 1.2551, "step": 3100 }, { "epoch": 0.7725587144622992, "grad_norm": 0.71484375, "learning_rate": 8.653846153846155e-05, "loss": 1.2565, "step": 3125 }, { "epoch": 0.7787391841779975, "grad_norm": 0.76953125, "learning_rate": 8.641025641025642e-05, "loss": 1.2559, "step": 3150 }, { "epoch": 0.7849196538936959, "grad_norm": 0.76953125, "learning_rate": 8.628205128205129e-05, "loss": 1.253, "step": 3175 }, { "epoch": 0.7911001236093943, "grad_norm": 0.83203125, "learning_rate": 8.615384615384617e-05, "loss": 1.255, "step": 3200 }, { "epoch": 0.7972805933250927, "grad_norm": 0.72265625, "learning_rate": 8.602564102564103e-05, "loss": 1.2486, "step": 3225 }, { "epoch": 0.8034610630407911, "grad_norm": 0.74609375, "learning_rate": 8.58974358974359e-05, "loss": 1.24, "step": 3250 }, { "epoch": 0.8096415327564895, "grad_norm": 0.8125, "learning_rate": 8.576923076923076e-05, "loss": 1.2466, "step": 3275 }, { "epoch": 0.8158220024721878, "grad_norm": 0.765625, "learning_rate": 8.564102564102564e-05, "loss": 1.2455, "step": 3300 }, { "epoch": 0.8220024721878862, "grad_norm": 0.76171875, "learning_rate": 8.551282051282052e-05, "loss": 1.2409, "step": 3325 }, { "epoch": 0.8281829419035847, "grad_norm": 0.875, "learning_rate": 8.538461538461538e-05, "loss": 1.237, "step": 3350 }, { "epoch": 0.8343634116192831, "grad_norm": 0.82421875, "learning_rate": 8.525641025641026e-05, "loss": 1.2437, "step": 3375 }, { "epoch": 0.8405438813349815, "grad_norm": 0.73828125, "learning_rate": 8.512820512820513e-05, "loss": 1.2388, "step": 3400 }, { "epoch": 0.8467243510506799, "grad_norm": 0.8671875, "learning_rate": 8.5e-05, "loss": 1.238, "step": 3425 }, { "epoch": 0.8529048207663782, "grad_norm": 0.76171875, "learning_rate": 8.487179487179487e-05, "loss": 1.231, "step": 3450 }, { "epoch": 0.8590852904820766, "grad_norm": 0.7421875, "learning_rate": 8.474358974358975e-05, "loss": 1.2381, "step": 3475 }, { "epoch": 0.865265760197775, "grad_norm": 0.75390625, "learning_rate": 8.461538461538461e-05, "loss": 1.2305, "step": 3500 }, { "epoch": 0.8714462299134734, "grad_norm": 0.70703125, "learning_rate": 8.448717948717949e-05, "loss": 1.2326, "step": 3525 }, { "epoch": 0.8776266996291718, "grad_norm": 0.8046875, "learning_rate": 8.435897435897436e-05, "loss": 1.2288, "step": 3550 }, { "epoch": 0.8838071693448702, "grad_norm": 0.828125, "learning_rate": 8.423076923076924e-05, "loss": 1.2268, "step": 3575 }, { "epoch": 0.8899876390605687, "grad_norm": 0.7421875, "learning_rate": 8.410256410256411e-05, "loss": 1.2314, "step": 3600 }, { "epoch": 0.896168108776267, "grad_norm": 0.78515625, "learning_rate": 8.397435897435898e-05, "loss": 1.2292, "step": 3625 }, { "epoch": 0.9023485784919654, "grad_norm": 0.7109375, "learning_rate": 8.384615384615386e-05, "loss": 1.2304, "step": 3650 }, { "epoch": 0.9085290482076638, "grad_norm": 0.765625, "learning_rate": 8.371794871794872e-05, "loss": 1.2256, "step": 3675 }, { "epoch": 0.9147095179233622, "grad_norm": 0.765625, "learning_rate": 8.35897435897436e-05, "loss": 1.2268, "step": 3700 }, { "epoch": 0.9208899876390606, "grad_norm": 0.78515625, "learning_rate": 8.346153846153847e-05, "loss": 1.2287, "step": 3725 }, { "epoch": 0.927070457354759, "grad_norm": 0.72265625, "learning_rate": 8.333333333333334e-05, "loss": 1.2252, "step": 3750 }, { "epoch": 0.9332509270704573, "grad_norm": 0.78515625, "learning_rate": 8.320512820512821e-05, "loss": 1.225, "step": 3775 }, { "epoch": 0.9394313967861557, "grad_norm": 0.7734375, "learning_rate": 8.307692307692309e-05, "loss": 1.2183, "step": 3800 }, { "epoch": 0.9456118665018541, "grad_norm": 0.6796875, "learning_rate": 8.294871794871795e-05, "loss": 1.2244, "step": 3825 }, { "epoch": 0.9517923362175525, "grad_norm": 0.71484375, "learning_rate": 8.282051282051283e-05, "loss": 1.2194, "step": 3850 }, { "epoch": 0.957972805933251, "grad_norm": 0.75, "learning_rate": 8.26923076923077e-05, "loss": 1.2162, "step": 3875 }, { "epoch": 0.9641532756489494, "grad_norm": 0.73828125, "learning_rate": 8.256410256410256e-05, "loss": 1.2146, "step": 3900 }, { "epoch": 0.9703337453646477, "grad_norm": 0.76953125, "learning_rate": 8.243589743589744e-05, "loss": 1.2144, "step": 3925 }, { "epoch": 0.9765142150803461, "grad_norm": 0.73828125, "learning_rate": 8.23076923076923e-05, "loss": 1.2146, "step": 3950 }, { "epoch": 0.9826946847960445, "grad_norm": 0.70703125, "learning_rate": 8.217948717948718e-05, "loss": 1.2151, "step": 3975 }, { "epoch": 0.9888751545117429, "grad_norm": 0.72265625, "learning_rate": 8.205128205128205e-05, "loss": 1.2146, "step": 4000 }, { "epoch": 0.9950556242274413, "grad_norm": 0.91796875, "learning_rate": 8.192307692307693e-05, "loss": 1.211, "step": 4025 }, { "epoch": 1.0012360939431397, "grad_norm": 0.88671875, "learning_rate": 8.179487179487179e-05, "loss": 1.198, "step": 4050 }, { "epoch": 1.0074165636588381, "grad_norm": 0.8046875, "learning_rate": 8.166666666666667e-05, "loss": 1.1373, "step": 4075 }, { "epoch": 1.0135970333745365, "grad_norm": 0.84375, "learning_rate": 8.153846153846155e-05, "loss": 1.1436, "step": 4100 }, { "epoch": 1.019777503090235, "grad_norm": 0.7265625, "learning_rate": 8.141025641025641e-05, "loss": 1.138, "step": 4125 }, { "epoch": 1.0259579728059331, "grad_norm": 0.80078125, "learning_rate": 8.128205128205129e-05, "loss": 1.1334, "step": 4150 }, { "epoch": 1.0321384425216316, "grad_norm": 0.828125, "learning_rate": 8.115384615384616e-05, "loss": 1.1423, "step": 4175 }, { "epoch": 1.03831891223733, "grad_norm": 0.75390625, "learning_rate": 8.102564102564103e-05, "loss": 1.1398, "step": 4200 }, { "epoch": 1.0444993819530284, "grad_norm": 0.7890625, "learning_rate": 8.08974358974359e-05, "loss": 1.1378, "step": 4225 }, { "epoch": 1.0506798516687268, "grad_norm": 0.76953125, "learning_rate": 8.076923076923078e-05, "loss": 1.1376, "step": 4250 }, { "epoch": 1.0568603213844252, "grad_norm": 0.80078125, "learning_rate": 8.064102564102564e-05, "loss": 1.1443, "step": 4275 }, { "epoch": 1.0630407911001236, "grad_norm": 0.828125, "learning_rate": 8.051282051282052e-05, "loss": 1.1441, "step": 4300 }, { "epoch": 1.069221260815822, "grad_norm": 0.77734375, "learning_rate": 8.038461538461538e-05, "loss": 1.1435, "step": 4325 }, { "epoch": 1.0754017305315204, "grad_norm": 0.73828125, "learning_rate": 8.025641025641026e-05, "loss": 1.1449, "step": 4350 }, { "epoch": 1.0815822002472189, "grad_norm": 0.80078125, "learning_rate": 8.012820512820514e-05, "loss": 1.1468, "step": 4375 }, { "epoch": 1.0877626699629173, "grad_norm": 0.796875, "learning_rate": 8e-05, "loss": 1.1402, "step": 4400 }, { "epoch": 1.0939431396786157, "grad_norm": 0.82421875, "learning_rate": 7.987179487179488e-05, "loss": 1.1405, "step": 4425 }, { "epoch": 1.100123609394314, "grad_norm": 0.7421875, "learning_rate": 7.974358974358975e-05, "loss": 1.1441, "step": 4450 }, { "epoch": 1.1063040791100123, "grad_norm": 0.74609375, "learning_rate": 7.961538461538461e-05, "loss": 1.1486, "step": 4475 }, { "epoch": 1.1124845488257107, "grad_norm": 0.7890625, "learning_rate": 7.948717948717948e-05, "loss": 1.1439, "step": 4500 }, { "epoch": 1.118665018541409, "grad_norm": 0.73828125, "learning_rate": 7.935897435897436e-05, "loss": 1.1408, "step": 4525 }, { "epoch": 1.1248454882571075, "grad_norm": 0.74609375, "learning_rate": 7.923076923076924e-05, "loss": 1.1381, "step": 4550 }, { "epoch": 1.131025957972806, "grad_norm": 0.77734375, "learning_rate": 7.91025641025641e-05, "loss": 1.1441, "step": 4575 }, { "epoch": 1.1372064276885043, "grad_norm": 0.80078125, "learning_rate": 7.897435897435898e-05, "loss": 1.1377, "step": 4600 }, { "epoch": 1.1433868974042027, "grad_norm": 0.7734375, "learning_rate": 7.884615384615384e-05, "loss": 1.1405, "step": 4625 }, { "epoch": 1.1495673671199012, "grad_norm": 0.734375, "learning_rate": 7.871794871794872e-05, "loss": 1.146, "step": 4650 }, { "epoch": 1.1557478368355996, "grad_norm": 0.8125, "learning_rate": 7.858974358974359e-05, "loss": 1.1416, "step": 4675 }, { "epoch": 1.161928306551298, "grad_norm": 0.84375, "learning_rate": 7.846153846153847e-05, "loss": 1.1352, "step": 4700 }, { "epoch": 1.1681087762669964, "grad_norm": 0.83984375, "learning_rate": 7.833333333333333e-05, "loss": 1.1398, "step": 4725 }, { "epoch": 1.1742892459826946, "grad_norm": 0.75390625, "learning_rate": 7.820512820512821e-05, "loss": 1.1422, "step": 4750 }, { "epoch": 1.180469715698393, "grad_norm": 0.72265625, "learning_rate": 7.807692307692307e-05, "loss": 1.1416, "step": 4775 }, { "epoch": 1.1866501854140914, "grad_norm": 0.73828125, "learning_rate": 7.794871794871795e-05, "loss": 1.1416, "step": 4800 }, { "epoch": 1.1928306551297898, "grad_norm": 0.80078125, "learning_rate": 7.782051282051283e-05, "loss": 1.1387, "step": 4825 }, { "epoch": 1.1990111248454882, "grad_norm": 0.7890625, "learning_rate": 7.76923076923077e-05, "loss": 1.1372, "step": 4850 }, { "epoch": 1.2051915945611866, "grad_norm": 0.75, "learning_rate": 7.756410256410257e-05, "loss": 1.1381, "step": 4875 }, { "epoch": 1.211372064276885, "grad_norm": 0.72265625, "learning_rate": 7.743589743589744e-05, "loss": 1.1342, "step": 4900 }, { "epoch": 1.2175525339925835, "grad_norm": 0.71875, "learning_rate": 7.730769230769232e-05, "loss": 1.1355, "step": 4925 }, { "epoch": 1.2237330037082819, "grad_norm": 0.75390625, "learning_rate": 7.717948717948718e-05, "loss": 1.1406, "step": 4950 }, { "epoch": 1.2299134734239803, "grad_norm": 0.78125, "learning_rate": 7.705128205128206e-05, "loss": 1.1411, "step": 4975 }, { "epoch": 1.2360939431396787, "grad_norm": 0.7578125, "learning_rate": 7.692307692307693e-05, "loss": 1.135, "step": 5000 }, { "epoch": 1.2360939431396787, "eval_loss": 1.048352837562561, "eval_runtime": 1.5386, "eval_samples_per_second": 415.323, "eval_steps_per_second": 1.95, "step": 5000 }, { "epoch": 1.242274412855377, "grad_norm": 0.8203125, "learning_rate": 7.67948717948718e-05, "loss": 1.1348, "step": 5025 }, { "epoch": 1.2484548825710755, "grad_norm": 0.7578125, "learning_rate": 7.666666666666667e-05, "loss": 1.1338, "step": 5050 }, { "epoch": 1.254635352286774, "grad_norm": 0.8359375, "learning_rate": 7.653846153846153e-05, "loss": 1.1291, "step": 5075 }, { "epoch": 1.260815822002472, "grad_norm": 0.7734375, "learning_rate": 7.641025641025641e-05, "loss": 1.1325, "step": 5100 }, { "epoch": 1.2669962917181705, "grad_norm": 0.7265625, "learning_rate": 7.628205128205128e-05, "loss": 1.1359, "step": 5125 }, { "epoch": 1.273176761433869, "grad_norm": 0.78125, "learning_rate": 7.615384615384616e-05, "loss": 1.1337, "step": 5150 }, { "epoch": 1.2793572311495673, "grad_norm": 0.7890625, "learning_rate": 7.602564102564102e-05, "loss": 1.132, "step": 5175 }, { "epoch": 1.2855377008652658, "grad_norm": 0.734375, "learning_rate": 7.58974358974359e-05, "loss": 1.1308, "step": 5200 }, { "epoch": 1.2917181705809642, "grad_norm": 0.75, "learning_rate": 7.576923076923076e-05, "loss": 1.1291, "step": 5225 }, { "epoch": 1.2978986402966626, "grad_norm": 0.7109375, "learning_rate": 7.564102564102564e-05, "loss": 1.1248, "step": 5250 }, { "epoch": 1.304079110012361, "grad_norm": 0.78515625, "learning_rate": 7.551282051282052e-05, "loss": 1.1262, "step": 5275 }, { "epoch": 1.3102595797280594, "grad_norm": 0.78125, "learning_rate": 7.538461538461539e-05, "loss": 1.1246, "step": 5300 }, { "epoch": 1.3164400494437576, "grad_norm": 0.75390625, "learning_rate": 7.525641025641026e-05, "loss": 1.1234, "step": 5325 }, { "epoch": 1.322620519159456, "grad_norm": 0.734375, "learning_rate": 7.512820512820513e-05, "loss": 1.1252, "step": 5350 }, { "epoch": 1.3288009888751544, "grad_norm": 0.78125, "learning_rate": 7.500000000000001e-05, "loss": 1.1288, "step": 5375 }, { "epoch": 1.3349814585908528, "grad_norm": 0.7265625, "learning_rate": 7.487179487179487e-05, "loss": 1.1227, "step": 5400 }, { "epoch": 1.3411619283065512, "grad_norm": 0.72265625, "learning_rate": 7.474358974358975e-05, "loss": 1.1233, "step": 5425 }, { "epoch": 1.3473423980222496, "grad_norm": 0.73828125, "learning_rate": 7.461538461538462e-05, "loss": 1.1233, "step": 5450 }, { "epoch": 1.353522867737948, "grad_norm": 0.859375, "learning_rate": 7.44871794871795e-05, "loss": 1.1248, "step": 5475 }, { "epoch": 1.3597033374536465, "grad_norm": 0.734375, "learning_rate": 7.435897435897436e-05, "loss": 1.1264, "step": 5500 }, { "epoch": 1.3658838071693449, "grad_norm": 0.703125, "learning_rate": 7.423076923076924e-05, "loss": 1.1215, "step": 5525 }, { "epoch": 1.3720642768850433, "grad_norm": 0.69921875, "learning_rate": 7.410256410256412e-05, "loss": 1.1237, "step": 5550 }, { "epoch": 1.3782447466007417, "grad_norm": 0.734375, "learning_rate": 7.397435897435898e-05, "loss": 1.1212, "step": 5575 }, { "epoch": 1.38442521631644, "grad_norm": 0.7578125, "learning_rate": 7.384615384615386e-05, "loss": 1.118, "step": 5600 }, { "epoch": 1.3906056860321385, "grad_norm": 0.73828125, "learning_rate": 7.371794871794872e-05, "loss": 1.1163, "step": 5625 }, { "epoch": 1.396786155747837, "grad_norm": 0.73046875, "learning_rate": 7.35897435897436e-05, "loss": 1.1168, "step": 5650 }, { "epoch": 1.4029666254635353, "grad_norm": 0.66796875, "learning_rate": 7.346153846153847e-05, "loss": 1.118, "step": 5675 }, { "epoch": 1.4091470951792338, "grad_norm": 0.79296875, "learning_rate": 7.333333333333333e-05, "loss": 1.1153, "step": 5700 }, { "epoch": 1.415327564894932, "grad_norm": 0.75390625, "learning_rate": 7.320512820512821e-05, "loss": 1.1237, "step": 5725 }, { "epoch": 1.4215080346106304, "grad_norm": 0.8828125, "learning_rate": 7.307692307692307e-05, "loss": 1.1225, "step": 5750 }, { "epoch": 1.4276885043263288, "grad_norm": 0.765625, "learning_rate": 7.294871794871795e-05, "loss": 1.1224, "step": 5775 }, { "epoch": 1.4338689740420272, "grad_norm": 0.76171875, "learning_rate": 7.282051282051282e-05, "loss": 1.1187, "step": 5800 }, { "epoch": 1.4400494437577256, "grad_norm": 0.703125, "learning_rate": 7.26923076923077e-05, "loss": 1.1186, "step": 5825 }, { "epoch": 1.446229913473424, "grad_norm": 0.79296875, "learning_rate": 7.256410256410256e-05, "loss": 1.1164, "step": 5850 }, { "epoch": 1.4524103831891224, "grad_norm": 0.76953125, "learning_rate": 7.243589743589744e-05, "loss": 1.1194, "step": 5875 }, { "epoch": 1.4585908529048208, "grad_norm": 0.7734375, "learning_rate": 7.23076923076923e-05, "loss": 1.1164, "step": 5900 }, { "epoch": 1.4647713226205192, "grad_norm": 0.7109375, "learning_rate": 7.217948717948718e-05, "loss": 1.1159, "step": 5925 }, { "epoch": 1.4709517923362174, "grad_norm": 0.75390625, "learning_rate": 7.205128205128205e-05, "loss": 1.1184, "step": 5950 }, { "epoch": 1.4771322620519158, "grad_norm": 0.734375, "learning_rate": 7.192307692307693e-05, "loss": 1.1136, "step": 5975 }, { "epoch": 1.4833127317676142, "grad_norm": 0.77734375, "learning_rate": 7.17948717948718e-05, "loss": 1.1086, "step": 6000 }, { "epoch": 1.4894932014833127, "grad_norm": 0.78125, "learning_rate": 7.166666666666667e-05, "loss": 1.1098, "step": 6025 }, { "epoch": 1.495673671199011, "grad_norm": 0.8046875, "learning_rate": 7.153846153846155e-05, "loss": 1.1132, "step": 6050 }, { "epoch": 1.5018541409147095, "grad_norm": 0.73828125, "learning_rate": 7.141025641025641e-05, "loss": 1.1044, "step": 6075 }, { "epoch": 1.508034610630408, "grad_norm": 0.74609375, "learning_rate": 7.128205128205129e-05, "loss": 1.1081, "step": 6100 }, { "epoch": 1.5142150803461063, "grad_norm": 0.74609375, "learning_rate": 7.115384615384616e-05, "loss": 1.1109, "step": 6125 }, { "epoch": 1.5203955500618047, "grad_norm": 0.78125, "learning_rate": 7.102564102564103e-05, "loss": 1.1085, "step": 6150 }, { "epoch": 1.5265760197775031, "grad_norm": 0.80859375, "learning_rate": 7.08974358974359e-05, "loss": 1.1143, "step": 6175 }, { "epoch": 1.5327564894932015, "grad_norm": 0.73046875, "learning_rate": 7.076923076923078e-05, "loss": 1.106, "step": 6200 }, { "epoch": 1.5389369592089, "grad_norm": 0.7421875, "learning_rate": 7.064102564102564e-05, "loss": 1.1094, "step": 6225 }, { "epoch": 1.5451174289245984, "grad_norm": 0.8203125, "learning_rate": 7.051282051282052e-05, "loss": 1.1063, "step": 6250 }, { "epoch": 1.5512978986402968, "grad_norm": 0.7265625, "learning_rate": 7.03846153846154e-05, "loss": 1.105, "step": 6275 }, { "epoch": 1.5574783683559952, "grad_norm": 0.7421875, "learning_rate": 7.025641025641025e-05, "loss": 1.1063, "step": 6300 }, { "epoch": 1.5636588380716936, "grad_norm": 0.7421875, "learning_rate": 7.012820512820513e-05, "loss": 1.1029, "step": 6325 }, { "epoch": 1.569839307787392, "grad_norm": 0.78515625, "learning_rate": 7e-05, "loss": 1.105, "step": 6350 }, { "epoch": 1.5760197775030902, "grad_norm": 0.75, "learning_rate": 6.987179487179487e-05, "loss": 1.1036, "step": 6375 }, { "epoch": 1.5822002472187886, "grad_norm": 0.73828125, "learning_rate": 6.974358974358974e-05, "loss": 1.1072, "step": 6400 }, { "epoch": 1.588380716934487, "grad_norm": 0.74609375, "learning_rate": 6.961538461538462e-05, "loss": 1.098, "step": 6425 }, { "epoch": 1.5945611866501854, "grad_norm": 0.7421875, "learning_rate": 6.94871794871795e-05, "loss": 1.101, "step": 6450 }, { "epoch": 1.6007416563658838, "grad_norm": 0.78515625, "learning_rate": 6.935897435897436e-05, "loss": 1.1008, "step": 6475 }, { "epoch": 1.6069221260815822, "grad_norm": 0.796875, "learning_rate": 6.923076923076924e-05, "loss": 1.1, "step": 6500 }, { "epoch": 1.6131025957972804, "grad_norm": 0.7265625, "learning_rate": 6.91025641025641e-05, "loss": 1.0997, "step": 6525 }, { "epoch": 1.6192830655129788, "grad_norm": 0.7890625, "learning_rate": 6.897435897435898e-05, "loss": 1.1005, "step": 6550 }, { "epoch": 1.6254635352286773, "grad_norm": 0.7421875, "learning_rate": 6.884615384615385e-05, "loss": 1.0953, "step": 6575 }, { "epoch": 1.6316440049443757, "grad_norm": 0.71875, "learning_rate": 6.871794871794872e-05, "loss": 1.0999, "step": 6600 }, { "epoch": 1.637824474660074, "grad_norm": 0.7265625, "learning_rate": 6.858974358974359e-05, "loss": 1.0965, "step": 6625 }, { "epoch": 1.6440049443757725, "grad_norm": 0.73828125, "learning_rate": 6.846153846153847e-05, "loss": 1.098, "step": 6650 }, { "epoch": 1.650185414091471, "grad_norm": 0.82421875, "learning_rate": 6.833333333333333e-05, "loss": 1.09, "step": 6675 }, { "epoch": 1.6563658838071693, "grad_norm": 0.828125, "learning_rate": 6.820512820512821e-05, "loss": 1.0992, "step": 6700 }, { "epoch": 1.6625463535228677, "grad_norm": 0.78515625, "learning_rate": 6.807692307692309e-05, "loss": 1.0948, "step": 6725 }, { "epoch": 1.6687268232385661, "grad_norm": 0.75, "learning_rate": 6.794871794871795e-05, "loss": 1.098, "step": 6750 }, { "epoch": 1.6749072929542645, "grad_norm": 0.75, "learning_rate": 6.782051282051283e-05, "loss": 1.0915, "step": 6775 }, { "epoch": 1.681087762669963, "grad_norm": 0.75, "learning_rate": 6.76923076923077e-05, "loss": 1.0943, "step": 6800 }, { "epoch": 1.6872682323856614, "grad_norm": 0.6953125, "learning_rate": 6.756410256410258e-05, "loss": 1.0913, "step": 6825 }, { "epoch": 1.6934487021013598, "grad_norm": 0.7265625, "learning_rate": 6.743589743589744e-05, "loss": 1.092, "step": 6850 }, { "epoch": 1.6996291718170582, "grad_norm": 0.8125, "learning_rate": 6.730769230769232e-05, "loss": 1.0904, "step": 6875 }, { "epoch": 1.7058096415327566, "grad_norm": 0.81640625, "learning_rate": 6.717948717948718e-05, "loss": 1.091, "step": 6900 }, { "epoch": 1.711990111248455, "grad_norm": 0.7421875, "learning_rate": 6.705128205128205e-05, "loss": 1.0901, "step": 6925 }, { "epoch": 1.7181705809641534, "grad_norm": 0.765625, "learning_rate": 6.692307692307693e-05, "loss": 1.0885, "step": 6950 }, { "epoch": 1.7243510506798516, "grad_norm": 0.74609375, "learning_rate": 6.679487179487179e-05, "loss": 1.0897, "step": 6975 }, { "epoch": 1.73053152039555, "grad_norm": 0.83203125, "learning_rate": 6.666666666666667e-05, "loss": 1.0888, "step": 7000 }, { "epoch": 1.7367119901112484, "grad_norm": 0.7421875, "learning_rate": 6.653846153846153e-05, "loss": 1.0887, "step": 7025 }, { "epoch": 1.7428924598269468, "grad_norm": 0.7421875, "learning_rate": 6.641025641025641e-05, "loss": 1.0857, "step": 7050 }, { "epoch": 1.7490729295426453, "grad_norm": 0.8359375, "learning_rate": 6.628205128205128e-05, "loss": 1.089, "step": 7075 }, { "epoch": 1.7552533992583437, "grad_norm": 0.75390625, "learning_rate": 6.615384615384616e-05, "loss": 1.0866, "step": 7100 }, { "epoch": 1.7614338689740419, "grad_norm": 0.73046875, "learning_rate": 6.602564102564102e-05, "loss": 1.0871, "step": 7125 }, { "epoch": 1.7676143386897403, "grad_norm": 0.75, "learning_rate": 6.58974358974359e-05, "loss": 1.0779, "step": 7150 }, { "epoch": 1.7737948084054387, "grad_norm": 0.89453125, "learning_rate": 6.576923076923078e-05, "loss": 1.0852, "step": 7175 }, { "epoch": 1.779975278121137, "grad_norm": 0.7890625, "learning_rate": 6.564102564102564e-05, "loss": 1.0836, "step": 7200 }, { "epoch": 1.7861557478368355, "grad_norm": 0.78515625, "learning_rate": 6.551282051282052e-05, "loss": 1.0843, "step": 7225 }, { "epoch": 1.792336217552534, "grad_norm": 0.7265625, "learning_rate": 6.538461538461539e-05, "loss": 1.0802, "step": 7250 }, { "epoch": 1.7985166872682323, "grad_norm": 0.7421875, "learning_rate": 6.525641025641026e-05, "loss": 1.083, "step": 7275 }, { "epoch": 1.8046971569839307, "grad_norm": 0.79296875, "learning_rate": 6.512820512820513e-05, "loss": 1.0798, "step": 7300 }, { "epoch": 1.8108776266996292, "grad_norm": 0.72265625, "learning_rate": 6.500000000000001e-05, "loss": 1.0823, "step": 7325 }, { "epoch": 1.8170580964153276, "grad_norm": 0.734375, "learning_rate": 6.487179487179487e-05, "loss": 1.0814, "step": 7350 }, { "epoch": 1.823238566131026, "grad_norm": 0.78125, "learning_rate": 6.474358974358975e-05, "loss": 1.0827, "step": 7375 }, { "epoch": 1.8294190358467244, "grad_norm": 0.71484375, "learning_rate": 6.461538461538462e-05, "loss": 1.0827, "step": 7400 }, { "epoch": 1.8355995055624228, "grad_norm": 0.78125, "learning_rate": 6.44871794871795e-05, "loss": 1.0799, "step": 7425 }, { "epoch": 1.8417799752781212, "grad_norm": 0.71484375, "learning_rate": 6.435897435897437e-05, "loss": 1.0783, "step": 7450 }, { "epoch": 1.8479604449938196, "grad_norm": 0.7890625, "learning_rate": 6.423076923076924e-05, "loss": 1.0758, "step": 7475 }, { "epoch": 1.854140914709518, "grad_norm": 0.78515625, "learning_rate": 6.410256410256412e-05, "loss": 1.078, "step": 7500 }, { "epoch": 1.8603213844252164, "grad_norm": 0.7421875, "learning_rate": 6.397435897435897e-05, "loss": 1.0715, "step": 7525 }, { "epoch": 1.8665018541409149, "grad_norm": 0.734375, "learning_rate": 6.384615384615385e-05, "loss": 1.0761, "step": 7550 }, { "epoch": 1.8726823238566133, "grad_norm": 0.7734375, "learning_rate": 6.371794871794871e-05, "loss": 1.076, "step": 7575 }, { "epoch": 1.8788627935723115, "grad_norm": 0.85546875, "learning_rate": 6.358974358974359e-05, "loss": 1.0732, "step": 7600 }, { "epoch": 1.8850432632880099, "grad_norm": 0.76953125, "learning_rate": 6.346153846153847e-05, "loss": 1.073, "step": 7625 }, { "epoch": 1.8912237330037083, "grad_norm": 0.75, "learning_rate": 6.333333333333333e-05, "loss": 1.07, "step": 7650 }, { "epoch": 1.8974042027194067, "grad_norm": 0.74609375, "learning_rate": 6.320512820512821e-05, "loss": 1.0739, "step": 7675 }, { "epoch": 1.903584672435105, "grad_norm": 0.80078125, "learning_rate": 6.307692307692308e-05, "loss": 1.071, "step": 7700 }, { "epoch": 1.9097651421508035, "grad_norm": 0.77734375, "learning_rate": 6.294871794871795e-05, "loss": 1.0706, "step": 7725 }, { "epoch": 1.9159456118665017, "grad_norm": 0.734375, "learning_rate": 6.282051282051282e-05, "loss": 1.0725, "step": 7750 }, { "epoch": 1.9221260815822, "grad_norm": 0.72265625, "learning_rate": 6.26923076923077e-05, "loss": 1.0666, "step": 7775 }, { "epoch": 1.9283065512978985, "grad_norm": 0.7890625, "learning_rate": 6.256410256410256e-05, "loss": 1.0661, "step": 7800 }, { "epoch": 1.934487021013597, "grad_norm": 0.71484375, "learning_rate": 6.243589743589744e-05, "loss": 1.0677, "step": 7825 }, { "epoch": 1.9406674907292953, "grad_norm": 0.734375, "learning_rate": 6.23076923076923e-05, "loss": 1.0683, "step": 7850 }, { "epoch": 1.9468479604449938, "grad_norm": 0.80859375, "learning_rate": 6.217948717948718e-05, "loss": 1.0676, "step": 7875 }, { "epoch": 1.9530284301606922, "grad_norm": 0.78125, "learning_rate": 6.205128205128206e-05, "loss": 1.068, "step": 7900 }, { "epoch": 1.9592088998763906, "grad_norm": 0.75390625, "learning_rate": 6.192307692307693e-05, "loss": 1.0662, "step": 7925 }, { "epoch": 1.965389369592089, "grad_norm": 0.84375, "learning_rate": 6.17948717948718e-05, "loss": 1.0653, "step": 7950 }, { "epoch": 1.9715698393077874, "grad_norm": 0.80859375, "learning_rate": 6.166666666666667e-05, "loss": 1.0629, "step": 7975 }, { "epoch": 1.9777503090234858, "grad_norm": 0.7265625, "learning_rate": 6.153846153846155e-05, "loss": 1.0669, "step": 8000 }, { "epoch": 1.9839307787391842, "grad_norm": 0.71875, "learning_rate": 6.141025641025641e-05, "loss": 1.0648, "step": 8025 }, { "epoch": 1.9901112484548826, "grad_norm": 0.734375, "learning_rate": 6.128205128205129e-05, "loss": 1.0646, "step": 8050 }, { "epoch": 1.996291718170581, "grad_norm": 0.8125, "learning_rate": 6.115384615384616e-05, "loss": 1.0591, "step": 8075 }, { "epoch": 2.0024721878862795, "grad_norm": 0.79296875, "learning_rate": 6.1025641025641035e-05, "loss": 1.0239, "step": 8100 }, { "epoch": 2.008652657601978, "grad_norm": 0.73046875, "learning_rate": 6.089743589743589e-05, "loss": 0.9635, "step": 8125 }, { "epoch": 2.0148331273176763, "grad_norm": 0.8828125, "learning_rate": 6.0769230769230765e-05, "loss": 0.9628, "step": 8150 }, { "epoch": 2.0210135970333747, "grad_norm": 0.78125, "learning_rate": 6.0641025641025637e-05, "loss": 0.9707, "step": 8175 }, { "epoch": 2.027194066749073, "grad_norm": 0.78125, "learning_rate": 6.0512820512820515e-05, "loss": 0.9658, "step": 8200 }, { "epoch": 2.0333745364647715, "grad_norm": 0.79296875, "learning_rate": 6.038461538461539e-05, "loss": 0.9695, "step": 8225 }, { "epoch": 2.03955500618047, "grad_norm": 0.8046875, "learning_rate": 6.025641025641026e-05, "loss": 0.9714, "step": 8250 }, { "epoch": 2.0457354758961683, "grad_norm": 0.78515625, "learning_rate": 6.012820512820513e-05, "loss": 0.973, "step": 8275 }, { "epoch": 2.0519159456118663, "grad_norm": 0.80078125, "learning_rate": 6e-05, "loss": 0.9738, "step": 8300 }, { "epoch": 2.0580964153275647, "grad_norm": 0.7734375, "learning_rate": 5.987179487179487e-05, "loss": 0.9714, "step": 8325 }, { "epoch": 2.064276885043263, "grad_norm": 0.76953125, "learning_rate": 5.9743589743589745e-05, "loss": 0.9718, "step": 8350 }, { "epoch": 2.0704573547589615, "grad_norm": 0.82421875, "learning_rate": 5.9615384615384616e-05, "loss": 0.9768, "step": 8375 }, { "epoch": 2.07663782447466, "grad_norm": 0.81640625, "learning_rate": 5.948717948717949e-05, "loss": 0.9756, "step": 8400 }, { "epoch": 2.0828182941903584, "grad_norm": 0.8125, "learning_rate": 5.935897435897436e-05, "loss": 0.9768, "step": 8425 }, { "epoch": 2.0889987639060568, "grad_norm": 0.8046875, "learning_rate": 5.923076923076923e-05, "loss": 0.9755, "step": 8450 }, { "epoch": 2.095179233621755, "grad_norm": 0.77734375, "learning_rate": 5.910256410256411e-05, "loss": 0.9758, "step": 8475 }, { "epoch": 2.1013597033374536, "grad_norm": 0.8125, "learning_rate": 5.897435897435898e-05, "loss": 0.9762, "step": 8500 }, { "epoch": 2.107540173053152, "grad_norm": 0.75, "learning_rate": 5.884615384615385e-05, "loss": 0.9774, "step": 8525 }, { "epoch": 2.1137206427688504, "grad_norm": 0.76953125, "learning_rate": 5.8717948717948725e-05, "loss": 0.9795, "step": 8550 }, { "epoch": 2.119901112484549, "grad_norm": 0.796875, "learning_rate": 5.8589743589743596e-05, "loss": 0.9762, "step": 8575 }, { "epoch": 2.1260815822002472, "grad_norm": 0.79296875, "learning_rate": 5.846153846153847e-05, "loss": 0.9808, "step": 8600 }, { "epoch": 2.1322620519159456, "grad_norm": 0.88671875, "learning_rate": 5.833333333333334e-05, "loss": 0.9781, "step": 8625 }, { "epoch": 2.138442521631644, "grad_norm": 0.76953125, "learning_rate": 5.820512820512821e-05, "loss": 0.9772, "step": 8650 }, { "epoch": 2.1446229913473425, "grad_norm": 0.79296875, "learning_rate": 5.807692307692308e-05, "loss": 0.9774, "step": 8675 }, { "epoch": 2.150803461063041, "grad_norm": 0.8125, "learning_rate": 5.7948717948717954e-05, "loss": 0.9764, "step": 8700 }, { "epoch": 2.1569839307787393, "grad_norm": 0.8046875, "learning_rate": 5.7820512820512826e-05, "loss": 0.9781, "step": 8725 }, { "epoch": 2.1631644004944377, "grad_norm": 0.81640625, "learning_rate": 5.769230769230769e-05, "loss": 0.9748, "step": 8750 }, { "epoch": 2.169344870210136, "grad_norm": 0.77734375, "learning_rate": 5.756410256410256e-05, "loss": 0.9762, "step": 8775 }, { "epoch": 2.1755253399258345, "grad_norm": 0.79296875, "learning_rate": 5.7435897435897434e-05, "loss": 0.9789, "step": 8800 }, { "epoch": 2.181705809641533, "grad_norm": 0.82421875, "learning_rate": 5.7307692307692306e-05, "loss": 0.9764, "step": 8825 }, { "epoch": 2.1878862793572313, "grad_norm": 0.78515625, "learning_rate": 5.717948717948718e-05, "loss": 0.9767, "step": 8850 }, { "epoch": 2.1940667490729293, "grad_norm": 0.76171875, "learning_rate": 5.705128205128205e-05, "loss": 0.9798, "step": 8875 }, { "epoch": 2.200247218788628, "grad_norm": 0.7890625, "learning_rate": 5.692307692307692e-05, "loss": 0.9766, "step": 8900 }, { "epoch": 2.206427688504326, "grad_norm": 0.8515625, "learning_rate": 5.679487179487179e-05, "loss": 0.9814, "step": 8925 }, { "epoch": 2.2126081582200245, "grad_norm": 0.8203125, "learning_rate": 5.666666666666667e-05, "loss": 0.9784, "step": 8950 }, { "epoch": 2.218788627935723, "grad_norm": 0.8515625, "learning_rate": 5.653846153846154e-05, "loss": 0.9749, "step": 8975 }, { "epoch": 2.2249690976514214, "grad_norm": 0.8046875, "learning_rate": 5.6410256410256414e-05, "loss": 0.9814, "step": 9000 }, { "epoch": 2.23114956736712, "grad_norm": 0.7578125, "learning_rate": 5.6282051282051286e-05, "loss": 0.9805, "step": 9025 }, { "epoch": 2.237330037082818, "grad_norm": 0.80859375, "learning_rate": 5.615384615384616e-05, "loss": 0.9749, "step": 9050 }, { "epoch": 2.2435105067985166, "grad_norm": 0.76171875, "learning_rate": 5.602564102564103e-05, "loss": 0.9747, "step": 9075 }, { "epoch": 2.249690976514215, "grad_norm": 0.78515625, "learning_rate": 5.58974358974359e-05, "loss": 0.9755, "step": 9100 }, { "epoch": 2.2558714462299134, "grad_norm": 0.78515625, "learning_rate": 5.576923076923077e-05, "loss": 0.9767, "step": 9125 }, { "epoch": 2.262051915945612, "grad_norm": 0.77734375, "learning_rate": 5.5641025641025644e-05, "loss": 0.9799, "step": 9150 }, { "epoch": 2.2682323856613102, "grad_norm": 0.75390625, "learning_rate": 5.5512820512820515e-05, "loss": 0.9807, "step": 9175 }, { "epoch": 2.2744128553770087, "grad_norm": 0.78125, "learning_rate": 5.538461538461539e-05, "loss": 0.9765, "step": 9200 }, { "epoch": 2.280593325092707, "grad_norm": 0.80078125, "learning_rate": 5.5256410256410265e-05, "loss": 0.9767, "step": 9225 }, { "epoch": 2.2867737948084055, "grad_norm": 0.79296875, "learning_rate": 5.512820512820514e-05, "loss": 0.9751, "step": 9250 }, { "epoch": 2.292954264524104, "grad_norm": 0.78515625, "learning_rate": 5.500000000000001e-05, "loss": 0.9749, "step": 9275 }, { "epoch": 2.2991347342398023, "grad_norm": 0.76953125, "learning_rate": 5.487179487179488e-05, "loss": 0.975, "step": 9300 }, { "epoch": 2.3053152039555007, "grad_norm": 0.78125, "learning_rate": 5.474358974358975e-05, "loss": 0.9754, "step": 9325 }, { "epoch": 2.311495673671199, "grad_norm": 0.796875, "learning_rate": 5.461538461538461e-05, "loss": 0.9721, "step": 9350 }, { "epoch": 2.3176761433868975, "grad_norm": 0.76953125, "learning_rate": 5.448717948717948e-05, "loss": 0.9739, "step": 9375 }, { "epoch": 2.323856613102596, "grad_norm": 0.74609375, "learning_rate": 5.435897435897436e-05, "loss": 0.9759, "step": 9400 }, { "epoch": 2.3300370828182944, "grad_norm": 0.75390625, "learning_rate": 5.423076923076923e-05, "loss": 0.9716, "step": 9425 }, { "epoch": 2.3362175525339928, "grad_norm": 0.7734375, "learning_rate": 5.41025641025641e-05, "loss": 0.9757, "step": 9450 }, { "epoch": 2.342398022249691, "grad_norm": 0.73046875, "learning_rate": 5.3974358974358975e-05, "loss": 0.9723, "step": 9475 }, { "epoch": 2.348578491965389, "grad_norm": 0.80078125, "learning_rate": 5.384615384615385e-05, "loss": 0.973, "step": 9500 }, { "epoch": 2.354758961681088, "grad_norm": 0.77734375, "learning_rate": 5.371794871794872e-05, "loss": 0.9721, "step": 9525 }, { "epoch": 2.360939431396786, "grad_norm": 0.76171875, "learning_rate": 5.358974358974359e-05, "loss": 0.9762, "step": 9550 }, { "epoch": 2.3671199011124844, "grad_norm": 0.81640625, "learning_rate": 5.346153846153846e-05, "loss": 0.9781, "step": 9575 }, { "epoch": 2.373300370828183, "grad_norm": 0.8203125, "learning_rate": 5.333333333333333e-05, "loss": 0.9767, "step": 9600 }, { "epoch": 2.379480840543881, "grad_norm": 0.7578125, "learning_rate": 5.3205128205128205e-05, "loss": 0.974, "step": 9625 }, { "epoch": 2.3856613102595796, "grad_norm": 0.8046875, "learning_rate": 5.3076923076923076e-05, "loss": 0.9717, "step": 9650 }, { "epoch": 2.391841779975278, "grad_norm": 0.79296875, "learning_rate": 5.2948717948717955e-05, "loss": 0.9717, "step": 9675 }, { "epoch": 2.3980222496909764, "grad_norm": 0.828125, "learning_rate": 5.2820512820512826e-05, "loss": 0.9726, "step": 9700 }, { "epoch": 2.404202719406675, "grad_norm": 0.81640625, "learning_rate": 5.26923076923077e-05, "loss": 0.9746, "step": 9725 }, { "epoch": 2.4103831891223733, "grad_norm": 0.8203125, "learning_rate": 5.256410256410257e-05, "loss": 0.9739, "step": 9750 }, { "epoch": 2.4165636588380717, "grad_norm": 0.796875, "learning_rate": 5.243589743589744e-05, "loss": 0.9702, "step": 9775 }, { "epoch": 2.42274412855377, "grad_norm": 0.76171875, "learning_rate": 5.230769230769231e-05, "loss": 0.97, "step": 9800 }, { "epoch": 2.4289245982694685, "grad_norm": 0.796875, "learning_rate": 5.2179487179487185e-05, "loss": 0.9698, "step": 9825 }, { "epoch": 2.435105067985167, "grad_norm": 0.83984375, "learning_rate": 5.2051282051282056e-05, "loss": 0.9687, "step": 9850 }, { "epoch": 2.4412855377008653, "grad_norm": 0.79296875, "learning_rate": 5.192307692307693e-05, "loss": 0.9673, "step": 9875 }, { "epoch": 2.4474660074165637, "grad_norm": 0.82421875, "learning_rate": 5.17948717948718e-05, "loss": 0.9671, "step": 9900 }, { "epoch": 2.453646477132262, "grad_norm": 0.78515625, "learning_rate": 5.166666666666667e-05, "loss": 0.9685, "step": 9925 }, { "epoch": 2.4598269468479605, "grad_norm": 0.83203125, "learning_rate": 5.1538461538461536e-05, "loss": 0.9722, "step": 9950 }, { "epoch": 2.466007416563659, "grad_norm": 0.875, "learning_rate": 5.141025641025641e-05, "loss": 0.969, "step": 9975 }, { "epoch": 2.4721878862793574, "grad_norm": 0.80078125, "learning_rate": 5.128205128205128e-05, "loss": 0.9717, "step": 10000 }, { "epoch": 2.4721878862793574, "eval_loss": 1.0057789087295532, "eval_runtime": 1.5251, "eval_samples_per_second": 418.982, "eval_steps_per_second": 1.967, "step": 10000 }, { "epoch": 2.478368355995056, "grad_norm": 0.765625, "learning_rate": 5.115384615384615e-05, "loss": 0.9694, "step": 10025 }, { "epoch": 2.484548825710754, "grad_norm": 0.8046875, "learning_rate": 5.102564102564102e-05, "loss": 0.9696, "step": 10050 }, { "epoch": 2.490729295426452, "grad_norm": 0.734375, "learning_rate": 5.0897435897435894e-05, "loss": 0.9687, "step": 10075 }, { "epoch": 2.496909765142151, "grad_norm": 0.7578125, "learning_rate": 5.0769230769230766e-05, "loss": 0.964, "step": 10100 }, { "epoch": 2.503090234857849, "grad_norm": 0.82421875, "learning_rate": 5.0641025641025644e-05, "loss": 0.9691, "step": 10125 }, { "epoch": 2.509270704573548, "grad_norm": 0.7734375, "learning_rate": 5.0512820512820516e-05, "loss": 0.9649, "step": 10150 }, { "epoch": 2.515451174289246, "grad_norm": 0.7890625, "learning_rate": 5.038461538461539e-05, "loss": 0.9673, "step": 10175 }, { "epoch": 2.521631644004944, "grad_norm": 0.75390625, "learning_rate": 5.025641025641026e-05, "loss": 0.9669, "step": 10200 }, { "epoch": 2.5278121137206426, "grad_norm": 0.765625, "learning_rate": 5.012820512820513e-05, "loss": 0.9653, "step": 10225 }, { "epoch": 2.533992583436341, "grad_norm": 0.88671875, "learning_rate": 5e-05, "loss": 0.9682, "step": 10250 }, { "epoch": 2.5401730531520395, "grad_norm": 0.7734375, "learning_rate": 4.9871794871794874e-05, "loss": 0.9615, "step": 10275 }, { "epoch": 2.546353522867738, "grad_norm": 0.765625, "learning_rate": 4.9743589743589746e-05, "loss": 0.9618, "step": 10300 }, { "epoch": 2.5525339925834363, "grad_norm": 0.796875, "learning_rate": 4.961538461538462e-05, "loss": 0.9633, "step": 10325 }, { "epoch": 2.5587144622991347, "grad_norm": 0.74609375, "learning_rate": 4.948717948717949e-05, "loss": 0.9624, "step": 10350 }, { "epoch": 2.564894932014833, "grad_norm": 0.75, "learning_rate": 4.935897435897436e-05, "loss": 0.9645, "step": 10375 }, { "epoch": 2.5710754017305315, "grad_norm": 0.73046875, "learning_rate": 4.923076923076924e-05, "loss": 0.9594, "step": 10400 }, { "epoch": 2.57725587144623, "grad_norm": 0.70703125, "learning_rate": 4.9102564102564104e-05, "loss": 0.9652, "step": 10425 }, { "epoch": 2.5834363411619283, "grad_norm": 0.7734375, "learning_rate": 4.8974358974358975e-05, "loss": 0.9606, "step": 10450 }, { "epoch": 2.5896168108776267, "grad_norm": 0.7890625, "learning_rate": 4.884615384615385e-05, "loss": 0.9639, "step": 10475 }, { "epoch": 2.595797280593325, "grad_norm": 0.796875, "learning_rate": 4.871794871794872e-05, "loss": 0.9615, "step": 10500 }, { "epoch": 2.6019777503090236, "grad_norm": 0.80078125, "learning_rate": 4.858974358974359e-05, "loss": 0.961, "step": 10525 }, { "epoch": 2.608158220024722, "grad_norm": 0.73046875, "learning_rate": 4.846153846153846e-05, "loss": 0.9608, "step": 10550 }, { "epoch": 2.6143386897404204, "grad_norm": 0.83203125, "learning_rate": 4.8333333333333334e-05, "loss": 0.9605, "step": 10575 }, { "epoch": 2.620519159456119, "grad_norm": 0.7421875, "learning_rate": 4.8205128205128205e-05, "loss": 0.9572, "step": 10600 }, { "epoch": 2.626699629171817, "grad_norm": 0.78125, "learning_rate": 4.8076923076923084e-05, "loss": 0.961, "step": 10625 }, { "epoch": 2.632880098887515, "grad_norm": 0.84765625, "learning_rate": 4.7948717948717955e-05, "loss": 0.9538, "step": 10650 }, { "epoch": 2.639060568603214, "grad_norm": 0.84765625, "learning_rate": 4.782051282051283e-05, "loss": 0.9529, "step": 10675 }, { "epoch": 2.645241038318912, "grad_norm": 0.8359375, "learning_rate": 4.76923076923077e-05, "loss": 0.9579, "step": 10700 }, { "epoch": 2.651421508034611, "grad_norm": 0.73046875, "learning_rate": 4.7564102564102563e-05, "loss": 0.956, "step": 10725 }, { "epoch": 2.657601977750309, "grad_norm": 0.76953125, "learning_rate": 4.7435897435897435e-05, "loss": 0.9592, "step": 10750 }, { "epoch": 2.6637824474660077, "grad_norm": 0.76953125, "learning_rate": 4.730769230769231e-05, "loss": 0.9578, "step": 10775 }, { "epoch": 2.6699629171817056, "grad_norm": 0.84765625, "learning_rate": 4.717948717948718e-05, "loss": 0.9579, "step": 10800 }, { "epoch": 2.676143386897404, "grad_norm": 0.76953125, "learning_rate": 4.705128205128205e-05, "loss": 0.9561, "step": 10825 }, { "epoch": 2.6823238566131025, "grad_norm": 0.70703125, "learning_rate": 4.692307692307693e-05, "loss": 0.9567, "step": 10850 }, { "epoch": 2.688504326328801, "grad_norm": 0.796875, "learning_rate": 4.67948717948718e-05, "loss": 0.956, "step": 10875 }, { "epoch": 2.6946847960444993, "grad_norm": 0.78515625, "learning_rate": 4.666666666666667e-05, "loss": 0.9567, "step": 10900 }, { "epoch": 2.7008652657601977, "grad_norm": 0.80078125, "learning_rate": 4.653846153846154e-05, "loss": 0.9514, "step": 10925 }, { "epoch": 2.707045735475896, "grad_norm": 0.74609375, "learning_rate": 4.6410256410256415e-05, "loss": 0.9535, "step": 10950 }, { "epoch": 2.7132262051915945, "grad_norm": 0.87890625, "learning_rate": 4.6282051282051287e-05, "loss": 0.9571, "step": 10975 }, { "epoch": 2.719406674907293, "grad_norm": 0.79296875, "learning_rate": 4.615384615384616e-05, "loss": 0.9511, "step": 11000 }, { "epoch": 2.7255871446229913, "grad_norm": 0.8046875, "learning_rate": 4.602564102564102e-05, "loss": 0.9511, "step": 11025 }, { "epoch": 2.7317676143386898, "grad_norm": 0.78515625, "learning_rate": 4.5897435897435895e-05, "loss": 0.9554, "step": 11050 }, { "epoch": 2.737948084054388, "grad_norm": 0.80859375, "learning_rate": 4.576923076923077e-05, "loss": 0.95, "step": 11075 }, { "epoch": 2.7441285537700866, "grad_norm": 0.7890625, "learning_rate": 4.5641025641025645e-05, "loss": 0.951, "step": 11100 }, { "epoch": 2.750309023485785, "grad_norm": 0.71875, "learning_rate": 4.5512820512820516e-05, "loss": 0.9475, "step": 11125 }, { "epoch": 2.7564894932014834, "grad_norm": 0.796875, "learning_rate": 4.538461538461539e-05, "loss": 0.9511, "step": 11150 }, { "epoch": 2.762669962917182, "grad_norm": 0.75, "learning_rate": 4.525641025641026e-05, "loss": 0.9489, "step": 11175 }, { "epoch": 2.76885043263288, "grad_norm": 0.73046875, "learning_rate": 4.512820512820513e-05, "loss": 0.9511, "step": 11200 }, { "epoch": 2.7750309023485786, "grad_norm": 0.734375, "learning_rate": 4.5e-05, "loss": 0.951, "step": 11225 }, { "epoch": 2.781211372064277, "grad_norm": 0.765625, "learning_rate": 4.4871794871794874e-05, "loss": 0.9532, "step": 11250 }, { "epoch": 2.787391841779975, "grad_norm": 0.7890625, "learning_rate": 4.4743589743589746e-05, "loss": 0.9517, "step": 11275 }, { "epoch": 2.793572311495674, "grad_norm": 0.76171875, "learning_rate": 4.461538461538462e-05, "loss": 0.9479, "step": 11300 }, { "epoch": 2.799752781211372, "grad_norm": 0.7734375, "learning_rate": 4.448717948717949e-05, "loss": 0.9493, "step": 11325 }, { "epoch": 2.8059332509270707, "grad_norm": 0.76171875, "learning_rate": 4.435897435897436e-05, "loss": 0.9489, "step": 11350 }, { "epoch": 2.8121137206427687, "grad_norm": 0.76953125, "learning_rate": 4.423076923076923e-05, "loss": 0.945, "step": 11375 }, { "epoch": 2.8182941903584675, "grad_norm": 0.7890625, "learning_rate": 4.4102564102564104e-05, "loss": 0.9511, "step": 11400 }, { "epoch": 2.8244746600741655, "grad_norm": 0.7734375, "learning_rate": 4.3974358974358976e-05, "loss": 0.9455, "step": 11425 }, { "epoch": 2.830655129789864, "grad_norm": 0.78125, "learning_rate": 4.384615384615385e-05, "loss": 0.9468, "step": 11450 }, { "epoch": 2.8368355995055623, "grad_norm": 0.796875, "learning_rate": 4.371794871794872e-05, "loss": 0.9478, "step": 11475 }, { "epoch": 2.8430160692212607, "grad_norm": 0.76953125, "learning_rate": 4.358974358974359e-05, "loss": 0.9448, "step": 11500 }, { "epoch": 2.849196538936959, "grad_norm": 0.78125, "learning_rate": 4.346153846153846e-05, "loss": 0.9439, "step": 11525 }, { "epoch": 2.8553770086526575, "grad_norm": 0.703125, "learning_rate": 4.3333333333333334e-05, "loss": 0.948, "step": 11550 }, { "epoch": 2.861557478368356, "grad_norm": 0.7578125, "learning_rate": 4.320512820512821e-05, "loss": 0.9465, "step": 11575 }, { "epoch": 2.8677379480840544, "grad_norm": 0.73046875, "learning_rate": 4.3076923076923084e-05, "loss": 0.9469, "step": 11600 }, { "epoch": 2.8739184177997528, "grad_norm": 0.82421875, "learning_rate": 4.294871794871795e-05, "loss": 0.9417, "step": 11625 }, { "epoch": 2.880098887515451, "grad_norm": 0.6953125, "learning_rate": 4.282051282051282e-05, "loss": 0.9431, "step": 11650 }, { "epoch": 2.8862793572311496, "grad_norm": 0.74609375, "learning_rate": 4.269230769230769e-05, "loss": 0.9446, "step": 11675 }, { "epoch": 2.892459826946848, "grad_norm": 0.734375, "learning_rate": 4.2564102564102564e-05, "loss": 0.943, "step": 11700 }, { "epoch": 2.8986402966625464, "grad_norm": 0.76171875, "learning_rate": 4.2435897435897435e-05, "loss": 0.9467, "step": 11725 }, { "epoch": 2.904820766378245, "grad_norm": 0.7265625, "learning_rate": 4.230769230769231e-05, "loss": 0.9489, "step": 11750 }, { "epoch": 2.9110012360939432, "grad_norm": 0.72265625, "learning_rate": 4.217948717948718e-05, "loss": 0.9435, "step": 11775 }, { "epoch": 2.9171817058096416, "grad_norm": 0.734375, "learning_rate": 4.205128205128206e-05, "loss": 0.9405, "step": 11800 }, { "epoch": 2.92336217552534, "grad_norm": 0.74609375, "learning_rate": 4.192307692307693e-05, "loss": 0.9422, "step": 11825 }, { "epoch": 2.9295426452410385, "grad_norm": 0.73828125, "learning_rate": 4.17948717948718e-05, "loss": 0.9421, "step": 11850 }, { "epoch": 2.935723114956737, "grad_norm": 0.7578125, "learning_rate": 4.166666666666667e-05, "loss": 0.9394, "step": 11875 }, { "epoch": 2.941903584672435, "grad_norm": 0.7734375, "learning_rate": 4.1538461538461544e-05, "loss": 0.9416, "step": 11900 }, { "epoch": 2.9480840543881337, "grad_norm": 0.71484375, "learning_rate": 4.1410256410256415e-05, "loss": 0.9412, "step": 11925 }, { "epoch": 2.9542645241038317, "grad_norm": 0.76953125, "learning_rate": 4.128205128205128e-05, "loss": 0.9399, "step": 11950 }, { "epoch": 2.9604449938195305, "grad_norm": 0.70703125, "learning_rate": 4.115384615384615e-05, "loss": 0.9396, "step": 11975 }, { "epoch": 2.9666254635352285, "grad_norm": 0.765625, "learning_rate": 4.1025641025641023e-05, "loss": 0.9366, "step": 12000 }, { "epoch": 2.9728059332509273, "grad_norm": 0.73828125, "learning_rate": 4.0897435897435895e-05, "loss": 0.9368, "step": 12025 }, { "epoch": 2.9789864029666253, "grad_norm": 0.72265625, "learning_rate": 4.0769230769230773e-05, "loss": 0.9392, "step": 12050 }, { "epoch": 2.9851668726823237, "grad_norm": 0.77734375, "learning_rate": 4.0641025641025645e-05, "loss": 0.9397, "step": 12075 }, { "epoch": 2.991347342398022, "grad_norm": 0.7421875, "learning_rate": 4.051282051282052e-05, "loss": 0.9385, "step": 12100 }, { "epoch": 2.9975278121137205, "grad_norm": 0.7734375, "learning_rate": 4.038461538461539e-05, "loss": 0.9397, "step": 12125 }, { "epoch": 3.003708281829419, "grad_norm": 0.84375, "learning_rate": 4.025641025641026e-05, "loss": 0.8884, "step": 12150 }, { "epoch": 3.0098887515451174, "grad_norm": 0.75, "learning_rate": 4.012820512820513e-05, "loss": 0.8596, "step": 12175 }, { "epoch": 3.016069221260816, "grad_norm": 0.765625, "learning_rate": 4e-05, "loss": 0.8606, "step": 12200 }, { "epoch": 3.022249690976514, "grad_norm": 0.765625, "learning_rate": 3.9871794871794875e-05, "loss": 0.8623, "step": 12225 }, { "epoch": 3.0284301606922126, "grad_norm": 0.7734375, "learning_rate": 3.974358974358974e-05, "loss": 0.8606, "step": 12250 }, { "epoch": 3.034610630407911, "grad_norm": 0.78125, "learning_rate": 3.961538461538462e-05, "loss": 0.8626, "step": 12275 }, { "epoch": 3.0407911001236094, "grad_norm": 0.72265625, "learning_rate": 3.948717948717949e-05, "loss": 0.8634, "step": 12300 }, { "epoch": 3.046971569839308, "grad_norm": 0.76171875, "learning_rate": 3.935897435897436e-05, "loss": 0.8615, "step": 12325 }, { "epoch": 3.0531520395550062, "grad_norm": 0.75, "learning_rate": 3.923076923076923e-05, "loss": 0.8622, "step": 12350 }, { "epoch": 3.0593325092707047, "grad_norm": 0.75390625, "learning_rate": 3.9102564102564105e-05, "loss": 0.8627, "step": 12375 }, { "epoch": 3.065512978986403, "grad_norm": 0.7578125, "learning_rate": 3.8974358974358976e-05, "loss": 0.8603, "step": 12400 }, { "epoch": 3.0716934487021015, "grad_norm": 0.76171875, "learning_rate": 3.884615384615385e-05, "loss": 0.8676, "step": 12425 }, { "epoch": 3.0778739184178, "grad_norm": 0.7421875, "learning_rate": 3.871794871794872e-05, "loss": 0.8622, "step": 12450 }, { "epoch": 3.0840543881334983, "grad_norm": 0.80078125, "learning_rate": 3.858974358974359e-05, "loss": 0.8625, "step": 12475 }, { "epoch": 3.0902348578491967, "grad_norm": 0.75390625, "learning_rate": 3.846153846153846e-05, "loss": 0.8649, "step": 12500 }, { "epoch": 3.096415327564895, "grad_norm": 0.76953125, "learning_rate": 3.8333333333333334e-05, "loss": 0.8663, "step": 12525 }, { "epoch": 3.1025957972805935, "grad_norm": 0.7734375, "learning_rate": 3.8205128205128206e-05, "loss": 0.8633, "step": 12550 }, { "epoch": 3.1087762669962915, "grad_norm": 0.7578125, "learning_rate": 3.807692307692308e-05, "loss": 0.8652, "step": 12575 }, { "epoch": 3.11495673671199, "grad_norm": 0.74609375, "learning_rate": 3.794871794871795e-05, "loss": 0.8671, "step": 12600 }, { "epoch": 3.1211372064276883, "grad_norm": 0.7734375, "learning_rate": 3.782051282051282e-05, "loss": 0.8633, "step": 12625 }, { "epoch": 3.1273176761433867, "grad_norm": 0.72265625, "learning_rate": 3.769230769230769e-05, "loss": 0.8672, "step": 12650 }, { "epoch": 3.133498145859085, "grad_norm": 0.7421875, "learning_rate": 3.7564102564102564e-05, "loss": 0.868, "step": 12675 }, { "epoch": 3.1396786155747836, "grad_norm": 0.734375, "learning_rate": 3.7435897435897436e-05, "loss": 0.8656, "step": 12700 }, { "epoch": 3.145859085290482, "grad_norm": 0.73828125, "learning_rate": 3.730769230769231e-05, "loss": 0.8663, "step": 12725 }, { "epoch": 3.1520395550061804, "grad_norm": 0.765625, "learning_rate": 3.717948717948718e-05, "loss": 0.8679, "step": 12750 }, { "epoch": 3.158220024721879, "grad_norm": 0.7734375, "learning_rate": 3.705128205128206e-05, "loss": 0.867, "step": 12775 }, { "epoch": 3.164400494437577, "grad_norm": 0.7265625, "learning_rate": 3.692307692307693e-05, "loss": 0.8685, "step": 12800 }, { "epoch": 3.1705809641532756, "grad_norm": 0.76953125, "learning_rate": 3.67948717948718e-05, "loss": 0.8673, "step": 12825 }, { "epoch": 3.176761433868974, "grad_norm": 0.74609375, "learning_rate": 3.6666666666666666e-05, "loss": 0.8688, "step": 12850 }, { "epoch": 3.1829419035846724, "grad_norm": 0.75390625, "learning_rate": 3.653846153846154e-05, "loss": 0.8638, "step": 12875 }, { "epoch": 3.189122373300371, "grad_norm": 0.7734375, "learning_rate": 3.641025641025641e-05, "loss": 0.87, "step": 12900 }, { "epoch": 3.1953028430160693, "grad_norm": 0.72265625, "learning_rate": 3.628205128205128e-05, "loss": 0.8717, "step": 12925 }, { "epoch": 3.2014833127317677, "grad_norm": 0.734375, "learning_rate": 3.615384615384615e-05, "loss": 0.863, "step": 12950 }, { "epoch": 3.207663782447466, "grad_norm": 0.75390625, "learning_rate": 3.6025641025641024e-05, "loss": 0.8704, "step": 12975 }, { "epoch": 3.2138442521631645, "grad_norm": 0.73828125, "learning_rate": 3.58974358974359e-05, "loss": 0.8695, "step": 13000 }, { "epoch": 3.220024721878863, "grad_norm": 0.73046875, "learning_rate": 3.5769230769230774e-05, "loss": 0.8695, "step": 13025 }, { "epoch": 3.2262051915945613, "grad_norm": 0.734375, "learning_rate": 3.5641025641025646e-05, "loss": 0.8668, "step": 13050 }, { "epoch": 3.2323856613102597, "grad_norm": 0.77734375, "learning_rate": 3.551282051282052e-05, "loss": 0.8703, "step": 13075 }, { "epoch": 3.238566131025958, "grad_norm": 0.72265625, "learning_rate": 3.538461538461539e-05, "loss": 0.8671, "step": 13100 }, { "epoch": 3.2447466007416566, "grad_norm": 0.74609375, "learning_rate": 3.525641025641026e-05, "loss": 0.8677, "step": 13125 }, { "epoch": 3.2509270704573545, "grad_norm": 0.7109375, "learning_rate": 3.5128205128205125e-05, "loss": 0.8671, "step": 13150 }, { "epoch": 3.2571075401730534, "grad_norm": 0.71875, "learning_rate": 3.5e-05, "loss": 0.8682, "step": 13175 }, { "epoch": 3.2632880098887513, "grad_norm": 0.7734375, "learning_rate": 3.487179487179487e-05, "loss": 0.8677, "step": 13200 }, { "epoch": 3.2694684796044498, "grad_norm": 0.703125, "learning_rate": 3.474358974358975e-05, "loss": 0.871, "step": 13225 }, { "epoch": 3.275648949320148, "grad_norm": 0.78125, "learning_rate": 3.461538461538462e-05, "loss": 0.8682, "step": 13250 }, { "epoch": 3.2818294190358466, "grad_norm": 0.74609375, "learning_rate": 3.448717948717949e-05, "loss": 0.8653, "step": 13275 }, { "epoch": 3.288009888751545, "grad_norm": 0.7265625, "learning_rate": 3.435897435897436e-05, "loss": 0.8697, "step": 13300 }, { "epoch": 3.2941903584672434, "grad_norm": 0.796875, "learning_rate": 3.4230769230769234e-05, "loss": 0.8697, "step": 13325 }, { "epoch": 3.300370828182942, "grad_norm": 0.7578125, "learning_rate": 3.4102564102564105e-05, "loss": 0.869, "step": 13350 }, { "epoch": 3.30655129789864, "grad_norm": 0.73046875, "learning_rate": 3.397435897435898e-05, "loss": 0.8687, "step": 13375 }, { "epoch": 3.3127317676143386, "grad_norm": 0.78125, "learning_rate": 3.384615384615385e-05, "loss": 0.8692, "step": 13400 }, { "epoch": 3.318912237330037, "grad_norm": 0.74609375, "learning_rate": 3.371794871794872e-05, "loss": 0.8687, "step": 13425 }, { "epoch": 3.3250927070457355, "grad_norm": 0.7734375, "learning_rate": 3.358974358974359e-05, "loss": 0.8708, "step": 13450 }, { "epoch": 3.331273176761434, "grad_norm": 0.74609375, "learning_rate": 3.346153846153846e-05, "loss": 0.8659, "step": 13475 }, { "epoch": 3.3374536464771323, "grad_norm": 0.7578125, "learning_rate": 3.3333333333333335e-05, "loss": 0.8662, "step": 13500 }, { "epoch": 3.3436341161928307, "grad_norm": 0.72265625, "learning_rate": 3.3205128205128207e-05, "loss": 0.8692, "step": 13525 }, { "epoch": 3.349814585908529, "grad_norm": 0.78515625, "learning_rate": 3.307692307692308e-05, "loss": 0.8677, "step": 13550 }, { "epoch": 3.3559950556242275, "grad_norm": 0.765625, "learning_rate": 3.294871794871795e-05, "loss": 0.8672, "step": 13575 }, { "epoch": 3.362175525339926, "grad_norm": 0.765625, "learning_rate": 3.282051282051282e-05, "loss": 0.8703, "step": 13600 }, { "epoch": 3.3683559950556243, "grad_norm": 0.76171875, "learning_rate": 3.269230769230769e-05, "loss": 0.8658, "step": 13625 }, { "epoch": 3.3745364647713227, "grad_norm": 0.75390625, "learning_rate": 3.2564102564102565e-05, "loss": 0.8679, "step": 13650 }, { "epoch": 3.380716934487021, "grad_norm": 0.76953125, "learning_rate": 3.2435897435897436e-05, "loss": 0.8654, "step": 13675 }, { "epoch": 3.3868974042027196, "grad_norm": 0.73828125, "learning_rate": 3.230769230769231e-05, "loss": 0.8662, "step": 13700 }, { "epoch": 3.393077873918418, "grad_norm": 0.73046875, "learning_rate": 3.2179487179487186e-05, "loss": 0.8654, "step": 13725 }, { "epoch": 3.3992583436341164, "grad_norm": 0.734375, "learning_rate": 3.205128205128206e-05, "loss": 0.8696, "step": 13750 }, { "epoch": 3.4054388133498144, "grad_norm": 0.7421875, "learning_rate": 3.192307692307692e-05, "loss": 0.8648, "step": 13775 }, { "epoch": 3.411619283065513, "grad_norm": 0.74609375, "learning_rate": 3.1794871794871795e-05, "loss": 0.8677, "step": 13800 }, { "epoch": 3.417799752781211, "grad_norm": 0.72265625, "learning_rate": 3.1666666666666666e-05, "loss": 0.8665, "step": 13825 }, { "epoch": 3.4239802224969096, "grad_norm": 0.74609375, "learning_rate": 3.153846153846154e-05, "loss": 0.8662, "step": 13850 }, { "epoch": 3.430160692212608, "grad_norm": 0.75390625, "learning_rate": 3.141025641025641e-05, "loss": 0.8658, "step": 13875 }, { "epoch": 3.4363411619283064, "grad_norm": 0.72265625, "learning_rate": 3.128205128205128e-05, "loss": 0.8652, "step": 13900 }, { "epoch": 3.442521631644005, "grad_norm": 0.75390625, "learning_rate": 3.115384615384615e-05, "loss": 0.8667, "step": 13925 }, { "epoch": 3.4487021013597032, "grad_norm": 0.73046875, "learning_rate": 3.102564102564103e-05, "loss": 0.8654, "step": 13950 }, { "epoch": 3.4548825710754016, "grad_norm": 0.7734375, "learning_rate": 3.08974358974359e-05, "loss": 0.8666, "step": 13975 }, { "epoch": 3.4610630407911, "grad_norm": 0.765625, "learning_rate": 3.0769230769230774e-05, "loss": 0.8712, "step": 14000 }, { "epoch": 3.4672435105067985, "grad_norm": 0.7265625, "learning_rate": 3.0641025641025646e-05, "loss": 0.867, "step": 14025 }, { "epoch": 3.473423980222497, "grad_norm": 0.72265625, "learning_rate": 3.0512820512820518e-05, "loss": 0.8635, "step": 14050 }, { "epoch": 3.4796044499381953, "grad_norm": 0.765625, "learning_rate": 3.0384615384615382e-05, "loss": 0.8686, "step": 14075 }, { "epoch": 3.4857849196538937, "grad_norm": 0.7265625, "learning_rate": 3.0256410256410257e-05, "loss": 0.8654, "step": 14100 }, { "epoch": 3.491965389369592, "grad_norm": 0.71875, "learning_rate": 3.012820512820513e-05, "loss": 0.869, "step": 14125 }, { "epoch": 3.4981458590852905, "grad_norm": 0.7265625, "learning_rate": 3e-05, "loss": 0.862, "step": 14150 }, { "epoch": 3.504326328800989, "grad_norm": 0.76171875, "learning_rate": 2.9871794871794872e-05, "loss": 0.866, "step": 14175 }, { "epoch": 3.5105067985166873, "grad_norm": 0.71875, "learning_rate": 2.9743589743589744e-05, "loss": 0.8701, "step": 14200 }, { "epoch": 3.5166872682323858, "grad_norm": 0.7734375, "learning_rate": 2.9615384615384616e-05, "loss": 0.8679, "step": 14225 }, { "epoch": 3.522867737948084, "grad_norm": 0.71484375, "learning_rate": 2.948717948717949e-05, "loss": 0.8639, "step": 14250 }, { "epoch": 3.5290482076637826, "grad_norm": 0.734375, "learning_rate": 2.9358974358974362e-05, "loss": 0.8674, "step": 14275 }, { "epoch": 3.535228677379481, "grad_norm": 0.7265625, "learning_rate": 2.9230769230769234e-05, "loss": 0.8639, "step": 14300 }, { "epoch": 3.5414091470951794, "grad_norm": 0.73828125, "learning_rate": 2.9102564102564106e-05, "loss": 0.8652, "step": 14325 }, { "epoch": 3.5475896168108774, "grad_norm": 0.7109375, "learning_rate": 2.8974358974358977e-05, "loss": 0.8687, "step": 14350 }, { "epoch": 3.553770086526576, "grad_norm": 0.75390625, "learning_rate": 2.8846153846153845e-05, "loss": 0.8664, "step": 14375 }, { "epoch": 3.559950556242274, "grad_norm": 0.703125, "learning_rate": 2.8717948717948717e-05, "loss": 0.8671, "step": 14400 }, { "epoch": 3.566131025957973, "grad_norm": 0.703125, "learning_rate": 2.858974358974359e-05, "loss": 0.8658, "step": 14425 }, { "epoch": 3.572311495673671, "grad_norm": 0.69921875, "learning_rate": 2.846153846153846e-05, "loss": 0.8598, "step": 14450 }, { "epoch": 3.57849196538937, "grad_norm": 0.74609375, "learning_rate": 2.8333333333333335e-05, "loss": 0.8643, "step": 14475 }, { "epoch": 3.584672435105068, "grad_norm": 0.7265625, "learning_rate": 2.8205128205128207e-05, "loss": 0.8641, "step": 14500 }, { "epoch": 3.5908529048207662, "grad_norm": 0.75, "learning_rate": 2.807692307692308e-05, "loss": 0.8665, "step": 14525 }, { "epoch": 3.5970333745364647, "grad_norm": 0.7265625, "learning_rate": 2.794871794871795e-05, "loss": 0.8649, "step": 14550 }, { "epoch": 3.603213844252163, "grad_norm": 0.73046875, "learning_rate": 2.7820512820512822e-05, "loss": 0.8671, "step": 14575 }, { "epoch": 3.6093943139678615, "grad_norm": 0.71484375, "learning_rate": 2.7692307692307694e-05, "loss": 0.8656, "step": 14600 }, { "epoch": 3.61557478368356, "grad_norm": 0.7265625, "learning_rate": 2.756410256410257e-05, "loss": 0.8642, "step": 14625 }, { "epoch": 3.6217552533992583, "grad_norm": 0.71875, "learning_rate": 2.743589743589744e-05, "loss": 0.8628, "step": 14650 }, { "epoch": 3.6279357231149567, "grad_norm": 0.73828125, "learning_rate": 2.7307692307692305e-05, "loss": 0.8648, "step": 14675 }, { "epoch": 3.634116192830655, "grad_norm": 0.7109375, "learning_rate": 2.717948717948718e-05, "loss": 0.8674, "step": 14700 }, { "epoch": 3.6402966625463535, "grad_norm": 0.71484375, "learning_rate": 2.705128205128205e-05, "loss": 0.8664, "step": 14725 }, { "epoch": 3.646477132262052, "grad_norm": 0.7421875, "learning_rate": 2.6923076923076923e-05, "loss": 0.8643, "step": 14750 }, { "epoch": 3.6526576019777504, "grad_norm": 0.7421875, "learning_rate": 2.6794871794871795e-05, "loss": 0.8679, "step": 14775 }, { "epoch": 3.6588380716934488, "grad_norm": 0.76171875, "learning_rate": 2.6666666666666667e-05, "loss": 0.8669, "step": 14800 }, { "epoch": 3.665018541409147, "grad_norm": 0.74609375, "learning_rate": 2.6538461538461538e-05, "loss": 0.8619, "step": 14825 }, { "epoch": 3.6711990111248456, "grad_norm": 0.72265625, "learning_rate": 2.6410256410256413e-05, "loss": 0.8685, "step": 14850 }, { "epoch": 3.677379480840544, "grad_norm": 0.7265625, "learning_rate": 2.6282051282051285e-05, "loss": 0.8666, "step": 14875 }, { "epoch": 3.6835599505562424, "grad_norm": 0.69921875, "learning_rate": 2.6153846153846157e-05, "loss": 0.8645, "step": 14900 }, { "epoch": 3.689740420271941, "grad_norm": 0.6875, "learning_rate": 2.6025641025641028e-05, "loss": 0.8657, "step": 14925 }, { "epoch": 3.6959208899876392, "grad_norm": 0.73828125, "learning_rate": 2.58974358974359e-05, "loss": 0.8611, "step": 14950 }, { "epoch": 3.702101359703337, "grad_norm": 0.71875, "learning_rate": 2.5769230769230768e-05, "loss": 0.8628, "step": 14975 }, { "epoch": 3.708281829419036, "grad_norm": 0.7265625, "learning_rate": 2.564102564102564e-05, "loss": 0.8643, "step": 15000 }, { "epoch": 3.708281829419036, "eval_loss": 0.9966387748718262, "eval_runtime": 1.5383, "eval_samples_per_second": 415.389, "eval_steps_per_second": 1.95, "step": 15000 }, { "epoch": 3.714462299134734, "grad_norm": 0.73046875, "learning_rate": 2.551282051282051e-05, "loss": 0.864, "step": 15025 }, { "epoch": 3.720642768850433, "grad_norm": 0.71484375, "learning_rate": 2.5384615384615383e-05, "loss": 0.8643, "step": 15050 }, { "epoch": 3.726823238566131, "grad_norm": 0.7578125, "learning_rate": 2.5256410256410258e-05, "loss": 0.8614, "step": 15075 }, { "epoch": 3.7330037082818293, "grad_norm": 0.78125, "learning_rate": 2.512820512820513e-05, "loss": 0.8694, "step": 15100 }, { "epoch": 3.7391841779975277, "grad_norm": 0.75, "learning_rate": 2.5e-05, "loss": 0.8666, "step": 15125 }, { "epoch": 3.745364647713226, "grad_norm": 0.734375, "learning_rate": 2.4871794871794873e-05, "loss": 0.8616, "step": 15150 }, { "epoch": 3.7515451174289245, "grad_norm": 0.71484375, "learning_rate": 2.4743589743589744e-05, "loss": 0.8646, "step": 15175 }, { "epoch": 3.757725587144623, "grad_norm": 0.734375, "learning_rate": 2.461538461538462e-05, "loss": 0.8623, "step": 15200 }, { "epoch": 3.7639060568603213, "grad_norm": 0.73046875, "learning_rate": 2.4487179487179488e-05, "loss": 0.8645, "step": 15225 }, { "epoch": 3.7700865265760197, "grad_norm": 0.71484375, "learning_rate": 2.435897435897436e-05, "loss": 0.8612, "step": 15250 }, { "epoch": 3.776266996291718, "grad_norm": 0.71875, "learning_rate": 2.423076923076923e-05, "loss": 0.8642, "step": 15275 }, { "epoch": 3.7824474660074165, "grad_norm": 0.734375, "learning_rate": 2.4102564102564103e-05, "loss": 0.8615, "step": 15300 }, { "epoch": 3.788627935723115, "grad_norm": 0.71875, "learning_rate": 2.3974358974358978e-05, "loss": 0.8603, "step": 15325 }, { "epoch": 3.7948084054388134, "grad_norm": 0.70703125, "learning_rate": 2.384615384615385e-05, "loss": 0.8675, "step": 15350 }, { "epoch": 3.800988875154512, "grad_norm": 0.68359375, "learning_rate": 2.3717948717948718e-05, "loss": 0.8655, "step": 15375 }, { "epoch": 3.80716934487021, "grad_norm": 0.71875, "learning_rate": 2.358974358974359e-05, "loss": 0.8632, "step": 15400 }, { "epoch": 3.8133498145859086, "grad_norm": 0.734375, "learning_rate": 2.3461538461538464e-05, "loss": 0.8599, "step": 15425 }, { "epoch": 3.819530284301607, "grad_norm": 0.75390625, "learning_rate": 2.3333333333333336e-05, "loss": 0.8656, "step": 15450 }, { "epoch": 3.8257107540173054, "grad_norm": 0.73046875, "learning_rate": 2.3205128205128207e-05, "loss": 0.8638, "step": 15475 }, { "epoch": 3.831891223733004, "grad_norm": 0.69140625, "learning_rate": 2.307692307692308e-05, "loss": 0.8621, "step": 15500 }, { "epoch": 3.8380716934487022, "grad_norm": 0.71484375, "learning_rate": 2.2948717948717947e-05, "loss": 0.8651, "step": 15525 }, { "epoch": 3.8442521631644007, "grad_norm": 0.7109375, "learning_rate": 2.2820512820512822e-05, "loss": 0.8607, "step": 15550 }, { "epoch": 3.850432632880099, "grad_norm": 0.6953125, "learning_rate": 2.2692307692307694e-05, "loss": 0.8632, "step": 15575 }, { "epoch": 3.856613102595797, "grad_norm": 0.69140625, "learning_rate": 2.2564102564102566e-05, "loss": 0.8632, "step": 15600 }, { "epoch": 3.862793572311496, "grad_norm": 0.69921875, "learning_rate": 2.2435897435897437e-05, "loss": 0.8631, "step": 15625 }, { "epoch": 3.868974042027194, "grad_norm": 0.67578125, "learning_rate": 2.230769230769231e-05, "loss": 0.8594, "step": 15650 }, { "epoch": 3.8751545117428927, "grad_norm": 0.69140625, "learning_rate": 2.217948717948718e-05, "loss": 0.8617, "step": 15675 }, { "epoch": 3.8813349814585907, "grad_norm": 0.72265625, "learning_rate": 2.2051282051282052e-05, "loss": 0.8617, "step": 15700 }, { "epoch": 3.887515451174289, "grad_norm": 0.75390625, "learning_rate": 2.1923076923076924e-05, "loss": 0.8629, "step": 15725 }, { "epoch": 3.8936959208899875, "grad_norm": 0.69921875, "learning_rate": 2.1794871794871795e-05, "loss": 0.8639, "step": 15750 }, { "epoch": 3.899876390605686, "grad_norm": 0.734375, "learning_rate": 2.1666666666666667e-05, "loss": 0.8616, "step": 15775 }, { "epoch": 3.9060568603213843, "grad_norm": 0.72265625, "learning_rate": 2.1538461538461542e-05, "loss": 0.865, "step": 15800 }, { "epoch": 3.9122373300370827, "grad_norm": 0.68359375, "learning_rate": 2.141025641025641e-05, "loss": 0.8609, "step": 15825 }, { "epoch": 3.918417799752781, "grad_norm": 0.6640625, "learning_rate": 2.1282051282051282e-05, "loss": 0.861, "step": 15850 }, { "epoch": 3.9245982694684796, "grad_norm": 0.7109375, "learning_rate": 2.1153846153846154e-05, "loss": 0.8613, "step": 15875 }, { "epoch": 3.930778739184178, "grad_norm": 0.69140625, "learning_rate": 2.102564102564103e-05, "loss": 0.8624, "step": 15900 }, { "epoch": 3.9369592088998764, "grad_norm": 0.6953125, "learning_rate": 2.08974358974359e-05, "loss": 0.8633, "step": 15925 }, { "epoch": 3.943139678615575, "grad_norm": 0.703125, "learning_rate": 2.0769230769230772e-05, "loss": 0.8589, "step": 15950 }, { "epoch": 3.949320148331273, "grad_norm": 0.70703125, "learning_rate": 2.064102564102564e-05, "loss": 0.8605, "step": 15975 }, { "epoch": 3.9555006180469716, "grad_norm": 0.70703125, "learning_rate": 2.0512820512820512e-05, "loss": 0.8612, "step": 16000 }, { "epoch": 3.96168108776267, "grad_norm": 0.6875, "learning_rate": 2.0384615384615387e-05, "loss": 0.8612, "step": 16025 }, { "epoch": 3.9678615574783684, "grad_norm": 0.72265625, "learning_rate": 2.025641025641026e-05, "loss": 0.861, "step": 16050 }, { "epoch": 3.974042027194067, "grad_norm": 0.73046875, "learning_rate": 2.012820512820513e-05, "loss": 0.8593, "step": 16075 }, { "epoch": 3.9802224969097653, "grad_norm": 0.69140625, "learning_rate": 2e-05, "loss": 0.8612, "step": 16100 }, { "epoch": 3.9864029666254637, "grad_norm": 0.71484375, "learning_rate": 1.987179487179487e-05, "loss": 0.8664, "step": 16125 }, { "epoch": 3.992583436341162, "grad_norm": 0.70703125, "learning_rate": 1.9743589743589745e-05, "loss": 0.8599, "step": 16150 }, { "epoch": 3.99876390605686, "grad_norm": 0.6953125, "learning_rate": 1.9615384615384617e-05, "loss": 0.8649, "step": 16175 }, { "epoch": 4.004944375772559, "grad_norm": 0.7109375, "learning_rate": 1.9487179487179488e-05, "loss": 0.8264, "step": 16200 }, { "epoch": 4.011124845488257, "grad_norm": 0.6875, "learning_rate": 1.935897435897436e-05, "loss": 0.8184, "step": 16225 }, { "epoch": 4.017305315203956, "grad_norm": 0.69140625, "learning_rate": 1.923076923076923e-05, "loss": 0.8214, "step": 16250 }, { "epoch": 4.023485784919654, "grad_norm": 0.69140625, "learning_rate": 1.9102564102564103e-05, "loss": 0.8166, "step": 16275 }, { "epoch": 4.0296662546353526, "grad_norm": 0.671875, "learning_rate": 1.8974358974358975e-05, "loss": 0.8191, "step": 16300 }, { "epoch": 4.0358467243510505, "grad_norm": 0.703125, "learning_rate": 1.8846153846153846e-05, "loss": 0.8194, "step": 16325 }, { "epoch": 4.042027194066749, "grad_norm": 0.69921875, "learning_rate": 1.8717948717948718e-05, "loss": 0.8189, "step": 16350 }, { "epoch": 4.048207663782447, "grad_norm": 0.70703125, "learning_rate": 1.858974358974359e-05, "loss": 0.822, "step": 16375 }, { "epoch": 4.054388133498146, "grad_norm": 0.73046875, "learning_rate": 1.8461538461538465e-05, "loss": 0.8146, "step": 16400 }, { "epoch": 4.060568603213844, "grad_norm": 0.6953125, "learning_rate": 1.8333333333333333e-05, "loss": 0.8168, "step": 16425 }, { "epoch": 4.066749072929543, "grad_norm": 0.6953125, "learning_rate": 1.8205128205128204e-05, "loss": 0.8221, "step": 16450 }, { "epoch": 4.072929542645241, "grad_norm": 0.7109375, "learning_rate": 1.8076923076923076e-05, "loss": 0.8179, "step": 16475 }, { "epoch": 4.07911001236094, "grad_norm": 0.6875, "learning_rate": 1.794871794871795e-05, "loss": 0.8182, "step": 16500 }, { "epoch": 4.085290482076638, "grad_norm": 0.73046875, "learning_rate": 1.7820512820512823e-05, "loss": 0.8145, "step": 16525 }, { "epoch": 4.091470951792337, "grad_norm": 0.703125, "learning_rate": 1.7692307692307694e-05, "loss": 0.8239, "step": 16550 }, { "epoch": 4.097651421508035, "grad_norm": 0.69140625, "learning_rate": 1.7564102564102563e-05, "loss": 0.8151, "step": 16575 }, { "epoch": 4.103831891223733, "grad_norm": 0.7109375, "learning_rate": 1.7435897435897434e-05, "loss": 0.8193, "step": 16600 }, { "epoch": 4.1100123609394315, "grad_norm": 0.6875, "learning_rate": 1.730769230769231e-05, "loss": 0.8193, "step": 16625 }, { "epoch": 4.116192830655129, "grad_norm": 0.69140625, "learning_rate": 1.717948717948718e-05, "loss": 0.8193, "step": 16650 }, { "epoch": 4.122373300370828, "grad_norm": 0.703125, "learning_rate": 1.7051282051282053e-05, "loss": 0.8214, "step": 16675 }, { "epoch": 4.128553770086526, "grad_norm": 0.69140625, "learning_rate": 1.6923076923076924e-05, "loss": 0.8211, "step": 16700 }, { "epoch": 4.134734239802225, "grad_norm": 0.6953125, "learning_rate": 1.6794871794871796e-05, "loss": 0.8213, "step": 16725 }, { "epoch": 4.140914709517923, "grad_norm": 0.7109375, "learning_rate": 1.6666666666666667e-05, "loss": 0.8209, "step": 16750 }, { "epoch": 4.147095179233622, "grad_norm": 0.6953125, "learning_rate": 1.653846153846154e-05, "loss": 0.8185, "step": 16775 }, { "epoch": 4.15327564894932, "grad_norm": 0.6875, "learning_rate": 1.641025641025641e-05, "loss": 0.8188, "step": 16800 }, { "epoch": 4.159456118665019, "grad_norm": 0.69140625, "learning_rate": 1.6282051282051282e-05, "loss": 0.8209, "step": 16825 }, { "epoch": 4.165636588380717, "grad_norm": 0.70703125, "learning_rate": 1.6153846153846154e-05, "loss": 0.82, "step": 16850 }, { "epoch": 4.171817058096416, "grad_norm": 0.6953125, "learning_rate": 1.602564102564103e-05, "loss": 0.8203, "step": 16875 }, { "epoch": 4.1779975278121135, "grad_norm": 0.71484375, "learning_rate": 1.5897435897435897e-05, "loss": 0.8175, "step": 16900 }, { "epoch": 4.184177997527812, "grad_norm": 0.71875, "learning_rate": 1.576923076923077e-05, "loss": 0.8207, "step": 16925 }, { "epoch": 4.19035846724351, "grad_norm": 0.6953125, "learning_rate": 1.564102564102564e-05, "loss": 0.8213, "step": 16950 }, { "epoch": 4.196538936959209, "grad_norm": 0.6953125, "learning_rate": 1.5512820512820516e-05, "loss": 0.8235, "step": 16975 }, { "epoch": 4.202719406674907, "grad_norm": 0.703125, "learning_rate": 1.5384615384615387e-05, "loss": 0.8166, "step": 17000 }, { "epoch": 4.208899876390606, "grad_norm": 0.6875, "learning_rate": 1.5256410256410259e-05, "loss": 0.8178, "step": 17025 }, { "epoch": 4.215080346106304, "grad_norm": 0.6875, "learning_rate": 1.5128205128205129e-05, "loss": 0.8214, "step": 17050 }, { "epoch": 4.221260815822003, "grad_norm": 0.70703125, "learning_rate": 1.5e-05, "loss": 0.8228, "step": 17075 }, { "epoch": 4.227441285537701, "grad_norm": 0.69140625, "learning_rate": 1.4871794871794872e-05, "loss": 0.8216, "step": 17100 }, { "epoch": 4.2336217552534, "grad_norm": 0.6796875, "learning_rate": 1.4743589743589745e-05, "loss": 0.8202, "step": 17125 }, { "epoch": 4.239802224969098, "grad_norm": 0.6875, "learning_rate": 1.4615384615384617e-05, "loss": 0.8245, "step": 17150 }, { "epoch": 4.2459826946847965, "grad_norm": 0.68359375, "learning_rate": 1.4487179487179489e-05, "loss": 0.8228, "step": 17175 }, { "epoch": 4.2521631644004945, "grad_norm": 0.703125, "learning_rate": 1.4358974358974359e-05, "loss": 0.8222, "step": 17200 }, { "epoch": 4.258343634116192, "grad_norm": 0.703125, "learning_rate": 1.423076923076923e-05, "loss": 0.8213, "step": 17225 }, { "epoch": 4.264524103831891, "grad_norm": 0.6953125, "learning_rate": 1.4102564102564104e-05, "loss": 0.8226, "step": 17250 }, { "epoch": 4.270704573547589, "grad_norm": 0.703125, "learning_rate": 1.3974358974358975e-05, "loss": 0.8171, "step": 17275 }, { "epoch": 4.276885043263288, "grad_norm": 0.70703125, "learning_rate": 1.3846153846153847e-05, "loss": 0.8161, "step": 17300 }, { "epoch": 4.283065512978986, "grad_norm": 0.70703125, "learning_rate": 1.371794871794872e-05, "loss": 0.8203, "step": 17325 }, { "epoch": 4.289245982694685, "grad_norm": 0.69921875, "learning_rate": 1.358974358974359e-05, "loss": 0.8184, "step": 17350 }, { "epoch": 4.295426452410383, "grad_norm": 0.7109375, "learning_rate": 1.3461538461538462e-05, "loss": 0.8223, "step": 17375 }, { "epoch": 4.301606922126082, "grad_norm": 0.68359375, "learning_rate": 1.3333333333333333e-05, "loss": 0.8207, "step": 17400 }, { "epoch": 4.30778739184178, "grad_norm": 0.68359375, "learning_rate": 1.3205128205128207e-05, "loss": 0.8233, "step": 17425 }, { "epoch": 4.313967861557479, "grad_norm": 0.69921875, "learning_rate": 1.3076923076923078e-05, "loss": 0.8211, "step": 17450 }, { "epoch": 4.3201483312731765, "grad_norm": 0.73046875, "learning_rate": 1.294871794871795e-05, "loss": 0.8193, "step": 17475 }, { "epoch": 4.326328800988875, "grad_norm": 0.6875, "learning_rate": 1.282051282051282e-05, "loss": 0.8207, "step": 17500 }, { "epoch": 4.332509270704573, "grad_norm": 0.6796875, "learning_rate": 1.2692307692307691e-05, "loss": 0.8247, "step": 17525 }, { "epoch": 4.338689740420272, "grad_norm": 0.69140625, "learning_rate": 1.2564102564102565e-05, "loss": 0.826, "step": 17550 }, { "epoch": 4.34487021013597, "grad_norm": 0.6953125, "learning_rate": 1.2435897435897436e-05, "loss": 0.82, "step": 17575 }, { "epoch": 4.351050679851669, "grad_norm": 0.6953125, "learning_rate": 1.230769230769231e-05, "loss": 0.8161, "step": 17600 }, { "epoch": 4.357231149567367, "grad_norm": 0.67578125, "learning_rate": 1.217948717948718e-05, "loss": 0.826, "step": 17625 }, { "epoch": 4.363411619283066, "grad_norm": 0.68359375, "learning_rate": 1.2051282051282051e-05, "loss": 0.8199, "step": 17650 }, { "epoch": 4.369592088998764, "grad_norm": 0.703125, "learning_rate": 1.1923076923076925e-05, "loss": 0.8251, "step": 17675 }, { "epoch": 4.375772558714463, "grad_norm": 0.6953125, "learning_rate": 1.1794871794871795e-05, "loss": 0.8209, "step": 17700 }, { "epoch": 4.381953028430161, "grad_norm": 0.6953125, "learning_rate": 1.1666666666666668e-05, "loss": 0.8206, "step": 17725 }, { "epoch": 4.388133498145859, "grad_norm": 0.6875, "learning_rate": 1.153846153846154e-05, "loss": 0.8209, "step": 17750 }, { "epoch": 4.3943139678615575, "grad_norm": 0.6875, "learning_rate": 1.1410256410256411e-05, "loss": 0.8218, "step": 17775 }, { "epoch": 4.400494437577256, "grad_norm": 0.68359375, "learning_rate": 1.1282051282051283e-05, "loss": 0.8216, "step": 17800 }, { "epoch": 4.406674907292954, "grad_norm": 0.6953125, "learning_rate": 1.1153846153846154e-05, "loss": 0.82, "step": 17825 }, { "epoch": 4.412855377008652, "grad_norm": 0.69921875, "learning_rate": 1.1025641025641026e-05, "loss": 0.8224, "step": 17850 }, { "epoch": 4.419035846724351, "grad_norm": 0.6953125, "learning_rate": 1.0897435897435898e-05, "loss": 0.8209, "step": 17875 }, { "epoch": 4.425216316440049, "grad_norm": 0.69140625, "learning_rate": 1.0769230769230771e-05, "loss": 0.8198, "step": 17900 }, { "epoch": 4.431396786155748, "grad_norm": 0.703125, "learning_rate": 1.0641025641025641e-05, "loss": 0.8259, "step": 17925 }, { "epoch": 4.437577255871446, "grad_norm": 0.6953125, "learning_rate": 1.0512820512820514e-05, "loss": 0.8211, "step": 17950 }, { "epoch": 4.443757725587145, "grad_norm": 0.66796875, "learning_rate": 1.0384615384615386e-05, "loss": 0.8213, "step": 17975 }, { "epoch": 4.449938195302843, "grad_norm": 0.703125, "learning_rate": 1.0256410256410256e-05, "loss": 0.8266, "step": 18000 }, { "epoch": 4.456118665018542, "grad_norm": 0.69921875, "learning_rate": 1.012820512820513e-05, "loss": 0.821, "step": 18025 }, { "epoch": 4.46229913473424, "grad_norm": 0.67578125, "learning_rate": 1e-05, "loss": 0.8223, "step": 18050 }, { "epoch": 4.468479604449938, "grad_norm": 0.6953125, "learning_rate": 9.871794871794872e-06, "loss": 0.8216, "step": 18075 }, { "epoch": 4.474660074165636, "grad_norm": 0.703125, "learning_rate": 9.743589743589744e-06, "loss": 0.8183, "step": 18100 }, { "epoch": 4.480840543881335, "grad_norm": 0.6953125, "learning_rate": 9.615384615384616e-06, "loss": 0.8243, "step": 18125 }, { "epoch": 4.487021013597033, "grad_norm": 0.71484375, "learning_rate": 9.487179487179487e-06, "loss": 0.8197, "step": 18150 }, { "epoch": 4.493201483312732, "grad_norm": 0.71484375, "learning_rate": 9.358974358974359e-06, "loss": 0.8243, "step": 18175 }, { "epoch": 4.49938195302843, "grad_norm": 0.703125, "learning_rate": 9.230769230769232e-06, "loss": 0.8211, "step": 18200 }, { "epoch": 4.505562422744129, "grad_norm": 0.6953125, "learning_rate": 9.102564102564102e-06, "loss": 0.8222, "step": 18225 }, { "epoch": 4.511742892459827, "grad_norm": 0.6953125, "learning_rate": 8.974358974358976e-06, "loss": 0.8225, "step": 18250 }, { "epoch": 4.517923362175526, "grad_norm": 0.69140625, "learning_rate": 8.846153846153847e-06, "loss": 0.8261, "step": 18275 }, { "epoch": 4.524103831891224, "grad_norm": 0.6796875, "learning_rate": 8.717948717948717e-06, "loss": 0.8234, "step": 18300 }, { "epoch": 4.5302843016069225, "grad_norm": 0.6796875, "learning_rate": 8.58974358974359e-06, "loss": 0.8213, "step": 18325 }, { "epoch": 4.5364647713226205, "grad_norm": 0.70703125, "learning_rate": 8.461538461538462e-06, "loss": 0.8272, "step": 18350 }, { "epoch": 4.5426452410383185, "grad_norm": 0.6796875, "learning_rate": 8.333333333333334e-06, "loss": 0.8223, "step": 18375 }, { "epoch": 4.548825710754017, "grad_norm": 0.70703125, "learning_rate": 8.205128205128205e-06, "loss": 0.8218, "step": 18400 }, { "epoch": 4.555006180469716, "grad_norm": 0.6875, "learning_rate": 8.076923076923077e-06, "loss": 0.8195, "step": 18425 }, { "epoch": 4.561186650185414, "grad_norm": 0.6640625, "learning_rate": 7.948717948717949e-06, "loss": 0.8222, "step": 18450 }, { "epoch": 4.567367119901112, "grad_norm": 0.69140625, "learning_rate": 7.82051282051282e-06, "loss": 0.8212, "step": 18475 }, { "epoch": 4.573547589616811, "grad_norm": 0.703125, "learning_rate": 7.692307692307694e-06, "loss": 0.8263, "step": 18500 }, { "epoch": 4.579728059332509, "grad_norm": 0.67578125, "learning_rate": 7.564102564102564e-06, "loss": 0.824, "step": 18525 }, { "epoch": 4.585908529048208, "grad_norm": 0.7109375, "learning_rate": 7.435897435897436e-06, "loss": 0.8189, "step": 18550 }, { "epoch": 4.592088998763906, "grad_norm": 0.6953125, "learning_rate": 7.3076923076923085e-06, "loss": 0.8213, "step": 18575 }, { "epoch": 4.598269468479605, "grad_norm": 0.6796875, "learning_rate": 7.179487179487179e-06, "loss": 0.8183, "step": 18600 }, { "epoch": 4.604449938195303, "grad_norm": 0.6875, "learning_rate": 7.051282051282052e-06, "loss": 0.8203, "step": 18625 }, { "epoch": 4.610630407911001, "grad_norm": 0.671875, "learning_rate": 6.923076923076923e-06, "loss": 0.819, "step": 18650 }, { "epoch": 4.616810877626699, "grad_norm": 0.73828125, "learning_rate": 6.794871794871795e-06, "loss": 0.8235, "step": 18675 }, { "epoch": 4.622991347342398, "grad_norm": 0.69140625, "learning_rate": 6.666666666666667e-06, "loss": 0.8229, "step": 18700 }, { "epoch": 4.629171817058096, "grad_norm": 0.6796875, "learning_rate": 6.538461538461539e-06, "loss": 0.8189, "step": 18725 }, { "epoch": 4.635352286773795, "grad_norm": 0.6875, "learning_rate": 6.41025641025641e-06, "loss": 0.8224, "step": 18750 }, { "epoch": 4.641532756489493, "grad_norm": 0.66796875, "learning_rate": 6.282051282051282e-06, "loss": 0.8225, "step": 18775 }, { "epoch": 4.647713226205192, "grad_norm": 0.67578125, "learning_rate": 6.153846153846155e-06, "loss": 0.8194, "step": 18800 }, { "epoch": 4.65389369592089, "grad_norm": 0.68359375, "learning_rate": 6.025641025641026e-06, "loss": 0.8225, "step": 18825 }, { "epoch": 4.660074165636589, "grad_norm": 0.6875, "learning_rate": 5.897435897435897e-06, "loss": 0.8169, "step": 18850 }, { "epoch": 4.666254635352287, "grad_norm": 0.6953125, "learning_rate": 5.76923076923077e-06, "loss": 0.8153, "step": 18875 }, { "epoch": 4.6724351050679855, "grad_norm": 0.671875, "learning_rate": 5.641025641025641e-06, "loss": 0.822, "step": 18900 }, { "epoch": 4.6786155747836835, "grad_norm": 0.671875, "learning_rate": 5.512820512820513e-06, "loss": 0.822, "step": 18925 }, { "epoch": 4.684796044499382, "grad_norm": 0.6796875, "learning_rate": 5.3846153846153855e-06, "loss": 0.8184, "step": 18950 }, { "epoch": 4.69097651421508, "grad_norm": 0.69921875, "learning_rate": 5.256410256410257e-06, "loss": 0.8247, "step": 18975 }, { "epoch": 4.697156983930778, "grad_norm": 0.67578125, "learning_rate": 5.128205128205128e-06, "loss": 0.8212, "step": 19000 }, { "epoch": 4.703337453646477, "grad_norm": 0.671875, "learning_rate": 5e-06, "loss": 0.8226, "step": 19025 }, { "epoch": 4.709517923362176, "grad_norm": 0.66796875, "learning_rate": 4.871794871794872e-06, "loss": 0.8206, "step": 19050 }, { "epoch": 4.715698393077874, "grad_norm": 0.6796875, "learning_rate": 4.743589743589744e-06, "loss": 0.8183, "step": 19075 }, { "epoch": 4.721878862793572, "grad_norm": 0.69140625, "learning_rate": 4.615384615384616e-06, "loss": 0.8184, "step": 19100 }, { "epoch": 4.728059332509271, "grad_norm": 0.70703125, "learning_rate": 4.487179487179488e-06, "loss": 0.8212, "step": 19125 }, { "epoch": 4.734239802224969, "grad_norm": 0.66796875, "learning_rate": 4.3589743589743586e-06, "loss": 0.8198, "step": 19150 }, { "epoch": 4.740420271940668, "grad_norm": 0.69921875, "learning_rate": 4.230769230769231e-06, "loss": 0.8235, "step": 19175 }, { "epoch": 4.746600741656366, "grad_norm": 0.6796875, "learning_rate": 4.102564102564103e-06, "loss": 0.8228, "step": 19200 }, { "epoch": 4.752781211372064, "grad_norm": 0.67578125, "learning_rate": 3.974358974358974e-06, "loss": 0.8278, "step": 19225 }, { "epoch": 4.758961681087762, "grad_norm": 0.66015625, "learning_rate": 3.846153846153847e-06, "loss": 0.817, "step": 19250 }, { "epoch": 4.765142150803461, "grad_norm": 0.6953125, "learning_rate": 3.717948717948718e-06, "loss": 0.8226, "step": 19275 }, { "epoch": 4.771322620519159, "grad_norm": 0.6796875, "learning_rate": 3.5897435897435896e-06, "loss": 0.8218, "step": 19300 }, { "epoch": 4.777503090234858, "grad_norm": 0.69921875, "learning_rate": 3.4615384615384617e-06, "loss": 0.8209, "step": 19325 }, { "epoch": 4.783683559950556, "grad_norm": 0.6875, "learning_rate": 3.3333333333333333e-06, "loss": 0.8231, "step": 19350 }, { "epoch": 4.789864029666255, "grad_norm": 0.6875, "learning_rate": 3.205128205128205e-06, "loss": 0.8199, "step": 19375 }, { "epoch": 4.796044499381953, "grad_norm": 0.68359375, "learning_rate": 3.0769230769230774e-06, "loss": 0.8214, "step": 19400 }, { "epoch": 4.802224969097652, "grad_norm": 0.6875, "learning_rate": 2.9487179487179486e-06, "loss": 0.8221, "step": 19425 }, { "epoch": 4.80840543881335, "grad_norm": 0.6796875, "learning_rate": 2.8205128205128207e-06, "loss": 0.8198, "step": 19450 }, { "epoch": 4.8145859085290486, "grad_norm": 0.68359375, "learning_rate": 2.6923076923076928e-06, "loss": 0.8195, "step": 19475 }, { "epoch": 4.8207663782447465, "grad_norm": 0.69140625, "learning_rate": 2.564102564102564e-06, "loss": 0.8217, "step": 19500 }, { "epoch": 4.826946847960445, "grad_norm": 0.6875, "learning_rate": 2.435897435897436e-06, "loss": 0.8185, "step": 19525 }, { "epoch": 4.833127317676143, "grad_norm": 0.68359375, "learning_rate": 2.307692307692308e-06, "loss": 0.8196, "step": 19550 }, { "epoch": 4.839307787391842, "grad_norm": 0.69140625, "learning_rate": 2.1794871794871793e-06, "loss": 0.8204, "step": 19575 }, { "epoch": 4.84548825710754, "grad_norm": 0.66796875, "learning_rate": 2.0512820512820513e-06, "loss": 0.8201, "step": 19600 }, { "epoch": 4.851668726823238, "grad_norm": 0.67578125, "learning_rate": 1.9230769230769234e-06, "loss": 0.8229, "step": 19625 }, { "epoch": 4.857849196538937, "grad_norm": 0.6640625, "learning_rate": 1.7948717948717948e-06, "loss": 0.8222, "step": 19650 }, { "epoch": 4.864029666254636, "grad_norm": 0.6796875, "learning_rate": 1.6666666666666667e-06, "loss": 0.8232, "step": 19675 }, { "epoch": 4.870210135970334, "grad_norm": 0.69140625, "learning_rate": 1.5384615384615387e-06, "loss": 0.8224, "step": 19700 }, { "epoch": 4.876390605686032, "grad_norm": 0.66796875, "learning_rate": 1.4102564102564104e-06, "loss": 0.8235, "step": 19725 }, { "epoch": 4.882571075401731, "grad_norm": 0.671875, "learning_rate": 1.282051282051282e-06, "loss": 0.8164, "step": 19750 }, { "epoch": 4.888751545117429, "grad_norm": 0.69140625, "learning_rate": 1.153846153846154e-06, "loss": 0.8217, "step": 19775 }, { "epoch": 4.8949320148331275, "grad_norm": 0.6875, "learning_rate": 1.0256410256410257e-06, "loss": 0.8185, "step": 19800 }, { "epoch": 4.901112484548825, "grad_norm": 0.66796875, "learning_rate": 8.974358974358974e-07, "loss": 0.8256, "step": 19825 }, { "epoch": 4.907292954264524, "grad_norm": 0.68359375, "learning_rate": 7.692307692307694e-07, "loss": 0.8235, "step": 19850 }, { "epoch": 4.913473423980222, "grad_norm": 0.67578125, "learning_rate": 6.41025641025641e-07, "loss": 0.8234, "step": 19875 }, { "epoch": 4.919653893695921, "grad_norm": 0.6953125, "learning_rate": 5.128205128205128e-07, "loss": 0.8208, "step": 19900 }, { "epoch": 4.925834363411619, "grad_norm": 0.68359375, "learning_rate": 3.846153846153847e-07, "loss": 0.8235, "step": 19925 }, { "epoch": 4.932014833127318, "grad_norm": 0.67578125, "learning_rate": 2.564102564102564e-07, "loss": 0.8218, "step": 19950 }, { "epoch": 4.938195302843016, "grad_norm": 0.67578125, "learning_rate": 1.282051282051282e-07, "loss": 0.8207, "step": 19975 }, { "epoch": 4.944375772558715, "grad_norm": 0.6796875, "learning_rate": 0.0, "loss": 0.8191, "step": 20000 }, { "epoch": 4.944375772558715, "eval_loss": 1.0042184591293335, "eval_runtime": 1.5308, "eval_samples_per_second": 417.441, "eval_steps_per_second": 1.96, "step": 20000 }, { "epoch": 4.944622991347343, "step": 20001, "total_flos": 9.058565045825516e+19, "train_loss": 4.2969823139893115e-05, "train_runtime": 7.0157, "train_samples_per_second": 729792.908, "train_steps_per_second": 2850.754 } ], "logging_steps": 25, "max_steps": 20000, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 5000, "total_flos": 9.058565045825516e+19, "train_batch_size": 32, "trial_name": null, "trial_params": null }