{ "best_metric": 7.215361500971087, "best_model_checkpoint": "./checkpoint-9000", "epoch": 5.048, "eval_steps": 500, "global_step": 9000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0025, "grad_norm": 6.131621360778809, "learning_rate": 2.1875e-07, "loss": 0.9345, "step": 25 }, { "epoch": 0.005, "grad_norm": 6.021520137786865, "learning_rate": 4.375e-07, "loss": 0.8231, "step": 50 }, { "epoch": 0.0075, "grad_norm": 5.526496410369873, "learning_rate": 6.5625e-07, "loss": 0.5623, "step": 75 }, { "epoch": 0.01, "grad_norm": 4.9277825355529785, "learning_rate": 8.75e-07, "loss": 0.4173, "step": 100 }, { "epoch": 0.0125, "grad_norm": 4.292990684509277, "learning_rate": 1.09375e-06, "loss": 0.385, "step": 125 }, { "epoch": 0.015, "grad_norm": 5.749295234680176, "learning_rate": 1.3125e-06, "loss": 0.3931, "step": 150 }, { "epoch": 0.0175, "grad_norm": 3.8306965827941895, "learning_rate": 1.5312499999999997e-06, "loss": 0.3516, "step": 175 }, { "epoch": 0.02, "grad_norm": 4.687748908996582, "learning_rate": 1.75e-06, "loss": 0.3235, "step": 200 }, { "epoch": 0.0225, "grad_norm": 4.232759952545166, "learning_rate": 1.96875e-06, "loss": 0.3314, "step": 225 }, { "epoch": 0.025, "grad_norm": 4.185751914978027, "learning_rate": 2.1875e-06, "loss": 0.309, "step": 250 }, { "epoch": 0.0275, "grad_norm": 4.818612098693848, "learning_rate": 2.40625e-06, "loss": 0.2991, "step": 275 }, { "epoch": 0.03, "grad_norm": 4.171736717224121, "learning_rate": 2.625e-06, "loss": 0.2832, "step": 300 }, { "epoch": 0.0325, "grad_norm": 5.217376708984375, "learning_rate": 2.8437499999999997e-06, "loss": 0.2873, "step": 325 }, { "epoch": 0.035, "grad_norm": 4.671106815338135, "learning_rate": 3.0624999999999995e-06, "loss": 0.2957, "step": 350 }, { "epoch": 0.0375, "grad_norm": 3.9175262451171875, "learning_rate": 3.2812499999999997e-06, "loss": 0.2634, "step": 375 }, { "epoch": 0.04, "grad_norm": 4.647582054138184, "learning_rate": 3.5e-06, "loss": 0.2541, "step": 400 }, { "epoch": 0.0425, "grad_norm": 3.25675368309021, "learning_rate": 3.7187499999999998e-06, "loss": 0.2244, "step": 425 }, { "epoch": 0.045, "grad_norm": 4.597206115722656, "learning_rate": 3.9375e-06, "loss": 0.2492, "step": 450 }, { "epoch": 0.0475, "grad_norm": 4.602332592010498, "learning_rate": 4.156249999999999e-06, "loss": 0.246, "step": 475 }, { "epoch": 0.05, "grad_norm": 3.6419622898101807, "learning_rate": 4.375e-06, "loss": 0.2208, "step": 500 }, { "epoch": 0.05, "eval_loss": 0.2592349350452423, "eval_runtime": 4116.5906, "eval_samples_per_second": 3.311, "eval_steps_per_second": 0.414, "eval_wer": 20.691487412510533, "step": 500 }, { "epoch": 0.0525, "grad_norm": 3.6599488258361816, "learning_rate": 4.363486842105263e-06, "loss": 0.2539, "step": 525 }, { "epoch": 0.055, "grad_norm": 3.6934616565704346, "learning_rate": 4.351973684210526e-06, "loss": 0.2313, "step": 550 }, { "epoch": 0.0575, "grad_norm": 3.7546138763427734, "learning_rate": 4.340460526315789e-06, "loss": 0.2272, "step": 575 }, { "epoch": 0.06, "grad_norm": 3.096877098083496, "learning_rate": 4.3289473684210525e-06, "loss": 0.2373, "step": 600 }, { "epoch": 0.0625, "grad_norm": 3.572812795639038, "learning_rate": 4.3174342105263155e-06, "loss": 0.2285, "step": 625 }, { "epoch": 0.065, "grad_norm": 3.3494396209716797, "learning_rate": 4.3059210526315785e-06, "loss": 0.2293, "step": 650 }, { "epoch": 0.0675, "grad_norm": 3.5156869888305664, "learning_rate": 4.2944078947368415e-06, "loss": 0.2063, "step": 675 }, { "epoch": 0.07, "grad_norm": 3.698807716369629, "learning_rate": 4.282894736842105e-06, "loss": 0.2113, "step": 700 }, { "epoch": 0.0725, "grad_norm": 3.716585636138916, "learning_rate": 4.271381578947368e-06, "loss": 0.2055, "step": 725 }, { "epoch": 0.075, "grad_norm": 4.204227924346924, "learning_rate": 4.2598684210526314e-06, "loss": 0.2114, "step": 750 }, { "epoch": 0.0775, "grad_norm": 3.479562282562256, "learning_rate": 4.2483552631578944e-06, "loss": 0.2224, "step": 775 }, { "epoch": 0.08, "grad_norm": 4.5203094482421875, "learning_rate": 4.2368421052631575e-06, "loss": 0.2523, "step": 800 }, { "epoch": 0.0825, "grad_norm": 3.6081738471984863, "learning_rate": 4.2253289473684205e-06, "loss": 0.2383, "step": 825 }, { "epoch": 0.085, "grad_norm": 3.2602758407592773, "learning_rate": 4.2138157894736835e-06, "loss": 0.1808, "step": 850 }, { "epoch": 0.0875, "grad_norm": 3.6786868572235107, "learning_rate": 4.202302631578947e-06, "loss": 0.1747, "step": 875 }, { "epoch": 0.09, "grad_norm": 3.1120803356170654, "learning_rate": 4.19078947368421e-06, "loss": 0.1662, "step": 900 }, { "epoch": 0.0925, "grad_norm": 3.1962203979492188, "learning_rate": 4.179276315789473e-06, "loss": 0.1771, "step": 925 }, { "epoch": 0.095, "grad_norm": 3.172363758087158, "learning_rate": 4.167763157894736e-06, "loss": 0.1751, "step": 950 }, { "epoch": 0.0975, "grad_norm": 2.4304590225219727, "learning_rate": 4.156249999999999e-06, "loss": 0.1701, "step": 975 }, { "epoch": 0.1, "grad_norm": 3.193345308303833, "learning_rate": 4.144736842105262e-06, "loss": 0.1489, "step": 1000 }, { "epoch": 0.1, "eval_loss": 0.1971057653427124, "eval_runtime": 4130.6867, "eval_samples_per_second": 3.3, "eval_steps_per_second": 0.413, "eval_wer": 14.68265601524424, "step": 1000 }, { "epoch": 0.1025, "grad_norm": 3.322065591812134, "learning_rate": 4.133223684210526e-06, "loss": 0.1701, "step": 1025 }, { "epoch": 0.105, "grad_norm": 3.5462722778320312, "learning_rate": 4.121710526315789e-06, "loss": 0.1875, "step": 1050 }, { "epoch": 0.1075, "grad_norm": 3.39326810836792, "learning_rate": 4.110197368421052e-06, "loss": 0.1506, "step": 1075 }, { "epoch": 0.11, "grad_norm": 2.9165821075439453, "learning_rate": 4.098684210526315e-06, "loss": 0.1525, "step": 1100 }, { "epoch": 0.1125, "grad_norm": 3.262007236480713, "learning_rate": 4.087171052631578e-06, "loss": 0.157, "step": 1125 }, { "epoch": 0.115, "grad_norm": 2.4523119926452637, "learning_rate": 4.075657894736842e-06, "loss": 0.1416, "step": 1150 }, { "epoch": 0.1175, "grad_norm": 2.7651798725128174, "learning_rate": 4.064144736842105e-06, "loss": 0.1527, "step": 1175 }, { "epoch": 0.12, "grad_norm": 3.609523296356201, "learning_rate": 4.052631578947368e-06, "loss": 0.1822, "step": 1200 }, { "epoch": 0.1225, "grad_norm": 3.8101985454559326, "learning_rate": 4.041118421052631e-06, "loss": 0.1703, "step": 1225 }, { "epoch": 0.125, "grad_norm": 3.8921287059783936, "learning_rate": 4.029605263157894e-06, "loss": 0.1924, "step": 1250 }, { "epoch": 0.1275, "grad_norm": 4.463279724121094, "learning_rate": 4.018092105263157e-06, "loss": 0.1818, "step": 1275 }, { "epoch": 0.13, "grad_norm": 3.6556308269500732, "learning_rate": 4.00657894736842e-06, "loss": 0.1726, "step": 1300 }, { "epoch": 0.1325, "grad_norm": 2.98067569732666, "learning_rate": 3.995065789473683e-06, "loss": 0.174, "step": 1325 }, { "epoch": 0.135, "grad_norm": 2.8287429809570312, "learning_rate": 3.983552631578947e-06, "loss": 0.1631, "step": 1350 }, { "epoch": 0.1375, "grad_norm": 2.6438794136047363, "learning_rate": 3.97203947368421e-06, "loss": 0.1475, "step": 1375 }, { "epoch": 0.14, "grad_norm": 3.513123035430908, "learning_rate": 3.960526315789473e-06, "loss": 0.1457, "step": 1400 }, { "epoch": 0.1425, "grad_norm": 2.4688916206359863, "learning_rate": 3.949013157894737e-06, "loss": 0.1375, "step": 1425 }, { "epoch": 0.145, "grad_norm": 4.005943775177002, "learning_rate": 3.9375e-06, "loss": 0.1623, "step": 1450 }, { "epoch": 0.1475, "grad_norm": 2.91786789894104, "learning_rate": 3.925986842105263e-06, "loss": 0.1701, "step": 1475 }, { "epoch": 0.15, "grad_norm": 3.5332415103912354, "learning_rate": 3.914473684210526e-06, "loss": 0.1973, "step": 1500 }, { "epoch": 0.15, "eval_loss": 0.17469166219234467, "eval_runtime": 4132.0041, "eval_samples_per_second": 3.299, "eval_steps_per_second": 0.412, "eval_wer": 12.377697973542453, "step": 1500 }, { "epoch": 0.1525, "grad_norm": 4.05070686340332, "learning_rate": 3.902960526315789e-06, "loss": 0.1796, "step": 1525 }, { "epoch": 0.155, "grad_norm": 2.989821195602417, "learning_rate": 3.891447368421052e-06, "loss": 0.1561, "step": 1550 }, { "epoch": 0.1575, "grad_norm": 2.9603219032287598, "learning_rate": 3.879934210526315e-06, "loss": 0.1609, "step": 1575 }, { "epoch": 0.16, "grad_norm": 3.2663583755493164, "learning_rate": 3.868421052631579e-06, "loss": 0.1833, "step": 1600 }, { "epoch": 0.1625, "grad_norm": 3.459775686264038, "learning_rate": 3.856907894736842e-06, "loss": 0.1727, "step": 1625 }, { "epoch": 0.165, "grad_norm": 3.427720069885254, "learning_rate": 3.845394736842105e-06, "loss": 0.181, "step": 1650 }, { "epoch": 0.1675, "grad_norm": 4.471118450164795, "learning_rate": 3.833881578947368e-06, "loss": 0.1536, "step": 1675 }, { "epoch": 0.17, "grad_norm": 3.1428306102752686, "learning_rate": 3.822368421052632e-06, "loss": 0.1372, "step": 1700 }, { "epoch": 1.0021, "grad_norm": 2.8270132541656494, "learning_rate": 3.8108552631578944e-06, "loss": 0.1454, "step": 1725 }, { "epoch": 1.0046, "grad_norm": 3.0873589515686035, "learning_rate": 3.799342105263158e-06, "loss": 0.1303, "step": 1750 }, { "epoch": 1.0071, "grad_norm": 3.187711000442505, "learning_rate": 3.787828947368421e-06, "loss": 0.1383, "step": 1775 }, { "epoch": 1.0096, "grad_norm": 3.1710643768310547, "learning_rate": 3.776315789473684e-06, "loss": 0.1626, "step": 1800 }, { "epoch": 1.0121, "grad_norm": 3.4516818523406982, "learning_rate": 3.7648026315789473e-06, "loss": 0.1405, "step": 1825 }, { "epoch": 1.0146, "grad_norm": 2.930408000946045, "learning_rate": 3.7532894736842103e-06, "loss": 0.143, "step": 1850 }, { "epoch": 1.0171, "grad_norm": 3.066941261291504, "learning_rate": 3.7417763157894733e-06, "loss": 0.1437, "step": 1875 }, { "epoch": 1.0196, "grad_norm": 3.389916181564331, "learning_rate": 3.7302631578947363e-06, "loss": 0.1289, "step": 1900 }, { "epoch": 1.0221, "grad_norm": 3.048574209213257, "learning_rate": 3.7187499999999998e-06, "loss": 0.1415, "step": 1925 }, { "epoch": 1.0246, "grad_norm": 2.5267295837402344, "learning_rate": 3.7072368421052628e-06, "loss": 0.1386, "step": 1950 }, { "epoch": 1.0271, "grad_norm": 3.151757001876831, "learning_rate": 3.6957236842105258e-06, "loss": 0.1436, "step": 1975 }, { "epoch": 1.0296, "grad_norm": 3.629039764404297, "learning_rate": 3.684210526315789e-06, "loss": 0.1353, "step": 2000 }, { "epoch": 1.0296, "eval_loss": 0.1527385264635086, "eval_runtime": 4116.9756, "eval_samples_per_second": 3.311, "eval_steps_per_second": 0.414, "eval_wer": 10.719520685990693, "step": 2000 }, { "epoch": 1.0321, "grad_norm": 1.8788173198699951, "learning_rate": 3.6726973684210522e-06, "loss": 0.1322, "step": 2025 }, { "epoch": 1.0346, "grad_norm": 2.587233066558838, "learning_rate": 3.6611842105263157e-06, "loss": 0.1176, "step": 2050 }, { "epoch": 1.0371, "grad_norm": 4.001532077789307, "learning_rate": 3.6496710526315787e-06, "loss": 0.1233, "step": 2075 }, { "epoch": 1.0396, "grad_norm": 3.3947739601135254, "learning_rate": 3.638157894736842e-06, "loss": 0.1188, "step": 2100 }, { "epoch": 1.0421, "grad_norm": 3.4743120670318604, "learning_rate": 3.626644736842105e-06, "loss": 0.1318, "step": 2125 }, { "epoch": 1.0446, "grad_norm": 2.9288718700408936, "learning_rate": 3.615131578947368e-06, "loss": 0.1224, "step": 2150 }, { "epoch": 1.0471, "grad_norm": 2.6081368923187256, "learning_rate": 3.603618421052631e-06, "loss": 0.1232, "step": 2175 }, { "epoch": 1.0496, "grad_norm": 2.4068429470062256, "learning_rate": 3.5921052631578946e-06, "loss": 0.1073, "step": 2200 }, { "epoch": 1.0521, "grad_norm": 3.049074411392212, "learning_rate": 3.5805921052631576e-06, "loss": 0.1071, "step": 2225 }, { "epoch": 1.0546, "grad_norm": 2.0809032917022705, "learning_rate": 3.5690789473684206e-06, "loss": 0.1217, "step": 2250 }, { "epoch": 1.0571, "grad_norm": 3.0854332447052, "learning_rate": 3.5575657894736836e-06, "loss": 0.1332, "step": 2275 }, { "epoch": 1.0596, "grad_norm": 3.580145835876465, "learning_rate": 3.546052631578947e-06, "loss": 0.131, "step": 2300 }, { "epoch": 1.0621, "grad_norm": 3.8924479484558105, "learning_rate": 3.53453947368421e-06, "loss": 0.136, "step": 2325 }, { "epoch": 1.0646, "grad_norm": 2.8398871421813965, "learning_rate": 3.523026315789473e-06, "loss": 0.1081, "step": 2350 }, { "epoch": 1.0671, "grad_norm": 3.007026195526123, "learning_rate": 3.511513157894737e-06, "loss": 0.1115, "step": 2375 }, { "epoch": 1.0695999999999999, "grad_norm": 1.5712552070617676, "learning_rate": 3.5e-06, "loss": 0.1183, "step": 2400 }, { "epoch": 1.0721, "grad_norm": 3.844963312149048, "learning_rate": 3.488486842105263e-06, "loss": 0.113, "step": 2425 }, { "epoch": 1.0746, "grad_norm": 2.8939759731292725, "learning_rate": 3.476973684210526e-06, "loss": 0.1115, "step": 2450 }, { "epoch": 1.0771, "grad_norm": 1.8150537014007568, "learning_rate": 3.4654605263157894e-06, "loss": 0.1117, "step": 2475 }, { "epoch": 1.0796000000000001, "grad_norm": 2.839418649673462, "learning_rate": 3.4539473684210524e-06, "loss": 0.1065, "step": 2500 }, { "epoch": 1.0796000000000001, "eval_loss": 0.1456422209739685, "eval_runtime": 4133.4016, "eval_samples_per_second": 3.298, "eval_steps_per_second": 0.412, "eval_wer": 9.869361281102277, "step": 2500 }, { "epoch": 1.0821, "grad_norm": 3.4274985790252686, "learning_rate": 3.4424342105263154e-06, "loss": 0.1067, "step": 2525 }, { "epoch": 1.0846, "grad_norm": 2.2946057319641113, "learning_rate": 3.4309210526315784e-06, "loss": 0.1038, "step": 2550 }, { "epoch": 1.0871, "grad_norm": 2.5364551544189453, "learning_rate": 3.419407894736842e-06, "loss": 0.1073, "step": 2575 }, { "epoch": 1.0896, "grad_norm": 2.9779515266418457, "learning_rate": 3.4083552631578944e-06, "loss": 0.1067, "step": 2600 }, { "epoch": 1.0921, "grad_norm": 2.502685308456421, "learning_rate": 3.3968421052631574e-06, "loss": 0.1229, "step": 2625 }, { "epoch": 1.0946, "grad_norm": 2.181756019592285, "learning_rate": 3.3853289473684205e-06, "loss": 0.1071, "step": 2650 }, { "epoch": 1.0971, "grad_norm": 2.428738594055176, "learning_rate": 3.3738157894736843e-06, "loss": 0.101, "step": 2675 }, { "epoch": 1.0996, "grad_norm": 3.797952651977539, "learning_rate": 3.3623026315789473e-06, "loss": 0.1198, "step": 2700 }, { "epoch": 1.1021, "grad_norm": 2.9902758598327637, "learning_rate": 3.3507894736842103e-06, "loss": 0.1013, "step": 2725 }, { "epoch": 1.1046, "grad_norm": 3.0514307022094727, "learning_rate": 3.3392763157894734e-06, "loss": 0.1075, "step": 2750 }, { "epoch": 1.1071, "grad_norm": 3.2877554893493652, "learning_rate": 3.327763157894737e-06, "loss": 0.1059, "step": 2775 }, { "epoch": 1.1096, "grad_norm": 2.3952691555023193, "learning_rate": 3.31625e-06, "loss": 0.0926, "step": 2800 }, { "epoch": 1.1121, "grad_norm": 2.2840464115142822, "learning_rate": 3.304736842105263e-06, "loss": 0.1048, "step": 2825 }, { "epoch": 1.1146, "grad_norm": 2.7062416076660156, "learning_rate": 3.293223684210526e-06, "loss": 0.1049, "step": 2850 }, { "epoch": 1.1171, "grad_norm": 2.971315860748291, "learning_rate": 3.2817105263157893e-06, "loss": 0.1073, "step": 2875 }, { "epoch": 1.1196, "grad_norm": 2.8689844608306885, "learning_rate": 3.2701973684210523e-06, "loss": 0.1141, "step": 2900 }, { "epoch": 1.1221, "grad_norm": 3.6150734424591064, "learning_rate": 3.2586842105263153e-06, "loss": 0.1066, "step": 2925 }, { "epoch": 1.1246, "grad_norm": 2.3004024028778076, "learning_rate": 3.2471710526315783e-06, "loss": 0.1248, "step": 2950 }, { "epoch": 1.1271, "grad_norm": 2.5995240211486816, "learning_rate": 3.2356578947368417e-06, "loss": 0.0972, "step": 2975 }, { "epoch": 1.1296, "grad_norm": 2.957960367202759, "learning_rate": 3.224144736842105e-06, "loss": 0.106, "step": 3000 }, { "epoch": 1.1296, "eval_loss": 0.13624447584152222, "eval_runtime": 4123.4662, "eval_samples_per_second": 3.305, "eval_steps_per_second": 0.413, "eval_wer": 9.09249148008355, "step": 3000 }, { "epoch": 1.1320999999999999, "grad_norm": 2.653007984161377, "learning_rate": 3.212631578947368e-06, "loss": 0.1083, "step": 3025 }, { "epoch": 1.1346, "grad_norm": 2.6895744800567627, "learning_rate": 3.2011184210526316e-06, "loss": 0.1119, "step": 3050 }, { "epoch": 1.1371, "grad_norm": 2.1507463455200195, "learning_rate": 3.1896052631578946e-06, "loss": 0.0944, "step": 3075 }, { "epoch": 1.1396, "grad_norm": 3.61063289642334, "learning_rate": 3.1780921052631576e-06, "loss": 0.095, "step": 3100 }, { "epoch": 1.1421000000000001, "grad_norm": 2.570584774017334, "learning_rate": 3.1665789473684206e-06, "loss": 0.1076, "step": 3125 }, { "epoch": 1.1446, "grad_norm": 3.05507230758667, "learning_rate": 3.155065789473684e-06, "loss": 0.1175, "step": 3150 }, { "epoch": 1.1471, "grad_norm": 2.82817006111145, "learning_rate": 3.143552631578947e-06, "loss": 0.0965, "step": 3175 }, { "epoch": 1.1496, "grad_norm": 2.336517572402954, "learning_rate": 3.13203947368421e-06, "loss": 0.0955, "step": 3200 }, { "epoch": 1.1521, "grad_norm": 3.8640036582946777, "learning_rate": 3.120526315789473e-06, "loss": 0.1044, "step": 3225 }, { "epoch": 1.1546, "grad_norm": 3.7205588817596436, "learning_rate": 3.1090131578947366e-06, "loss": 0.1013, "step": 3250 }, { "epoch": 1.1571, "grad_norm": 2.1962900161743164, "learning_rate": 3.0974999999999996e-06, "loss": 0.0978, "step": 3275 }, { "epoch": 1.1596, "grad_norm": 3.3310599327087402, "learning_rate": 3.0859868421052626e-06, "loss": 0.1089, "step": 3300 }, { "epoch": 1.1621, "grad_norm": 2.699566602706909, "learning_rate": 3.074473684210526e-06, "loss": 0.1078, "step": 3325 }, { "epoch": 1.1646, "grad_norm": 3.79370379447937, "learning_rate": 3.0629605263157894e-06, "loss": 0.1118, "step": 3350 }, { "epoch": 1.1671, "grad_norm": 1.9741384983062744, "learning_rate": 3.0514473684210525e-06, "loss": 0.1119, "step": 3375 }, { "epoch": 1.1696, "grad_norm": 2.29034686088562, "learning_rate": 3.0399342105263155e-06, "loss": 0.1015, "step": 3400 }, { "epoch": 2.0017, "grad_norm": 2.011443853378296, "learning_rate": 3.028421052631579e-06, "loss": 0.0708, "step": 3425 }, { "epoch": 2.0042, "grad_norm": 1.2196134328842163, "learning_rate": 3.016907894736842e-06, "loss": 0.0668, "step": 3450 }, { "epoch": 2.0067, "grad_norm": 2.863933563232422, "learning_rate": 3.005394736842105e-06, "loss": 0.0673, "step": 3475 }, { "epoch": 2.0092, "grad_norm": 1.9341013431549072, "learning_rate": 2.9938815789473684e-06, "loss": 0.0718, "step": 3500 }, { "epoch": 2.0092, "eval_loss": 0.13255682587623596, "eval_runtime": 4133.4892, "eval_samples_per_second": 3.297, "eval_steps_per_second": 0.412, "eval_wer": 8.542819451060867, "step": 3500 }, { "epoch": 2.0117, "grad_norm": 2.795734405517578, "learning_rate": 2.9823684210526314e-06, "loss": 0.071, "step": 3525 }, { "epoch": 2.0142, "grad_norm": 1.982479214668274, "learning_rate": 2.9708552631578944e-06, "loss": 0.0629, "step": 3550 }, { "epoch": 2.0167, "grad_norm": 3.168161630630493, "learning_rate": 2.9593421052631574e-06, "loss": 0.0593, "step": 3575 }, { "epoch": 2.0192, "grad_norm": 2.259500741958618, "learning_rate": 2.947828947368421e-06, "loss": 0.0696, "step": 3600 }, { "epoch": 2.0217, "grad_norm": 2.1626062393188477, "learning_rate": 2.936315789473684e-06, "loss": 0.0687, "step": 3625 }, { "epoch": 2.0242, "grad_norm": 2.4419946670532227, "learning_rate": 2.924802631578947e-06, "loss": 0.0686, "step": 3650 }, { "epoch": 2.0267, "grad_norm": 2.445758819580078, "learning_rate": 2.9132894736842103e-06, "loss": 0.0631, "step": 3675 }, { "epoch": 2.0292, "grad_norm": 2.614476442337036, "learning_rate": 2.9017763157894737e-06, "loss": 0.0647, "step": 3700 }, { "epoch": 2.0317, "grad_norm": 1.4166672229766846, "learning_rate": 2.8902631578947367e-06, "loss": 0.0653, "step": 3725 }, { "epoch": 2.0342, "grad_norm": 1.8435245752334595, "learning_rate": 2.8787499999999998e-06, "loss": 0.0567, "step": 3750 }, { "epoch": 2.0367, "grad_norm": 1.8179950714111328, "learning_rate": 2.867236842105263e-06, "loss": 0.0636, "step": 3775 }, { "epoch": 2.0392, "grad_norm": 1.487122893333435, "learning_rate": 2.855723684210526e-06, "loss": 0.0598, "step": 3800 }, { "epoch": 2.0417, "grad_norm": 2.9211690425872803, "learning_rate": 2.8442105263157892e-06, "loss": 0.0599, "step": 3825 }, { "epoch": 2.0442, "grad_norm": 2.5018093585968018, "learning_rate": 2.8326973684210522e-06, "loss": 0.055, "step": 3850 }, { "epoch": 2.0467, "grad_norm": 2.186502456665039, "learning_rate": 2.8211842105263157e-06, "loss": 0.0533, "step": 3875 }, { "epoch": 2.0492, "grad_norm": 1.039233922958374, "learning_rate": 2.8096710526315787e-06, "loss": 0.0514, "step": 3900 }, { "epoch": 2.0517, "grad_norm": 1.871267557144165, "learning_rate": 2.7981578947368417e-06, "loss": 0.0512, "step": 3925 }, { "epoch": 2.0542, "grad_norm": 2.0849483013153076, "learning_rate": 2.7866447368421047e-06, "loss": 0.0579, "step": 3950 }, { "epoch": 2.0567, "grad_norm": 1.6887531280517578, "learning_rate": 2.775131578947368e-06, "loss": 0.0575, "step": 3975 }, { "epoch": 2.0592, "grad_norm": 1.88097083568573, "learning_rate": 2.763618421052631e-06, "loss": 0.0683, "step": 4000 }, { "epoch": 2.0592, "eval_loss": 0.1342601627111435, "eval_runtime": 4125.8373, "eval_samples_per_second": 3.304, "eval_steps_per_second": 0.413, "eval_wer": 8.485103888013485, "step": 4000 }, { "epoch": 2.0617, "grad_norm": 2.1877427101135254, "learning_rate": 2.7521052631578946e-06, "loss": 0.0614, "step": 4025 }, { "epoch": 2.0642, "grad_norm": 1.4176368713378906, "learning_rate": 2.740592105263158e-06, "loss": 0.0559, "step": 4050 }, { "epoch": 2.0667, "grad_norm": 2.4362101554870605, "learning_rate": 2.729078947368421e-06, "loss": 0.0593, "step": 4075 }, { "epoch": 2.0692, "grad_norm": 1.8663033246994019, "learning_rate": 2.717565789473684e-06, "loss": 0.0591, "step": 4100 }, { "epoch": 2.0717, "grad_norm": 1.627626657485962, "learning_rate": 2.706052631578947e-06, "loss": 0.0637, "step": 4125 }, { "epoch": 2.0742, "grad_norm": 2.2072463035583496, "learning_rate": 2.6945394736842105e-06, "loss": 0.0571, "step": 4150 }, { "epoch": 2.0767, "grad_norm": 1.7411611080169678, "learning_rate": 2.6830263157894735e-06, "loss": 0.0588, "step": 4175 }, { "epoch": 2.0792, "grad_norm": 1.324000358581543, "learning_rate": 2.6715131578947365e-06, "loss": 0.0482, "step": 4200 }, { "epoch": 2.0817, "grad_norm": 1.4138795137405396, "learning_rate": 2.6599999999999995e-06, "loss": 0.0477, "step": 4225 }, { "epoch": 2.0842, "grad_norm": 2.403547763824463, "learning_rate": 2.648486842105263e-06, "loss": 0.0558, "step": 4250 }, { "epoch": 2.0867, "grad_norm": 1.3718703985214233, "learning_rate": 2.636973684210526e-06, "loss": 0.0546, "step": 4275 }, { "epoch": 2.0892, "grad_norm": 2.296445369720459, "learning_rate": 2.625460526315789e-06, "loss": 0.0554, "step": 4300 }, { "epoch": 2.0917, "grad_norm": 2.3471312522888184, "learning_rate": 2.613947368421052e-06, "loss": 0.051, "step": 4325 }, { "epoch": 2.0942, "grad_norm": 1.6061975955963135, "learning_rate": 2.602434210526316e-06, "loss": 0.0548, "step": 4350 }, { "epoch": 2.0967, "grad_norm": 2.979126453399658, "learning_rate": 2.590921052631579e-06, "loss": 0.0492, "step": 4375 }, { "epoch": 2.0992, "grad_norm": 1.7963169813156128, "learning_rate": 2.579407894736842e-06, "loss": 0.0514, "step": 4400 }, { "epoch": 2.1017, "grad_norm": 2.4996039867401123, "learning_rate": 2.5678947368421053e-06, "loss": 0.0399, "step": 4425 }, { "epoch": 2.1042, "grad_norm": 1.7498191595077515, "learning_rate": 2.5563815789473683e-06, "loss": 0.0522, "step": 4450 }, { "epoch": 2.1067, "grad_norm": 1.413889765739441, "learning_rate": 2.5448684210526313e-06, "loss": 0.0517, "step": 4475 }, { "epoch": 2.1092, "grad_norm": 2.0956978797912598, "learning_rate": 2.5333552631578943e-06, "loss": 0.0482, "step": 4500 }, { "epoch": 2.1092, "eval_loss": 0.1336347758769989, "eval_runtime": 4119.9162, "eval_samples_per_second": 3.308, "eval_steps_per_second": 0.414, "eval_wer": 8.104914067939463, "step": 4500 }, { "epoch": 2.1117, "grad_norm": 3.138298749923706, "learning_rate": 2.5218421052631578e-06, "loss": 0.0568, "step": 4525 }, { "epoch": 2.1142, "grad_norm": 1.4262772798538208, "learning_rate": 2.510328947368421e-06, "loss": 0.0475, "step": 4550 }, { "epoch": 2.1167, "grad_norm": 3.3500139713287354, "learning_rate": 2.498815789473684e-06, "loss": 0.0474, "step": 4575 }, { "epoch": 2.1192, "grad_norm": 4.509912014007568, "learning_rate": 2.4873026315789472e-06, "loss": 0.0586, "step": 4600 }, { "epoch": 2.1217, "grad_norm": 2.1386468410491943, "learning_rate": 2.4757894736842102e-06, "loss": 0.062, "step": 4625 }, { "epoch": 2.1242, "grad_norm": 1.1121129989624023, "learning_rate": 2.4642763157894733e-06, "loss": 0.0563, "step": 4650 }, { "epoch": 2.1267, "grad_norm": 1.677538514137268, "learning_rate": 2.4527631578947363e-06, "loss": 0.0519, "step": 4675 }, { "epoch": 2.1292, "grad_norm": 1.579513430595398, "learning_rate": 2.44125e-06, "loss": 0.0544, "step": 4700 }, { "epoch": 2.1317, "grad_norm": 2.1100914478302, "learning_rate": 2.429736842105263e-06, "loss": 0.0578, "step": 4725 }, { "epoch": 2.1342, "grad_norm": 1.779682993888855, "learning_rate": 2.418223684210526e-06, "loss": 0.0486, "step": 4750 }, { "epoch": 2.1367, "grad_norm": 1.7443439960479736, "learning_rate": 2.4067105263157896e-06, "loss": 0.0534, "step": 4775 }, { "epoch": 2.1391999999999998, "grad_norm": 1.9388935565948486, "learning_rate": 2.3951973684210526e-06, "loss": 0.0516, "step": 4800 }, { "epoch": 2.1417, "grad_norm": 1.82517409324646, "learning_rate": 2.3836842105263156e-06, "loss": 0.0451, "step": 4825 }, { "epoch": 2.1442, "grad_norm": 1.9101967811584473, "learning_rate": 2.3721710526315786e-06, "loss": 0.0546, "step": 4850 }, { "epoch": 2.1467, "grad_norm": 1.7242915630340576, "learning_rate": 2.360657894736842e-06, "loss": 0.0495, "step": 4875 }, { "epoch": 2.1492, "grad_norm": 1.9127079248428345, "learning_rate": 2.349144736842105e-06, "loss": 0.0465, "step": 4900 }, { "epoch": 2.1517, "grad_norm": 2.7716519832611084, "learning_rate": 2.337631578947368e-06, "loss": 0.0493, "step": 4925 }, { "epoch": 2.1542, "grad_norm": 3.141706705093384, "learning_rate": 2.326118421052631e-06, "loss": 0.046, "step": 4950 }, { "epoch": 2.1567, "grad_norm": 2.2624270915985107, "learning_rate": 2.3146052631578945e-06, "loss": 0.0522, "step": 4975 }, { "epoch": 2.1592000000000002, "grad_norm": 1.2777652740478516, "learning_rate": 2.3030921052631575e-06, "loss": 0.0548, "step": 5000 }, { "epoch": 2.1592000000000002, "eval_loss": 0.13162237405776978, "eval_runtime": 4127.2085, "eval_samples_per_second": 3.302, "eval_steps_per_second": 0.413, "eval_wer": 7.9244384184103485, "step": 5000 }, { "epoch": 2.1617, "grad_norm": 2.106818675994873, "learning_rate": 2.2915789473684206e-06, "loss": 0.0527, "step": 5025 }, { "epoch": 2.1642, "grad_norm": 2.2705554962158203, "learning_rate": 2.2800657894736844e-06, "loss": 0.0483, "step": 5050 }, { "epoch": 2.1667, "grad_norm": 1.5468271970748901, "learning_rate": 2.2685526315789474e-06, "loss": 0.0516, "step": 5075 }, { "epoch": 2.1692, "grad_norm": 2.0331270694732666, "learning_rate": 2.2570394736842104e-06, "loss": 0.0551, "step": 5100 }, { "epoch": 3.0013, "grad_norm": 1.107423186302185, "learning_rate": 2.2455263157894734e-06, "loss": 0.0434, "step": 5125 }, { "epoch": 3.0038, "grad_norm": 3.9103100299835205, "learning_rate": 2.234013157894737e-06, "loss": 0.0362, "step": 5150 }, { "epoch": 3.0063, "grad_norm": 1.193088173866272, "learning_rate": 2.2225e-06, "loss": 0.0327, "step": 5175 }, { "epoch": 3.0088, "grad_norm": 1.0432852506637573, "learning_rate": 2.210986842105263e-06, "loss": 0.0326, "step": 5200 }, { "epoch": 3.0113, "grad_norm": 0.7116020917892456, "learning_rate": 2.199473684210526e-06, "loss": 0.0296, "step": 5225 }, { "epoch": 3.0138, "grad_norm": 2.009617805480957, "learning_rate": 2.1879605263157894e-06, "loss": 0.0367, "step": 5250 }, { "epoch": 3.0163, "grad_norm": 1.9047244787216187, "learning_rate": 2.1764473684210524e-06, "loss": 0.0347, "step": 5275 }, { "epoch": 3.0188, "grad_norm": 1.630439043045044, "learning_rate": 2.164934210526316e-06, "loss": 0.0291, "step": 5300 }, { "epoch": 3.0213, "grad_norm": 1.4158824682235718, "learning_rate": 2.153421052631579e-06, "loss": 0.0321, "step": 5325 }, { "epoch": 3.0238, "grad_norm": 1.2792794704437256, "learning_rate": 2.141907894736842e-06, "loss": 0.0338, "step": 5350 }, { "epoch": 3.0263, "grad_norm": 1.6505346298217773, "learning_rate": 2.1303947368421053e-06, "loss": 0.0348, "step": 5375 }, { "epoch": 3.0288, "grad_norm": 1.5343618392944336, "learning_rate": 2.1188815789473683e-06, "loss": 0.0318, "step": 5400 }, { "epoch": 3.0313, "grad_norm": 1.8325493335723877, "learning_rate": 2.1073684210526313e-06, "loss": 0.0333, "step": 5425 }, { "epoch": 3.0338, "grad_norm": 1.7224900722503662, "learning_rate": 2.0958552631578943e-06, "loss": 0.0322, "step": 5450 }, { "epoch": 3.0362999999999998, "grad_norm": 1.3443737030029297, "learning_rate": 2.0843421052631577e-06, "loss": 0.0304, "step": 5475 }, { "epoch": 3.0388, "grad_norm": 1.3260679244995117, "learning_rate": 2.0728289473684207e-06, "loss": 0.0282, "step": 5500 }, { "epoch": 3.0388, "eval_loss": 0.13909843564033508, "eval_runtime": 4135.2147, "eval_samples_per_second": 3.296, "eval_steps_per_second": 0.412, "eval_wer": 7.8181684927992965, "step": 5500 }, { "epoch": 3.0413, "grad_norm": 1.0075204372406006, "learning_rate": 2.061315789473684e-06, "loss": 0.0308, "step": 5525 }, { "epoch": 3.0438, "grad_norm": 1.0206842422485352, "learning_rate": 2.049802631578947e-06, "loss": 0.0306, "step": 5550 }, { "epoch": 3.0463, "grad_norm": 1.411301851272583, "learning_rate": 2.03828947368421e-06, "loss": 0.0243, "step": 5575 }, { "epoch": 3.0488, "grad_norm": 0.959862470626831, "learning_rate": 2.0267763157894732e-06, "loss": 0.0272, "step": 5600 }, { "epoch": 3.0513, "grad_norm": 2.2999842166900635, "learning_rate": 2.0152631578947367e-06, "loss": 0.0246, "step": 5625 }, { "epoch": 3.0538, "grad_norm": 2.890066146850586, "learning_rate": 2.00375e-06, "loss": 0.0299, "step": 5650 }, { "epoch": 3.0563, "grad_norm": 1.7101376056671143, "learning_rate": 1.992236842105263e-06, "loss": 0.0322, "step": 5675 }, { "epoch": 3.0588, "grad_norm": 1.531943917274475, "learning_rate": 1.980723684210526e-06, "loss": 0.0345, "step": 5700 }, { "epoch": 3.0613, "grad_norm": 1.6334413290023804, "learning_rate": 1.969210526315789e-06, "loss": 0.032, "step": 5725 }, { "epoch": 3.0638, "grad_norm": 2.112278461456299, "learning_rate": 1.9576973684210526e-06, "loss": 0.0304, "step": 5750 }, { "epoch": 3.0663, "grad_norm": 1.7582517862319946, "learning_rate": 1.9461842105263156e-06, "loss": 0.0254, "step": 5775 }, { "epoch": 3.0688, "grad_norm": 1.3391777276992798, "learning_rate": 1.934671052631579e-06, "loss": 0.0316, "step": 5800 }, { "epoch": 3.0713, "grad_norm": 0.8350562453269958, "learning_rate": 1.923157894736842e-06, "loss": 0.0329, "step": 5825 }, { "epoch": 3.0738, "grad_norm": 0.7084619402885437, "learning_rate": 1.911644736842105e-06, "loss": 0.0325, "step": 5850 }, { "epoch": 3.0763, "grad_norm": 1.2961277961730957, "learning_rate": 1.9001315789473683e-06, "loss": 0.0313, "step": 5875 }, { "epoch": 3.0788, "grad_norm": 1.032840371131897, "learning_rate": 1.8886184210526315e-06, "loss": 0.0224, "step": 5900 }, { "epoch": 3.0813, "grad_norm": 1.2073044776916504, "learning_rate": 1.8771052631578945e-06, "loss": 0.0215, "step": 5925 }, { "epoch": 3.0838, "grad_norm": 0.8210967779159546, "learning_rate": 1.8655921052631577e-06, "loss": 0.0258, "step": 5950 }, { "epoch": 3.0863, "grad_norm": 1.5273653268814087, "learning_rate": 1.854078947368421e-06, "loss": 0.0254, "step": 5975 }, { "epoch": 3.0888, "grad_norm": 3.194197177886963, "learning_rate": 1.8425657894736842e-06, "loss": 0.025, "step": 6000 }, { "epoch": 3.0888, "eval_loss": 0.14247554540634155, "eval_runtime": 4123.5746, "eval_samples_per_second": 3.305, "eval_steps_per_second": 0.413, "eval_wer": 7.940928579281029, "step": 6000 }, { "epoch": 3.0913, "grad_norm": 2.1373400688171387, "learning_rate": 1.8310526315789472e-06, "loss": 0.031, "step": 6025 }, { "epoch": 3.0938, "grad_norm": 1.0779415369033813, "learning_rate": 1.8195394736842104e-06, "loss": 0.024, "step": 6050 }, { "epoch": 3.0963, "grad_norm": 0.9637121558189392, "learning_rate": 1.8080263157894734e-06, "loss": 0.0282, "step": 6075 }, { "epoch": 3.0987999999999998, "grad_norm": 1.1645703315734863, "learning_rate": 1.7965131578947366e-06, "loss": 0.0278, "step": 6100 }, { "epoch": 3.1013, "grad_norm": 1.2814173698425293, "learning_rate": 1.7849999999999996e-06, "loss": 0.0199, "step": 6125 }, { "epoch": 3.1038, "grad_norm": 1.458809494972229, "learning_rate": 1.773486842105263e-06, "loss": 0.0264, "step": 6150 }, { "epoch": 3.1063, "grad_norm": 1.6669671535491943, "learning_rate": 1.7619736842105263e-06, "loss": 0.0272, "step": 6175 }, { "epoch": 3.1088, "grad_norm": 1.5049173831939697, "learning_rate": 1.7504605263157893e-06, "loss": 0.0243, "step": 6200 }, { "epoch": 3.1113, "grad_norm": 0.861107587814331, "learning_rate": 1.7389473684210525e-06, "loss": 0.0274, "step": 6225 }, { "epoch": 3.1138, "grad_norm": 1.0454998016357422, "learning_rate": 1.7274342105263155e-06, "loss": 0.0258, "step": 6250 }, { "epoch": 3.1163, "grad_norm": 1.7108014822006226, "learning_rate": 1.7159210526315788e-06, "loss": 0.0259, "step": 6275 }, { "epoch": 3.1188, "grad_norm": 0.8804712295532227, "learning_rate": 1.704407894736842e-06, "loss": 0.0255, "step": 6300 }, { "epoch": 3.1213, "grad_norm": 2.0050883293151855, "learning_rate": 1.6928947368421052e-06, "loss": 0.0304, "step": 6325 }, { "epoch": 3.1238, "grad_norm": 1.4400875568389893, "learning_rate": 1.6813815789473682e-06, "loss": 0.0333, "step": 6350 }, { "epoch": 3.1263, "grad_norm": 1.4423948526382446, "learning_rate": 1.6698684210526315e-06, "loss": 0.0279, "step": 6375 }, { "epoch": 3.1288, "grad_norm": 1.3972327709197998, "learning_rate": 1.6583552631578947e-06, "loss": 0.0255, "step": 6400 }, { "epoch": 3.1313, "grad_norm": 1.6908966302871704, "learning_rate": 1.6468421052631577e-06, "loss": 0.0267, "step": 6425 }, { "epoch": 3.1338, "grad_norm": 0.9540082216262817, "learning_rate": 1.635328947368421e-06, "loss": 0.0265, "step": 6450 }, { "epoch": 3.1363, "grad_norm": 1.41488778591156, "learning_rate": 1.6238157894736841e-06, "loss": 0.0224, "step": 6475 }, { "epoch": 3.1388, "grad_norm": 0.4790860116481781, "learning_rate": 1.6123026315789474e-06, "loss": 0.0274, "step": 6500 }, { "epoch": 3.1388, "eval_loss": 0.13914132118225098, "eval_runtime": 4133.8202, "eval_samples_per_second": 3.297, "eval_steps_per_second": 0.412, "eval_wer": 7.731137088204039, "step": 6500 }, { "epoch": 3.1413, "grad_norm": 2.5638585090637207, "learning_rate": 1.6007894736842104e-06, "loss": 0.025, "step": 6525 }, { "epoch": 3.1438, "grad_norm": 1.8847306966781616, "learning_rate": 1.5892763157894736e-06, "loss": 0.0294, "step": 6550 }, { "epoch": 3.1463, "grad_norm": 1.0196236371994019, "learning_rate": 1.5777631578947366e-06, "loss": 0.0255, "step": 6575 }, { "epoch": 3.1488, "grad_norm": 1.0703202486038208, "learning_rate": 1.5662499999999998e-06, "loss": 0.0246, "step": 6600 }, { "epoch": 3.1513, "grad_norm": 2.646519422531128, "learning_rate": 1.5547368421052628e-06, "loss": 0.0213, "step": 6625 }, { "epoch": 3.1538, "grad_norm": 1.7430530786514282, "learning_rate": 1.5432236842105263e-06, "loss": 0.0267, "step": 6650 }, { "epoch": 3.1563, "grad_norm": 1.0606240034103394, "learning_rate": 1.5317105263157895e-06, "loss": 0.0269, "step": 6675 }, { "epoch": 3.1588, "grad_norm": 1.4670476913452148, "learning_rate": 1.5201973684210525e-06, "loss": 0.0271, "step": 6700 }, { "epoch": 3.1612999999999998, "grad_norm": 2.345014810562134, "learning_rate": 1.5086842105263157e-06, "loss": 0.0252, "step": 6725 }, { "epoch": 3.1638, "grad_norm": 2.9098987579345703, "learning_rate": 1.4971710526315787e-06, "loss": 0.0272, "step": 6750 }, { "epoch": 3.1663, "grad_norm": 0.5682694911956787, "learning_rate": 1.485657894736842e-06, "loss": 0.0237, "step": 6775 }, { "epoch": 3.1688, "grad_norm": 1.4645904302597046, "learning_rate": 1.4746052631578947e-06, "loss": 0.0303, "step": 6800 }, { "epoch": 4.0009, "grad_norm": 1.3764489889144897, "learning_rate": 1.4630921052631578e-06, "loss": 0.0242, "step": 6825 }, { "epoch": 4.0034, "grad_norm": 0.8848748803138733, "learning_rate": 1.451578947368421e-06, "loss": 0.0163, "step": 6850 }, { "epoch": 4.0059, "grad_norm": 0.619125485420227, "learning_rate": 1.440065789473684e-06, "loss": 0.0188, "step": 6875 }, { "epoch": 4.0084, "grad_norm": 0.9328649044036865, "learning_rate": 1.4285526315789472e-06, "loss": 0.0173, "step": 6900 }, { "epoch": 4.0109, "grad_norm": 1.77474045753479, "learning_rate": 1.4170394736842104e-06, "loss": 0.0146, "step": 6925 }, { "epoch": 4.0134, "grad_norm": 1.3934537172317505, "learning_rate": 1.4055263157894737e-06, "loss": 0.0156, "step": 6950 }, { "epoch": 4.0159, "grad_norm": 1.2856354713439941, "learning_rate": 1.3940131578947367e-06, "loss": 0.0173, "step": 6975 }, { "epoch": 4.0184, "grad_norm": 2.1229758262634277, "learning_rate": 1.3824999999999999e-06, "loss": 0.0155, "step": 7000 }, { "epoch": 4.0184, "eval_loss": 0.14916160702705383, "eval_runtime": 4128.7355, "eval_samples_per_second": 3.301, "eval_steps_per_second": 0.413, "eval_wer": 7.697240646414307, "step": 7000 }, { "epoch": 4.0209, "grad_norm": 0.44512999057769775, "learning_rate": 1.3709868421052631e-06, "loss": 0.0153, "step": 7025 }, { "epoch": 4.0234, "grad_norm": 1.8791674375534058, "learning_rate": 1.3594736842105261e-06, "loss": 0.0165, "step": 7050 }, { "epoch": 4.0259, "grad_norm": 5.244405746459961, "learning_rate": 1.3479605263157894e-06, "loss": 0.0179, "step": 7075 }, { "epoch": 4.0284, "grad_norm": 1.1926153898239136, "learning_rate": 1.3364473684210526e-06, "loss": 0.0161, "step": 7100 }, { "epoch": 4.0309, "grad_norm": 1.1147819757461548, "learning_rate": 1.3249342105263158e-06, "loss": 0.015, "step": 7125 }, { "epoch": 4.0334, "grad_norm": 1.9370721578598022, "learning_rate": 1.3134210526315788e-06, "loss": 0.0142, "step": 7150 }, { "epoch": 4.0359, "grad_norm": 0.49344903230667114, "learning_rate": 1.301907894736842e-06, "loss": 0.0134, "step": 7175 }, { "epoch": 4.0384, "grad_norm": 1.8190902471542358, "learning_rate": 1.290394736842105e-06, "loss": 0.0168, "step": 7200 }, { "epoch": 4.0409, "grad_norm": 0.7560425400733948, "learning_rate": 1.2788815789473683e-06, "loss": 0.0143, "step": 7225 }, { "epoch": 4.0434, "grad_norm": 1.0451087951660156, "learning_rate": 1.2673684210526313e-06, "loss": 0.0149, "step": 7250 }, { "epoch": 4.0459, "grad_norm": 1.0334726572036743, "learning_rate": 1.2558552631578947e-06, "loss": 0.0136, "step": 7275 }, { "epoch": 4.0484, "grad_norm": 0.6531663537025452, "learning_rate": 1.244342105263158e-06, "loss": 0.0137, "step": 7300 }, { "epoch": 4.0509, "grad_norm": 0.8954887986183167, "learning_rate": 1.232828947368421e-06, "loss": 0.0118, "step": 7325 }, { "epoch": 4.0534, "grad_norm": 1.0640511512756348, "learning_rate": 1.2213157894736842e-06, "loss": 0.0126, "step": 7350 }, { "epoch": 4.0559, "grad_norm": 0.2824617922306061, "learning_rate": 1.2098026315789472e-06, "loss": 0.0139, "step": 7375 }, { "epoch": 4.0584, "grad_norm": 1.0095443725585938, "learning_rate": 1.1982894736842104e-06, "loss": 0.018, "step": 7400 }, { "epoch": 4.0609, "grad_norm": 1.1475225687026978, "learning_rate": 1.1867763157894734e-06, "loss": 0.0133, "step": 7425 }, { "epoch": 4.0634, "grad_norm": 1.5951991081237793, "learning_rate": 1.1752631578947369e-06, "loss": 0.013, "step": 7450 }, { "epoch": 4.0659, "grad_norm": 0.3482917249202728, "learning_rate": 1.1637499999999999e-06, "loss": 0.0154, "step": 7475 }, { "epoch": 4.0684, "grad_norm": 1.1572391986846924, "learning_rate": 1.152236842105263e-06, "loss": 0.0189, "step": 7500 }, { "epoch": 4.0684, "eval_loss": 0.15172211825847626, "eval_runtime": 4117.5679, "eval_samples_per_second": 3.31, "eval_steps_per_second": 0.414, "eval_wer": 7.656931364285977, "step": 7500 }, { "epoch": 4.0709, "grad_norm": 1.3942557573318481, "learning_rate": 1.140723684210526e-06, "loss": 0.0143, "step": 7525 }, { "epoch": 4.0734, "grad_norm": 0.8097572326660156, "learning_rate": 1.1292105263157893e-06, "loss": 0.0127, "step": 7550 }, { "epoch": 4.0759, "grad_norm": 0.740375816822052, "learning_rate": 1.1176973684210526e-06, "loss": 0.0124, "step": 7575 }, { "epoch": 4.0784, "grad_norm": 0.8702480792999268, "learning_rate": 1.1061842105263156e-06, "loss": 0.0137, "step": 7600 }, { "epoch": 4.0809, "grad_norm": 1.223105788230896, "learning_rate": 1.094671052631579e-06, "loss": 0.0137, "step": 7625 }, { "epoch": 4.0834, "grad_norm": 0.43614983558654785, "learning_rate": 1.083157894736842e-06, "loss": 0.0109, "step": 7650 }, { "epoch": 4.0859, "grad_norm": 1.0974986553192139, "learning_rate": 1.0716447368421052e-06, "loss": 0.0118, "step": 7675 }, { "epoch": 4.0884, "grad_norm": 0.7234652042388916, "learning_rate": 1.0601315789473682e-06, "loss": 0.0125, "step": 7700 }, { "epoch": 4.0909, "grad_norm": 0.7752431035041809, "learning_rate": 1.0486184210526315e-06, "loss": 0.0135, "step": 7725 }, { "epoch": 4.0934, "grad_norm": 0.8796952366828918, "learning_rate": 1.0371052631578947e-06, "loss": 0.0158, "step": 7750 }, { "epoch": 4.0959, "grad_norm": 3.9135661125183105, "learning_rate": 1.0255921052631577e-06, "loss": 0.0139, "step": 7775 }, { "epoch": 4.0984, "grad_norm": 0.4837290942668915, "learning_rate": 1.014078947368421e-06, "loss": 0.0103, "step": 7800 }, { "epoch": 4.1009, "grad_norm": 1.1155998706817627, "learning_rate": 1.0025657894736842e-06, "loss": 0.0106, "step": 7825 }, { "epoch": 4.1034, "grad_norm": 2.628676652908325, "learning_rate": 9.910526315789474e-07, "loss": 0.0089, "step": 7850 }, { "epoch": 4.1059, "grad_norm": 1.716665506362915, "learning_rate": 9.795394736842104e-07, "loss": 0.0132, "step": 7875 }, { "epoch": 4.1084, "grad_norm": 1.6751716136932373, "learning_rate": 9.680263157894736e-07, "loss": 0.0137, "step": 7900 }, { "epoch": 4.1109, "grad_norm": 0.9773244261741638, "learning_rate": 9.565131578947368e-07, "loss": 0.0111, "step": 7925 }, { "epoch": 4.1134, "grad_norm": 1.44219172000885, "learning_rate": 9.45e-07, "loss": 0.0139, "step": 7950 }, { "epoch": 4.1159, "grad_norm": 0.8723123073577881, "learning_rate": 9.334868421052631e-07, "loss": 0.0117, "step": 7975 }, { "epoch": 4.1184, "grad_norm": 0.6484673023223877, "learning_rate": 9.219736842105263e-07, "loss": 0.0139, "step": 8000 }, { "epoch": 4.1184, "eval_loss": 0.15393850207328796, "eval_runtime": 4128.9341, "eval_samples_per_second": 3.301, "eval_steps_per_second": 0.413, "eval_wer": 7.626699402689728, "step": 8000 }, { "epoch": 4.1209, "grad_norm": 1.3702197074890137, "learning_rate": 9.104605263157894e-07, "loss": 0.0158, "step": 8025 }, { "epoch": 4.1234, "grad_norm": 1.425645351409912, "learning_rate": 8.989473684210525e-07, "loss": 0.0117, "step": 8050 }, { "epoch": 4.1259, "grad_norm": 1.4255399703979492, "learning_rate": 8.874342105263158e-07, "loss": 0.015, "step": 8075 }, { "epoch": 4.1284, "grad_norm": 0.6988621950149536, "learning_rate": 8.759210526315789e-07, "loss": 0.0141, "step": 8100 }, { "epoch": 4.1309, "grad_norm": 1.1563546657562256, "learning_rate": 8.64407894736842e-07, "loss": 0.0122, "step": 8125 }, { "epoch": 4.1334, "grad_norm": 1.2023714780807495, "learning_rate": 8.528947368421051e-07, "loss": 0.013, "step": 8150 }, { "epoch": 4.1359, "grad_norm": 0.9450110197067261, "learning_rate": 8.413815789473683e-07, "loss": 0.0123, "step": 8175 }, { "epoch": 4.1384, "grad_norm": 0.9265995621681213, "learning_rate": 8.298684210526316e-07, "loss": 0.0114, "step": 8200 }, { "epoch": 4.1409, "grad_norm": 0.4234980046749115, "learning_rate": 8.183552631578947e-07, "loss": 0.0085, "step": 8225 }, { "epoch": 4.1434, "grad_norm": 1.3323073387145996, "learning_rate": 8.068421052631579e-07, "loss": 0.014, "step": 8250 }, { "epoch": 4.1459, "grad_norm": 1.2050007581710815, "learning_rate": 7.95328947368421e-07, "loss": 0.0106, "step": 8275 }, { "epoch": 4.1484, "grad_norm": 1.261042594909668, "learning_rate": 7.838157894736841e-07, "loss": 0.0107, "step": 8300 }, { "epoch": 4.1509, "grad_norm": 1.2892303466796875, "learning_rate": 7.723026315789474e-07, "loss": 0.0145, "step": 8325 }, { "epoch": 4.1534, "grad_norm": 1.1626112461090088, "learning_rate": 7.607894736842105e-07, "loss": 0.0139, "step": 8350 }, { "epoch": 4.1559, "grad_norm": 1.0547322034835815, "learning_rate": 7.492763157894736e-07, "loss": 0.0154, "step": 8375 }, { "epoch": 4.1584, "grad_norm": 0.44805532693862915, "learning_rate": 7.377631578947367e-07, "loss": 0.0109, "step": 8400 }, { "epoch": 4.1609, "grad_norm": 0.7095866203308105, "learning_rate": 7.262499999999999e-07, "loss": 0.0114, "step": 8425 }, { "epoch": 4.1634, "grad_norm": 1.4220194816589355, "learning_rate": 7.14736842105263e-07, "loss": 0.0134, "step": 8450 }, { "epoch": 4.1659, "grad_norm": 1.0814168453216553, "learning_rate": 7.032236842105263e-07, "loss": 0.0142, "step": 8475 }, { "epoch": 4.1684, "grad_norm": 0.7026916146278381, "learning_rate": 6.917105263157895e-07, "loss": 0.0141, "step": 8500 }, { "epoch": 4.1684, "eval_loss": 0.15496784448623657, "eval_runtime": 4124.1829, "eval_samples_per_second": 3.305, "eval_steps_per_second": 0.413, "eval_wer": 7.542416358239584, "step": 8500 }, { "epoch": 5.0005, "grad_norm": 4.648550033569336, "learning_rate": 6.801973684210526e-07, "loss": 0.0285, "step": 8525 }, { "epoch": 5.003, "grad_norm": 1.9204503297805786, "learning_rate": 6.691447368421053e-07, "loss": 0.0761, "step": 8550 }, { "epoch": 5.0055, "grad_norm": 1.7285746335983276, "learning_rate": 6.576315789473684e-07, "loss": 0.0602, "step": 8575 }, { "epoch": 5.008, "grad_norm": 1.1516830921173096, "learning_rate": 6.461184210526315e-07, "loss": 0.0585, "step": 8600 }, { "epoch": 5.0105, "grad_norm": 3.3867828845977783, "learning_rate": 6.346052631578947e-07, "loss": 0.0656, "step": 8625 }, { "epoch": 5.013, "grad_norm": 4.064920902252197, "learning_rate": 6.230921052631579e-07, "loss": 0.0683, "step": 8650 }, { "epoch": 5.0155, "grad_norm": 3.695047378540039, "learning_rate": 6.11578947368421e-07, "loss": 0.0659, "step": 8675 }, { "epoch": 5.018, "grad_norm": 2.9087939262390137, "learning_rate": 6.000657894736842e-07, "loss": 0.0611, "step": 8700 }, { "epoch": 5.0205, "grad_norm": 3.368290424346924, "learning_rate": 5.885526315789473e-07, "loss": 0.0603, "step": 8725 }, { "epoch": 5.023, "grad_norm": 3.7565319538116455, "learning_rate": 5.770394736842104e-07, "loss": 0.0614, "step": 8750 }, { "epoch": 5.0255, "grad_norm": 2.4887771606445312, "learning_rate": 5.655263157894735e-07, "loss": 0.0497, "step": 8775 }, { "epoch": 5.028, "grad_norm": 2.1670076847076416, "learning_rate": 5.540131578947369e-07, "loss": 0.0662, "step": 8800 }, { "epoch": 5.0305, "grad_norm": 1.3746148347854614, "learning_rate": 5.425e-07, "loss": 0.0507, "step": 8825 }, { "epoch": 5.033, "grad_norm": 1.8274154663085938, "learning_rate": 5.309868421052631e-07, "loss": 0.0449, "step": 8850 }, { "epoch": 5.0355, "grad_norm": 2.9424078464508057, "learning_rate": 5.194736842105262e-07, "loss": 0.0529, "step": 8875 }, { "epoch": 5.038, "grad_norm": 2.457754611968994, "learning_rate": 5.079605263157895e-07, "loss": 0.042, "step": 8900 }, { "epoch": 5.0405, "grad_norm": 2.208768606185913, "learning_rate": 4.964473684210526e-07, "loss": 0.0407, "step": 8925 }, { "epoch": 5.043, "grad_norm": 1.9554438591003418, "learning_rate": 4.849342105263158e-07, "loss": 0.0465, "step": 8950 }, { "epoch": 5.0455, "grad_norm": 1.1464567184448242, "learning_rate": 4.734210526315789e-07, "loss": 0.0537, "step": 8975 }, { "epoch": 5.048, "grad_norm": 3.1216509342193604, "learning_rate": 4.6190789473684203e-07, "loss": 0.0368, "step": 9000 }, { "epoch": 5.048, "eval_loss": 0.12588092684745789, "eval_runtime": 4149.257, "eval_samples_per_second": 3.285, "eval_steps_per_second": 0.411, "eval_wer": 7.215361500971087, "step": 9000 }, { "epoch": 5.048, "step": 9000, "total_flos": 4.891718061785088e+20, "train_loss": 0.0, "train_runtime": 289.8068, "train_samples_per_second": 552.092, "train_steps_per_second": 34.506 } ], "logging_steps": 25, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.891718061785088e+20, "train_batch_size": 16, "trial_name": null, "trial_params": null }