|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 4.964200477326969, |
|
"eval_steps": 52, |
|
"global_step": 520, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.00954653937947494, |
|
"grad_norm": 0.541497528553009, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 0.8864, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.00954653937947494, |
|
"eval_loss": 0.975593626499176, |
|
"eval_runtime": 12.7833, |
|
"eval_samples_per_second": 13.846, |
|
"eval_steps_per_second": 1.799, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.01909307875894988, |
|
"grad_norm": 0.7477102875709534, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 0.9382, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.028639618138424822, |
|
"grad_norm": 0.8567990064620972, |
|
"learning_rate": 6e-06, |
|
"loss": 1.0294, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.03818615751789976, |
|
"grad_norm": 0.737090528011322, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 0.9398, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.0477326968973747, |
|
"grad_norm": 0.6384573578834534, |
|
"learning_rate": 1e-05, |
|
"loss": 0.9345, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.057279236276849645, |
|
"grad_norm": 0.7606269121170044, |
|
"learning_rate": 1.2e-05, |
|
"loss": 0.941, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.06682577565632458, |
|
"grad_norm": 0.7251742482185364, |
|
"learning_rate": 1.4000000000000001e-05, |
|
"loss": 0.958, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.07637231503579953, |
|
"grad_norm": 0.6773186922073364, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 0.9318, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.08591885441527446, |
|
"grad_norm": 0.8379804491996765, |
|
"learning_rate": 1.8e-05, |
|
"loss": 0.9919, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.0954653937947494, |
|
"grad_norm": 0.9668668508529663, |
|
"learning_rate": 2e-05, |
|
"loss": 1.1819, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.10501193317422435, |
|
"grad_norm": 0.5556192398071289, |
|
"learning_rate": 2.2000000000000003e-05, |
|
"loss": 0.9474, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.11455847255369929, |
|
"grad_norm": 0.6156389117240906, |
|
"learning_rate": 2.4e-05, |
|
"loss": 0.9212, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.12410501193317422, |
|
"grad_norm": 0.6029731035232544, |
|
"learning_rate": 2.6000000000000002e-05, |
|
"loss": 0.8982, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.13365155131264916, |
|
"grad_norm": 0.6002732515335083, |
|
"learning_rate": 2.8000000000000003e-05, |
|
"loss": 0.9832, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.1431980906921241, |
|
"grad_norm": 0.6266693472862244, |
|
"learning_rate": 3e-05, |
|
"loss": 0.9485, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.15274463007159905, |
|
"grad_norm": 0.5493901371955872, |
|
"learning_rate": 3.2000000000000005e-05, |
|
"loss": 0.9013, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.162291169451074, |
|
"grad_norm": 0.5467607378959656, |
|
"learning_rate": 3.4000000000000007e-05, |
|
"loss": 0.8975, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.1718377088305489, |
|
"grad_norm": 0.556983232498169, |
|
"learning_rate": 3.6e-05, |
|
"loss": 1.1175, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.18138424821002386, |
|
"grad_norm": 0.4399558901786804, |
|
"learning_rate": 3.8e-05, |
|
"loss": 0.8975, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.1909307875894988, |
|
"grad_norm": 0.4225325882434845, |
|
"learning_rate": 4e-05, |
|
"loss": 0.8186, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.20047732696897375, |
|
"grad_norm": 0.38200807571411133, |
|
"learning_rate": 4.2e-05, |
|
"loss": 0.82, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.2100238663484487, |
|
"grad_norm": 0.3150412440299988, |
|
"learning_rate": 4.4000000000000006e-05, |
|
"loss": 0.844, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.21957040572792363, |
|
"grad_norm": 0.3351333737373352, |
|
"learning_rate": 4.600000000000001e-05, |
|
"loss": 0.9004, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.22911694510739858, |
|
"grad_norm": 0.3316827416419983, |
|
"learning_rate": 4.8e-05, |
|
"loss": 0.8965, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.2386634844868735, |
|
"grad_norm": 0.31210237741470337, |
|
"learning_rate": 5e-05, |
|
"loss": 0.9084, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.24821002386634844, |
|
"grad_norm": 0.33221927285194397, |
|
"learning_rate": 5.2000000000000004e-05, |
|
"loss": 0.9448, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.2577565632458234, |
|
"grad_norm": 0.35407039523124695, |
|
"learning_rate": 5.4000000000000005e-05, |
|
"loss": 0.82, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.26730310262529833, |
|
"grad_norm": 0.34163302183151245, |
|
"learning_rate": 5.6000000000000006e-05, |
|
"loss": 0.8843, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.27684964200477324, |
|
"grad_norm": 0.3265129029750824, |
|
"learning_rate": 5.8e-05, |
|
"loss": 0.8376, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.2863961813842482, |
|
"grad_norm": 0.38681215047836304, |
|
"learning_rate": 6e-05, |
|
"loss": 0.9903, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.29594272076372313, |
|
"grad_norm": 0.3341940641403198, |
|
"learning_rate": 6.2e-05, |
|
"loss": 0.8054, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.3054892601431981, |
|
"grad_norm": 0.3521149158477783, |
|
"learning_rate": 6.400000000000001e-05, |
|
"loss": 0.8931, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.315035799522673, |
|
"grad_norm": 0.3392002284526825, |
|
"learning_rate": 6.6e-05, |
|
"loss": 0.7776, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.324582338902148, |
|
"grad_norm": 0.3627275824546814, |
|
"learning_rate": 6.800000000000001e-05, |
|
"loss": 0.7253, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.3341288782816229, |
|
"grad_norm": 0.33696770668029785, |
|
"learning_rate": 7e-05, |
|
"loss": 0.9011, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.3436754176610978, |
|
"grad_norm": 0.3550478518009186, |
|
"learning_rate": 7.2e-05, |
|
"loss": 0.8064, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.3532219570405728, |
|
"grad_norm": 0.3183474838733673, |
|
"learning_rate": 7.4e-05, |
|
"loss": 0.8094, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.3627684964200477, |
|
"grad_norm": 0.33637434244155884, |
|
"learning_rate": 7.6e-05, |
|
"loss": 0.8637, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.3723150357995227, |
|
"grad_norm": 0.34323257207870483, |
|
"learning_rate": 7.800000000000001e-05, |
|
"loss": 0.8717, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.3818615751789976, |
|
"grad_norm": 0.290461003780365, |
|
"learning_rate": 8e-05, |
|
"loss": 0.8501, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.3914081145584726, |
|
"grad_norm": 0.35435885190963745, |
|
"learning_rate": 8.2e-05, |
|
"loss": 0.8446, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.4009546539379475, |
|
"grad_norm": 0.30319270491600037, |
|
"learning_rate": 8.4e-05, |
|
"loss": 0.8175, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.4105011933174224, |
|
"grad_norm": 0.28563714027404785, |
|
"learning_rate": 8.6e-05, |
|
"loss": 0.7933, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.4200477326968974, |
|
"grad_norm": 0.26857540011405945, |
|
"learning_rate": 8.800000000000001e-05, |
|
"loss": 0.7177, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.4295942720763723, |
|
"grad_norm": 0.27898675203323364, |
|
"learning_rate": 9e-05, |
|
"loss": 0.7506, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.43914081145584727, |
|
"grad_norm": 0.2959842383861542, |
|
"learning_rate": 9.200000000000001e-05, |
|
"loss": 0.859, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.4486873508353222, |
|
"grad_norm": 0.2996789515018463, |
|
"learning_rate": 9.4e-05, |
|
"loss": 0.7429, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.45823389021479716, |
|
"grad_norm": 0.2459433376789093, |
|
"learning_rate": 9.6e-05, |
|
"loss": 0.6911, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.4677804295942721, |
|
"grad_norm": 0.318551629781723, |
|
"learning_rate": 9.8e-05, |
|
"loss": 0.8618, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.477326968973747, |
|
"grad_norm": 0.30586713552474976, |
|
"learning_rate": 0.0001, |
|
"loss": 0.8546, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.48687350835322196, |
|
"grad_norm": 0.33441683650016785, |
|
"learning_rate": 9.999888302765345e-05, |
|
"loss": 0.8177, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.4964200477326969, |
|
"grad_norm": 0.3031998872756958, |
|
"learning_rate": 9.99955321605189e-05, |
|
"loss": 0.8585, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.4964200477326969, |
|
"eval_loss": 0.8117080926895142, |
|
"eval_runtime": 12.9734, |
|
"eval_samples_per_second": 13.643, |
|
"eval_steps_per_second": 1.773, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.5059665871121718, |
|
"grad_norm": 0.2897348403930664, |
|
"learning_rate": 9.99899475483094e-05, |
|
"loss": 0.731, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.5155131264916468, |
|
"grad_norm": 0.29636526107788086, |
|
"learning_rate": 9.99821294405392e-05, |
|
"loss": 0.8064, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.5250596658711217, |
|
"grad_norm": 0.3027283847332001, |
|
"learning_rate": 9.997207818651274e-05, |
|
"loss": 0.7737, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.5346062052505967, |
|
"grad_norm": 0.2626173794269562, |
|
"learning_rate": 9.995979423530892e-05, |
|
"loss": 0.793, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.5441527446300716, |
|
"grad_norm": 0.2591354548931122, |
|
"learning_rate": 9.99452781357611e-05, |
|
"loss": 0.7764, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.5536992840095465, |
|
"grad_norm": 0.3225250244140625, |
|
"learning_rate": 9.992853053643257e-05, |
|
"loss": 0.8997, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.5632458233890215, |
|
"grad_norm": 0.29806965589523315, |
|
"learning_rate": 9.99095521855875e-05, |
|
"loss": 0.73, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.5727923627684964, |
|
"grad_norm": 0.28511664271354675, |
|
"learning_rate": 9.988834393115767e-05, |
|
"loss": 0.7074, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.5823389021479713, |
|
"grad_norm": 0.2897747755050659, |
|
"learning_rate": 9.986490672070437e-05, |
|
"loss": 0.8004, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.5918854415274463, |
|
"grad_norm": 0.31389617919921875, |
|
"learning_rate": 9.983924160137625e-05, |
|
"loss": 0.7397, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.6014319809069213, |
|
"grad_norm": 0.2554086744785309, |
|
"learning_rate": 9.98113497198625e-05, |
|
"loss": 0.8052, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.6109785202863962, |
|
"grad_norm": 0.2977796494960785, |
|
"learning_rate": 9.978123232234147e-05, |
|
"loss": 0.7742, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.6205250596658711, |
|
"grad_norm": 0.2979027330875397, |
|
"learning_rate": 9.974889075442521e-05, |
|
"loss": 0.7169, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.630071599045346, |
|
"grad_norm": 0.29229047894477844, |
|
"learning_rate": 9.971432646109919e-05, |
|
"loss": 0.8076, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.639618138424821, |
|
"grad_norm": 0.29552674293518066, |
|
"learning_rate": 9.967754098665778e-05, |
|
"loss": 0.7533, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.649164677804296, |
|
"grad_norm": 0.2626473605632782, |
|
"learning_rate": 9.963853597463533e-05, |
|
"loss": 0.7669, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.6587112171837709, |
|
"grad_norm": 0.28999584913253784, |
|
"learning_rate": 9.959731316773259e-05, |
|
"loss": 0.8175, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.6682577565632458, |
|
"grad_norm": 0.3006598949432373, |
|
"learning_rate": 9.9553874407739e-05, |
|
"loss": 0.822, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.6778042959427207, |
|
"grad_norm": 0.3051898181438446, |
|
"learning_rate": 9.950822163545032e-05, |
|
"loss": 0.8513, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.6873508353221957, |
|
"grad_norm": 0.3299737572669983, |
|
"learning_rate": 9.946035689058188e-05, |
|
"loss": 0.8559, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.6968973747016707, |
|
"grad_norm": 0.2913152575492859, |
|
"learning_rate": 9.941028231167756e-05, |
|
"loss": 0.7742, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.7064439140811456, |
|
"grad_norm": 0.2927692234516144, |
|
"learning_rate": 9.935800013601414e-05, |
|
"loss": 0.8335, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.7159904534606205, |
|
"grad_norm": 0.28141114115715027, |
|
"learning_rate": 9.930351269950143e-05, |
|
"loss": 0.7976, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.7255369928400954, |
|
"grad_norm": 0.2617853581905365, |
|
"learning_rate": 9.924682243657779e-05, |
|
"loss": 0.7707, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.7350835322195705, |
|
"grad_norm": 0.3233207166194916, |
|
"learning_rate": 9.918793188010147e-05, |
|
"loss": 0.8698, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.7446300715990454, |
|
"grad_norm": 0.29364439845085144, |
|
"learning_rate": 9.91268436612374e-05, |
|
"loss": 0.79, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.7541766109785203, |
|
"grad_norm": 0.3051166832447052, |
|
"learning_rate": 9.906356050933961e-05, |
|
"loss": 0.864, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.7637231503579952, |
|
"grad_norm": 0.2952063977718353, |
|
"learning_rate": 9.899808525182935e-05, |
|
"loss": 0.7312, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.7732696897374701, |
|
"grad_norm": 0.2865234911441803, |
|
"learning_rate": 9.893042081406867e-05, |
|
"loss": 0.7051, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.7828162291169452, |
|
"grad_norm": 0.3515304625034332, |
|
"learning_rate": 9.886057021922982e-05, |
|
"loss": 0.917, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.7923627684964201, |
|
"grad_norm": 0.2626941204071045, |
|
"learning_rate": 9.878853658816014e-05, |
|
"loss": 0.7131, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.801909307875895, |
|
"grad_norm": 0.276617169380188, |
|
"learning_rate": 9.871432313924255e-05, |
|
"loss": 0.7683, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.8114558472553699, |
|
"grad_norm": 0.2965279519557953, |
|
"learning_rate": 9.863793318825186e-05, |
|
"loss": 0.686, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.8210023866348448, |
|
"grad_norm": 0.3024349510669708, |
|
"learning_rate": 9.85593701482066e-05, |
|
"loss": 0.8203, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.8305489260143198, |
|
"grad_norm": 0.31996044516563416, |
|
"learning_rate": 9.847863752921649e-05, |
|
"loss": 0.8381, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.8400954653937948, |
|
"grad_norm": 0.2804871201515198, |
|
"learning_rate": 9.839573893832563e-05, |
|
"loss": 0.8378, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.8496420047732697, |
|
"grad_norm": 0.3115004301071167, |
|
"learning_rate": 9.831067807935137e-05, |
|
"loss": 0.7382, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.8591885441527446, |
|
"grad_norm": 0.3251977264881134, |
|
"learning_rate": 9.822345875271883e-05, |
|
"loss": 0.8506, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.8687350835322196, |
|
"grad_norm": 0.32227852940559387, |
|
"learning_rate": 9.813408485529103e-05, |
|
"loss": 0.7768, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.8782816229116945, |
|
"grad_norm": 0.33089524507522583, |
|
"learning_rate": 9.804256038019481e-05, |
|
"loss": 0.7763, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.8878281622911695, |
|
"grad_norm": 0.3890259563922882, |
|
"learning_rate": 9.794888941664253e-05, |
|
"loss": 0.9264, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.8973747016706444, |
|
"grad_norm": 0.31058862805366516, |
|
"learning_rate": 9.785307614974921e-05, |
|
"loss": 0.8139, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.9069212410501193, |
|
"grad_norm": 0.2780233323574066, |
|
"learning_rate": 9.775512486034563e-05, |
|
"loss": 0.8274, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.9164677804295943, |
|
"grad_norm": 0.3168707489967346, |
|
"learning_rate": 9.765503992478704e-05, |
|
"loss": 0.8425, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.9260143198090692, |
|
"grad_norm": 0.3951367139816284, |
|
"learning_rate": 9.755282581475769e-05, |
|
"loss": 0.6909, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.9355608591885441, |
|
"grad_norm": 0.3271735608577728, |
|
"learning_rate": 9.744848709707091e-05, |
|
"loss": 0.7677, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.9451073985680191, |
|
"grad_norm": 0.26784732937812805, |
|
"learning_rate": 9.734202843346522e-05, |
|
"loss": 0.6579, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.954653937947494, |
|
"grad_norm": 0.3045744001865387, |
|
"learning_rate": 9.723345458039594e-05, |
|
"loss": 0.738, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.964200477326969, |
|
"grad_norm": 0.30207037925720215, |
|
"learning_rate": 9.712277038882273e-05, |
|
"loss": 0.7435, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.9737470167064439, |
|
"grad_norm": 0.26012739539146423, |
|
"learning_rate": 9.700998080399287e-05, |
|
"loss": 0.6577, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.9832935560859188, |
|
"grad_norm": 0.3377532660961151, |
|
"learning_rate": 9.689509086522019e-05, |
|
"loss": 0.8357, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.9928400954653938, |
|
"grad_norm": 0.2794972360134125, |
|
"learning_rate": 9.67781057056601e-05, |
|
"loss": 0.7169, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.9928400954653938, |
|
"eval_loss": 0.7859669327735901, |
|
"eval_runtime": 13.0799, |
|
"eval_samples_per_second": 13.532, |
|
"eval_steps_per_second": 1.758, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 1.0023866348448687, |
|
"grad_norm": 0.34849369525909424, |
|
"learning_rate": 9.665903055208014e-05, |
|
"loss": 0.7555, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 1.0119331742243436, |
|
"grad_norm": 0.32931214570999146, |
|
"learning_rate": 9.653787072462643e-05, |
|
"loss": 0.8744, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 1.0214797136038185, |
|
"grad_norm": 0.2845034897327423, |
|
"learning_rate": 9.641463163658605e-05, |
|
"loss": 0.7014, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 1.0310262529832936, |
|
"grad_norm": 0.3317318260669708, |
|
"learning_rate": 9.628931879414517e-05, |
|
"loss": 0.6797, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 1.0405727923627686, |
|
"grad_norm": 0.3143399655818939, |
|
"learning_rate": 9.616193779614294e-05, |
|
"loss": 0.7855, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 1.0501193317422435, |
|
"grad_norm": 0.3227895498275757, |
|
"learning_rate": 9.603249433382144e-05, |
|
"loss": 0.8448, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.0596658711217184, |
|
"grad_norm": 0.29557734727859497, |
|
"learning_rate": 9.590099419057141e-05, |
|
"loss": 0.7718, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 1.0692124105011933, |
|
"grad_norm": 0.2954796254634857, |
|
"learning_rate": 9.57674432416738e-05, |
|
"loss": 0.7213, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 1.0787589498806682, |
|
"grad_norm": 0.33134913444519043, |
|
"learning_rate": 9.563184745403724e-05, |
|
"loss": 0.7673, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 1.0883054892601431, |
|
"grad_norm": 0.2836027145385742, |
|
"learning_rate": 9.549421288593157e-05, |
|
"loss": 0.6618, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 1.097852028639618, |
|
"grad_norm": 0.27743393182754517, |
|
"learning_rate": 9.535454568671704e-05, |
|
"loss": 0.5814, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 1.107398568019093, |
|
"grad_norm": 0.30395206809043884, |
|
"learning_rate": 9.521285209656962e-05, |
|
"loss": 0.7574, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 1.1169451073985681, |
|
"grad_norm": 0.3082098364830017, |
|
"learning_rate": 9.506913844620218e-05, |
|
"loss": 0.6795, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 1.126491646778043, |
|
"grad_norm": 0.34204941987991333, |
|
"learning_rate": 9.492341115658167e-05, |
|
"loss": 0.7607, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 1.136038186157518, |
|
"grad_norm": 0.2831597924232483, |
|
"learning_rate": 9.477567673864216e-05, |
|
"loss": 0.7202, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 1.1455847255369929, |
|
"grad_norm": 0.31007006764411926, |
|
"learning_rate": 9.462594179299406e-05, |
|
"loss": 0.7508, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.1551312649164678, |
|
"grad_norm": 0.36465707421302795, |
|
"learning_rate": 9.44742130096291e-05, |
|
"loss": 0.7767, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 1.1646778042959427, |
|
"grad_norm": 0.30296140909194946, |
|
"learning_rate": 9.43204971676215e-05, |
|
"loss": 0.7016, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 1.1742243436754176, |
|
"grad_norm": 0.30213695764541626, |
|
"learning_rate": 9.416480113482504e-05, |
|
"loss": 0.7981, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 1.1837708830548925, |
|
"grad_norm": 0.3844444155693054, |
|
"learning_rate": 9.400713186756625e-05, |
|
"loss": 0.8803, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 1.1933174224343674, |
|
"grad_norm": 0.35131993889808655, |
|
"learning_rate": 9.384749641033359e-05, |
|
"loss": 0.7987, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 1.2028639618138426, |
|
"grad_norm": 0.32640498876571655, |
|
"learning_rate": 9.368590189546269e-05, |
|
"loss": 0.7141, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 1.2124105011933175, |
|
"grad_norm": 0.33494001626968384, |
|
"learning_rate": 9.352235554281774e-05, |
|
"loss": 0.7661, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 1.2219570405727924, |
|
"grad_norm": 0.3069153130054474, |
|
"learning_rate": 9.335686465946887e-05, |
|
"loss": 0.7727, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 1.2315035799522673, |
|
"grad_norm": 0.34017837047576904, |
|
"learning_rate": 9.31894366393657e-05, |
|
"loss": 0.712, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 1.2410501193317423, |
|
"grad_norm": 0.3549365997314453, |
|
"learning_rate": 9.302007896300698e-05, |
|
"loss": 0.7778, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.2505966587112172, |
|
"grad_norm": 0.36418265104293823, |
|
"learning_rate": 9.284879919710632e-05, |
|
"loss": 0.7858, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 1.260143198090692, |
|
"grad_norm": 0.35619720816612244, |
|
"learning_rate": 9.267560499425424e-05, |
|
"loss": 0.7254, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 1.269689737470167, |
|
"grad_norm": 0.3609873652458191, |
|
"learning_rate": 9.250050409257611e-05, |
|
"loss": 0.702, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 1.279236276849642, |
|
"grad_norm": 0.33549803495407104, |
|
"learning_rate": 9.232350431538656e-05, |
|
"loss": 0.753, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 1.288782816229117, |
|
"grad_norm": 0.3622001111507416, |
|
"learning_rate": 9.214461357083985e-05, |
|
"loss": 0.7751, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 1.2983293556085918, |
|
"grad_norm": 0.36122390627861023, |
|
"learning_rate": 9.196383985157656e-05, |
|
"loss": 0.7293, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 1.307875894988067, |
|
"grad_norm": 0.3313043713569641, |
|
"learning_rate": 9.17811912343665e-05, |
|
"loss": 0.7719, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 1.3174224343675418, |
|
"grad_norm": 0.420242041349411, |
|
"learning_rate": 9.159667587974785e-05, |
|
"loss": 0.8639, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 1.3269689737470167, |
|
"grad_norm": 0.3835316598415375, |
|
"learning_rate": 9.141030203166256e-05, |
|
"loss": 0.7759, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 1.3365155131264916, |
|
"grad_norm": 0.3210572898387909, |
|
"learning_rate": 9.122207801708802e-05, |
|
"loss": 0.7146, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.3460620525059666, |
|
"grad_norm": 0.3777942955493927, |
|
"learning_rate": 9.103201224566498e-05, |
|
"loss": 0.6785, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 1.3556085918854415, |
|
"grad_norm": 0.36900511384010315, |
|
"learning_rate": 9.084011320932189e-05, |
|
"loss": 0.7334, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 1.3651551312649164, |
|
"grad_norm": 0.34692367911338806, |
|
"learning_rate": 9.064638948189538e-05, |
|
"loss": 0.8239, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 1.3747016706443915, |
|
"grad_norm": 0.3698657155036926, |
|
"learning_rate": 9.045084971874738e-05, |
|
"loss": 0.816, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 1.3842482100238662, |
|
"grad_norm": 0.3782055974006653, |
|
"learning_rate": 9.025350265637815e-05, |
|
"loss": 0.7642, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 1.3937947494033414, |
|
"grad_norm": 0.32594063878059387, |
|
"learning_rate": 9.005435711203618e-05, |
|
"loss": 0.6679, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 1.4033412887828163, |
|
"grad_norm": 0.31659746170043945, |
|
"learning_rate": 8.985342198332407e-05, |
|
"loss": 0.6733, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 1.4128878281622912, |
|
"grad_norm": 0.3700142502784729, |
|
"learning_rate": 8.965070624780116e-05, |
|
"loss": 0.7628, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 1.422434367541766, |
|
"grad_norm": 0.36097973585128784, |
|
"learning_rate": 8.944621896258225e-05, |
|
"loss": 0.823, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 1.431980906921241, |
|
"grad_norm": 0.34358176589012146, |
|
"learning_rate": 8.923996926393305e-05, |
|
"loss": 0.8027, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.441527446300716, |
|
"grad_norm": 0.3484276831150055, |
|
"learning_rate": 8.903196636686197e-05, |
|
"loss": 0.7654, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 1.4510739856801909, |
|
"grad_norm": 0.3297461271286011, |
|
"learning_rate": 8.882221956470836e-05, |
|
"loss": 0.7309, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 1.460620525059666, |
|
"grad_norm": 0.337780237197876, |
|
"learning_rate": 8.861073822872734e-05, |
|
"loss": 0.6041, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 1.4701670644391407, |
|
"grad_norm": 0.4128996431827545, |
|
"learning_rate": 8.839753180767108e-05, |
|
"loss": 0.755, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 1.4797136038186158, |
|
"grad_norm": 0.39525777101516724, |
|
"learning_rate": 8.818260982736661e-05, |
|
"loss": 0.893, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.4892601431980907, |
|
"grad_norm": 0.364863783121109, |
|
"learning_rate": 8.79659818902903e-05, |
|
"loss": 0.7558, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 1.4892601431980907, |
|
"eval_loss": 0.7723400592803955, |
|
"eval_runtime": 12.9849, |
|
"eval_samples_per_second": 13.631, |
|
"eval_steps_per_second": 1.771, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 1.4988066825775657, |
|
"grad_norm": 0.3498753309249878, |
|
"learning_rate": 8.774765767513875e-05, |
|
"loss": 0.6726, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 1.5083532219570406, |
|
"grad_norm": 0.3801003694534302, |
|
"learning_rate": 8.752764693639638e-05, |
|
"loss": 0.8054, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 1.5178997613365155, |
|
"grad_norm": 0.33022618293762207, |
|
"learning_rate": 8.730595950389968e-05, |
|
"loss": 0.6711, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 1.5274463007159904, |
|
"grad_norm": 0.3503524363040924, |
|
"learning_rate": 8.708260528239788e-05, |
|
"loss": 0.695, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.5369928400954653, |
|
"grad_norm": 0.3124283254146576, |
|
"learning_rate": 8.685759425111056e-05, |
|
"loss": 0.6797, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 1.5465393794749405, |
|
"grad_norm": 0.3534424602985382, |
|
"learning_rate": 8.663093646328166e-05, |
|
"loss": 0.7181, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 1.5560859188544152, |
|
"grad_norm": 0.40379753708839417, |
|
"learning_rate": 8.640264204573047e-05, |
|
"loss": 0.7923, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 1.5656324582338903, |
|
"grad_norm": 0.4083033502101898, |
|
"learning_rate": 8.617272119839903e-05, |
|
"loss": 0.7997, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 1.575178997613365, |
|
"grad_norm": 0.3316013216972351, |
|
"learning_rate": 8.594118419389647e-05, |
|
"loss": 0.7135, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.5847255369928401, |
|
"grad_norm": 0.3352660834789276, |
|
"learning_rate": 8.570804137704003e-05, |
|
"loss": 0.6358, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 1.594272076372315, |
|
"grad_norm": 0.34296879172325134, |
|
"learning_rate": 8.547330316439291e-05, |
|
"loss": 0.683, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 1.60381861575179, |
|
"grad_norm": 0.362801730632782, |
|
"learning_rate": 8.523698004379877e-05, |
|
"loss": 0.7637, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.6133651551312649, |
|
"grad_norm": 0.3877812623977661, |
|
"learning_rate": 8.499908257391323e-05, |
|
"loss": 0.6848, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 1.6229116945107398, |
|
"grad_norm": 0.3417890965938568, |
|
"learning_rate": 8.475962138373213e-05, |
|
"loss": 0.6743, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.632458233890215, |
|
"grad_norm": 0.3690805435180664, |
|
"learning_rate": 8.451860717211653e-05, |
|
"loss": 0.717, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 1.6420047732696896, |
|
"grad_norm": 0.39610129594802856, |
|
"learning_rate": 8.427605070731482e-05, |
|
"loss": 0.831, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 1.6515513126491648, |
|
"grad_norm": 0.3383614718914032, |
|
"learning_rate": 8.403196282648156e-05, |
|
"loss": 0.6713, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 1.6610978520286395, |
|
"grad_norm": 0.4015936553478241, |
|
"learning_rate": 8.378635443519327e-05, |
|
"loss": 0.8089, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 1.6706443914081146, |
|
"grad_norm": 0.34833744168281555, |
|
"learning_rate": 8.353923650696118e-05, |
|
"loss": 0.6678, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.6801909307875895, |
|
"grad_norm": 0.35956260561943054, |
|
"learning_rate": 8.329062008274098e-05, |
|
"loss": 0.751, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 1.6897374701670644, |
|
"grad_norm": 0.35701537132263184, |
|
"learning_rate": 8.304051627043953e-05, |
|
"loss": 0.6618, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 1.6992840095465394, |
|
"grad_norm": 0.2929876446723938, |
|
"learning_rate": 8.278893624441848e-05, |
|
"loss": 0.647, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 1.7088305489260143, |
|
"grad_norm": 0.3412924110889435, |
|
"learning_rate": 8.253589124499512e-05, |
|
"loss": 0.6297, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 1.7183770883054894, |
|
"grad_norm": 0.3739149272441864, |
|
"learning_rate": 8.228139257794012e-05, |
|
"loss": 0.7222, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.727923627684964, |
|
"grad_norm": 0.327097624540329, |
|
"learning_rate": 8.202545161397242e-05, |
|
"loss": 0.7254, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 1.7374701670644392, |
|
"grad_norm": 0.4074615240097046, |
|
"learning_rate": 8.176807978825118e-05, |
|
"loss": 0.8142, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 1.747016706443914, |
|
"grad_norm": 0.36452674865722656, |
|
"learning_rate": 8.150928859986488e-05, |
|
"loss": 0.732, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 1.756563245823389, |
|
"grad_norm": 0.41651418805122375, |
|
"learning_rate": 8.124908961131759e-05, |
|
"loss": 0.8458, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 1.766109785202864, |
|
"grad_norm": 0.3859712779521942, |
|
"learning_rate": 8.098749444801224e-05, |
|
"loss": 0.852, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.775656324582339, |
|
"grad_norm": 0.3324730694293976, |
|
"learning_rate": 8.072451479773143e-05, |
|
"loss": 0.6524, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 1.7852028639618138, |
|
"grad_norm": 0.35326018929481506, |
|
"learning_rate": 8.0460162410115e-05, |
|
"loss": 0.6616, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 1.7947494033412887, |
|
"grad_norm": 0.3914497494697571, |
|
"learning_rate": 8.019444909613522e-05, |
|
"loss": 0.8023, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 1.8042959427207639, |
|
"grad_norm": 0.35949957370758057, |
|
"learning_rate": 7.992738672756909e-05, |
|
"loss": 0.6108, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 1.8138424821002386, |
|
"grad_norm": 0.366787314414978, |
|
"learning_rate": 7.965898723646776e-05, |
|
"loss": 0.8388, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.8233890214797137, |
|
"grad_norm": 0.3832429349422455, |
|
"learning_rate": 7.938926261462366e-05, |
|
"loss": 0.7082, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 1.8329355608591884, |
|
"grad_norm": 0.3898661732673645, |
|
"learning_rate": 7.911822491303452e-05, |
|
"loss": 0.7633, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 1.8424821002386635, |
|
"grad_norm": 0.3598467707633972, |
|
"learning_rate": 7.884588624136504e-05, |
|
"loss": 0.6604, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 1.8520286396181385, |
|
"grad_norm": 0.33222588896751404, |
|
"learning_rate": 7.857225876740584e-05, |
|
"loss": 0.7332, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 1.8615751789976134, |
|
"grad_norm": 0.410185307264328, |
|
"learning_rate": 7.829735471652978e-05, |
|
"loss": 0.8056, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.8711217183770883, |
|
"grad_norm": 0.41406041383743286, |
|
"learning_rate": 7.802118637114573e-05, |
|
"loss": 0.8366, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 1.8806682577565632, |
|
"grad_norm": 0.4121192693710327, |
|
"learning_rate": 7.774376607014995e-05, |
|
"loss": 0.8089, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 1.8902147971360383, |
|
"grad_norm": 0.3749670684337616, |
|
"learning_rate": 7.746510620837459e-05, |
|
"loss": 0.7294, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 1.899761336515513, |
|
"grad_norm": 0.42995452880859375, |
|
"learning_rate": 7.718521923603404e-05, |
|
"loss": 0.7776, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 1.9093078758949882, |
|
"grad_norm": 0.36848926544189453, |
|
"learning_rate": 7.690411765816864e-05, |
|
"loss": 0.7549, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.9188544152744629, |
|
"grad_norm": 0.40055760741233826, |
|
"learning_rate": 7.662181403408593e-05, |
|
"loss": 0.6901, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 1.928400954653938, |
|
"grad_norm": 0.38324177265167236, |
|
"learning_rate": 7.633832097679958e-05, |
|
"loss": 0.8203, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 1.937947494033413, |
|
"grad_norm": 0.31426626443862915, |
|
"learning_rate": 7.605365115246581e-05, |
|
"loss": 0.7143, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 1.9474940334128878, |
|
"grad_norm": 0.3708311915397644, |
|
"learning_rate": 7.576781727981749e-05, |
|
"loss": 0.7329, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 1.9570405727923628, |
|
"grad_norm": 0.3401066064834595, |
|
"learning_rate": 7.548083212959588e-05, |
|
"loss": 0.6827, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.9665871121718377, |
|
"grad_norm": 0.32935282588005066, |
|
"learning_rate": 7.519270852398001e-05, |
|
"loss": 0.6928, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 1.9761336515513126, |
|
"grad_norm": 0.30615073442459106, |
|
"learning_rate": 7.490345933601395e-05, |
|
"loss": 0.6326, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 1.9856801909307875, |
|
"grad_norm": 0.37611639499664307, |
|
"learning_rate": 7.461309748903139e-05, |
|
"loss": 0.7814, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 1.9856801909307875, |
|
"eval_loss": 0.7655227780342102, |
|
"eval_runtime": 12.9629, |
|
"eval_samples_per_second": 13.654, |
|
"eval_steps_per_second": 1.774, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 1.9952267303102627, |
|
"grad_norm": 0.4345564842224121, |
|
"learning_rate": 7.432163595607851e-05, |
|
"loss": 0.8073, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 2.0047732696897373, |
|
"grad_norm": 0.3374365270137787, |
|
"learning_rate": 7.402908775933419e-05, |
|
"loss": 0.6948, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 2.0143198090692125, |
|
"grad_norm": 0.34650278091430664, |
|
"learning_rate": 7.373546596952829e-05, |
|
"loss": 0.6353, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 2.023866348448687, |
|
"grad_norm": 0.40124908089637756, |
|
"learning_rate": 7.344078370535757e-05, |
|
"loss": 0.7312, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 2.0334128878281623, |
|
"grad_norm": 0.3434641361236572, |
|
"learning_rate": 7.314505413289964e-05, |
|
"loss": 0.6418, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 2.042959427207637, |
|
"grad_norm": 0.36048009991645813, |
|
"learning_rate": 7.284829046502468e-05, |
|
"loss": 0.7244, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 2.052505966587112, |
|
"grad_norm": 0.37354418635368347, |
|
"learning_rate": 7.255050596080509e-05, |
|
"loss": 0.7364, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 2.0620525059665873, |
|
"grad_norm": 0.3802691698074341, |
|
"learning_rate": 7.225171392492316e-05, |
|
"loss": 0.7152, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 2.071599045346062, |
|
"grad_norm": 0.3605879545211792, |
|
"learning_rate": 7.195192770707654e-05, |
|
"loss": 0.6506, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 2.081145584725537, |
|
"grad_norm": 0.3946097791194916, |
|
"learning_rate": 7.165116070138183e-05, |
|
"loss": 0.715, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 2.090692124105012, |
|
"grad_norm": 0.35205793380737305, |
|
"learning_rate": 7.134942634577614e-05, |
|
"loss": 0.7169, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 2.100238663484487, |
|
"grad_norm": 0.4253597557544708, |
|
"learning_rate": 7.104673812141675e-05, |
|
"loss": 0.7926, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 2.1097852028639617, |
|
"grad_norm": 0.43062886595726013, |
|
"learning_rate": 7.07431095520787e-05, |
|
"loss": 0.6923, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 2.119331742243437, |
|
"grad_norm": 0.38732174038887024, |
|
"learning_rate": 7.04385542035506e-05, |
|
"loss": 0.6641, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 2.128878281622912, |
|
"grad_norm": 0.37736237049102783, |
|
"learning_rate": 7.013308568302854e-05, |
|
"loss": 0.6602, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 2.1384248210023866, |
|
"grad_norm": 0.3910059928894043, |
|
"learning_rate": 6.982671763850814e-05, |
|
"loss": 0.5671, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 2.1479713603818618, |
|
"grad_norm": 0.4084903597831726, |
|
"learning_rate": 6.951946375817474e-05, |
|
"loss": 0.7144, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 2.1575178997613365, |
|
"grad_norm": 0.46185556054115295, |
|
"learning_rate": 6.921133776979186e-05, |
|
"loss": 0.7373, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 2.1670644391408116, |
|
"grad_norm": 0.38187095522880554, |
|
"learning_rate": 6.890235344008781e-05, |
|
"loss": 0.6753, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 2.1766109785202863, |
|
"grad_norm": 0.43176108598709106, |
|
"learning_rate": 6.859252457414067e-05, |
|
"loss": 0.683, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 2.1861575178997614, |
|
"grad_norm": 0.43367186188697815, |
|
"learning_rate": 6.828186501476144e-05, |
|
"loss": 0.7548, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 2.195704057279236, |
|
"grad_norm": 0.3524869978427887, |
|
"learning_rate": 6.797038864187564e-05, |
|
"loss": 0.6032, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 2.2052505966587113, |
|
"grad_norm": 0.40967708826065063, |
|
"learning_rate": 6.765810937190306e-05, |
|
"loss": 0.6378, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 2.214797136038186, |
|
"grad_norm": 0.3816625773906708, |
|
"learning_rate": 6.734504115713604e-05, |
|
"loss": 0.623, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 2.224343675417661, |
|
"grad_norm": 0.5187081694602966, |
|
"learning_rate": 6.703119798511612e-05, |
|
"loss": 0.838, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 2.2338902147971362, |
|
"grad_norm": 0.4348510503768921, |
|
"learning_rate": 6.67165938780091e-05, |
|
"loss": 0.7311, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 2.243436754176611, |
|
"grad_norm": 0.3842396140098572, |
|
"learning_rate": 6.640124289197845e-05, |
|
"loss": 0.6527, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 2.252983293556086, |
|
"grad_norm": 0.4316810965538025, |
|
"learning_rate": 6.608515911655744e-05, |
|
"loss": 0.7794, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 2.2625298329355608, |
|
"grad_norm": 0.3772410452365875, |
|
"learning_rate": 6.576835667401953e-05, |
|
"loss": 0.6369, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 2.272076372315036, |
|
"grad_norm": 0.4141497015953064, |
|
"learning_rate": 6.545084971874738e-05, |
|
"loss": 0.6993, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 2.2816229116945106, |
|
"grad_norm": 0.41354265809059143, |
|
"learning_rate": 6.513265243660057e-05, |
|
"loss": 0.6389, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 2.2911694510739857, |
|
"grad_norm": 0.3867475986480713, |
|
"learning_rate": 6.481377904428171e-05, |
|
"loss": 0.6306, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 2.300715990453461, |
|
"grad_norm": 0.40222805738449097, |
|
"learning_rate": 6.449424378870123e-05, |
|
"loss": 0.6857, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 2.3102625298329356, |
|
"grad_norm": 0.4215107858181, |
|
"learning_rate": 6.41740609463409e-05, |
|
"loss": 0.7309, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 2.3198090692124103, |
|
"grad_norm": 0.4149893820285797, |
|
"learning_rate": 6.385324482261597e-05, |
|
"loss": 0.6562, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 2.3293556085918854, |
|
"grad_norm": 0.4119661748409271, |
|
"learning_rate": 6.353180975123595e-05, |
|
"loss": 0.7544, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 2.3389021479713605, |
|
"grad_norm": 0.32324427366256714, |
|
"learning_rate": 6.320977009356431e-05, |
|
"loss": 0.5994, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 2.3484486873508352, |
|
"grad_norm": 0.4508344531059265, |
|
"learning_rate": 6.288714023797672e-05, |
|
"loss": 0.7047, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 2.3579952267303104, |
|
"grad_norm": 0.3957417905330658, |
|
"learning_rate": 6.256393459921824e-05, |
|
"loss": 0.6364, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 2.367541766109785, |
|
"grad_norm": 0.4180348813533783, |
|
"learning_rate": 6.224016761775933e-05, |
|
"loss": 0.6511, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 2.37708830548926, |
|
"grad_norm": 0.46107926964759827, |
|
"learning_rate": 6.191585375915055e-05, |
|
"loss": 0.6736, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 2.386634844868735, |
|
"grad_norm": 0.43949881196022034, |
|
"learning_rate": 6.159100751337642e-05, |
|
"loss": 0.6639, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.39618138424821, |
|
"grad_norm": 0.4427139461040497, |
|
"learning_rate": 6.126564339420784e-05, |
|
"loss": 0.6581, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 2.405727923627685, |
|
"grad_norm": 0.4241901636123657, |
|
"learning_rate": 6.093977593855375e-05, |
|
"loss": 0.6738, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 2.41527446300716, |
|
"grad_norm": 0.41828441619873047, |
|
"learning_rate": 6.061341970581165e-05, |
|
"loss": 0.685, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 2.424821002386635, |
|
"grad_norm": 0.4712134599685669, |
|
"learning_rate": 6.028658927721697e-05, |
|
"loss": 0.6853, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 2.4343675417661097, |
|
"grad_norm": 0.47678568959236145, |
|
"learning_rate": 5.99592992551918e-05, |
|
"loss": 0.673, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 2.443914081145585, |
|
"grad_norm": 0.46318480372428894, |
|
"learning_rate": 5.9631564262692274e-05, |
|
"loss": 0.688, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 2.4534606205250595, |
|
"grad_norm": 0.4256531000137329, |
|
"learning_rate": 5.930339894255532e-05, |
|
"loss": 0.6521, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 2.4630071599045347, |
|
"grad_norm": 0.39636510610580444, |
|
"learning_rate": 5.897481795684446e-05, |
|
"loss": 0.6713, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 2.4725536992840094, |
|
"grad_norm": 0.497344434261322, |
|
"learning_rate": 5.8645835986194676e-05, |
|
"loss": 0.7745, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 2.4821002386634845, |
|
"grad_norm": 0.4814034104347229, |
|
"learning_rate": 5.831646772915651e-05, |
|
"loss": 0.6849, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 2.4821002386634845, |
|
"eval_loss": 0.7669724225997925, |
|
"eval_runtime": 13.0227, |
|
"eval_samples_per_second": 13.592, |
|
"eval_steps_per_second": 1.766, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 2.491646778042959, |
|
"grad_norm": 0.4661683142185211, |
|
"learning_rate": 5.7986727901539374e-05, |
|
"loss": 0.7284, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 2.5011933174224343, |
|
"grad_norm": 0.4617115557193756, |
|
"learning_rate": 5.7656631235754014e-05, |
|
"loss": 0.7026, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 2.5107398568019095, |
|
"grad_norm": 0.48742860555648804, |
|
"learning_rate": 5.732619248015434e-05, |
|
"loss": 0.6964, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 2.520286396181384, |
|
"grad_norm": 0.4442538917064667, |
|
"learning_rate": 5.699542639837844e-05, |
|
"loss": 0.6804, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 2.5298329355608593, |
|
"grad_norm": 0.49106183648109436, |
|
"learning_rate": 5.666434776868895e-05, |
|
"loss": 0.6865, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 2.539379474940334, |
|
"grad_norm": 0.4551374316215515, |
|
"learning_rate": 5.633297138331285e-05, |
|
"loss": 0.7748, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 2.548926014319809, |
|
"grad_norm": 0.413185715675354, |
|
"learning_rate": 5.6001312047780486e-05, |
|
"loss": 0.5999, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 2.558472553699284, |
|
"grad_norm": 0.44866228103637695, |
|
"learning_rate": 5.5669384580264104e-05, |
|
"loss": 0.6102, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 2.568019093078759, |
|
"grad_norm": 0.45962679386138916, |
|
"learning_rate": 5.533720381091582e-05, |
|
"loss": 0.7214, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 2.577565632458234, |
|
"grad_norm": 0.45130231976509094, |
|
"learning_rate": 5.5004784581204927e-05, |
|
"loss": 0.7123, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 2.587112171837709, |
|
"grad_norm": 0.44015398621559143, |
|
"learning_rate": 5.467214174325493e-05, |
|
"loss": 0.6339, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 2.5966587112171835, |
|
"grad_norm": 0.45853525400161743, |
|
"learning_rate": 5.4339290159179875e-05, |
|
"loss": 0.7224, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 2.6062052505966586, |
|
"grad_norm": 0.45820721983909607, |
|
"learning_rate": 5.400624470042037e-05, |
|
"loss": 0.638, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 2.615751789976134, |
|
"grad_norm": 0.4199580252170563, |
|
"learning_rate": 5.367302024707911e-05, |
|
"loss": 0.6548, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 2.6252983293556085, |
|
"grad_norm": 0.5202221870422363, |
|
"learning_rate": 5.3339631687256084e-05, |
|
"loss": 0.7328, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 2.6348448687350836, |
|
"grad_norm": 0.41012313961982727, |
|
"learning_rate": 5.300609391638336e-05, |
|
"loss": 0.6315, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 2.6443914081145583, |
|
"grad_norm": 0.5391274094581604, |
|
"learning_rate": 5.267242183655961e-05, |
|
"loss": 0.7476, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 2.6539379474940334, |
|
"grad_norm": 0.4461412727832794, |
|
"learning_rate": 5.233863035588426e-05, |
|
"loss": 0.6223, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 2.663484486873508, |
|
"grad_norm": 0.4335575997829437, |
|
"learning_rate": 5.200473438779146e-05, |
|
"loss": 0.6816, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 2.6730310262529833, |
|
"grad_norm": 0.44996485114097595, |
|
"learning_rate": 5.167074885038373e-05, |
|
"loss": 0.6554, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 2.6825775656324584, |
|
"grad_norm": 0.4875689744949341, |
|
"learning_rate": 5.133668866576544e-05, |
|
"loss": 0.7889, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 2.692124105011933, |
|
"grad_norm": 0.45980238914489746, |
|
"learning_rate": 5.1002568759376134e-05, |
|
"loss": 0.6615, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 2.7016706443914082, |
|
"grad_norm": 0.43575319647789, |
|
"learning_rate": 5.0668404059323635e-05, |
|
"loss": 0.6595, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 2.711217183770883, |
|
"grad_norm": 0.438365638256073, |
|
"learning_rate": 5.033420949571712e-05, |
|
"loss": 0.6354, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 2.720763723150358, |
|
"grad_norm": 0.4110269546508789, |
|
"learning_rate": 5e-05, |
|
"loss": 0.6814, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 2.7303102625298328, |
|
"grad_norm": 0.4653089642524719, |
|
"learning_rate": 4.96657905042829e-05, |
|
"loss": 0.7375, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 2.739856801909308, |
|
"grad_norm": 0.45703983306884766, |
|
"learning_rate": 4.933159594067637e-05, |
|
"loss": 0.7464, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 2.749403341288783, |
|
"grad_norm": 0.40393638610839844, |
|
"learning_rate": 4.899743124062388e-05, |
|
"loss": 0.6091, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 2.7589498806682577, |
|
"grad_norm": 0.468176007270813, |
|
"learning_rate": 4.866331133423456e-05, |
|
"loss": 0.7052, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 2.7684964200477324, |
|
"grad_norm": 0.4771003723144531, |
|
"learning_rate": 4.832925114961629e-05, |
|
"loss": 0.7071, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 2.7780429594272076, |
|
"grad_norm": 0.4809763729572296, |
|
"learning_rate": 4.799526561220855e-05, |
|
"loss": 0.7289, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 2.7875894988066827, |
|
"grad_norm": 0.46262210607528687, |
|
"learning_rate": 4.7661369644115755e-05, |
|
"loss": 0.6031, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 2.7971360381861574, |
|
"grad_norm": 0.48082613945007324, |
|
"learning_rate": 4.73275781634404e-05, |
|
"loss": 0.7008, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 2.8066825775656326, |
|
"grad_norm": 0.5203619003295898, |
|
"learning_rate": 4.6993906083616654e-05, |
|
"loss": 0.6972, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 2.8162291169451072, |
|
"grad_norm": 0.5060526728630066, |
|
"learning_rate": 4.666036831274392e-05, |
|
"loss": 0.844, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 2.8257756563245824, |
|
"grad_norm": 0.5173178911209106, |
|
"learning_rate": 4.63269797529209e-05, |
|
"loss": 0.711, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 2.835322195704057, |
|
"grad_norm": 0.40481331944465637, |
|
"learning_rate": 4.5993755299579626e-05, |
|
"loss": 0.6849, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 2.844868735083532, |
|
"grad_norm": 0.5248334407806396, |
|
"learning_rate": 4.566070984082013e-05, |
|
"loss": 0.8126, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 2.8544152744630074, |
|
"grad_norm": 0.469752699136734, |
|
"learning_rate": 4.5327858256745073e-05, |
|
"loss": 0.7265, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 2.863961813842482, |
|
"grad_norm": 0.4676867425441742, |
|
"learning_rate": 4.4995215418795085e-05, |
|
"loss": 0.7859, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.873508353221957, |
|
"grad_norm": 0.43738093972206116, |
|
"learning_rate": 4.466279618908419e-05, |
|
"loss": 0.5927, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 2.883054892601432, |
|
"grad_norm": 0.4657769799232483, |
|
"learning_rate": 4.433061541973591e-05, |
|
"loss": 0.6947, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 2.892601431980907, |
|
"grad_norm": 0.44069626927375793, |
|
"learning_rate": 4.399868795221951e-05, |
|
"loss": 0.6984, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 2.9021479713603817, |
|
"grad_norm": 0.3833423852920532, |
|
"learning_rate": 4.366702861668716e-05, |
|
"loss": 0.5624, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 2.911694510739857, |
|
"grad_norm": 0.42630308866500854, |
|
"learning_rate": 4.333565223131107e-05, |
|
"loss": 0.5505, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 2.921241050119332, |
|
"grad_norm": 0.46541622281074524, |
|
"learning_rate": 4.300457360162158e-05, |
|
"loss": 0.7071, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 2.9307875894988067, |
|
"grad_norm": 0.41140827536582947, |
|
"learning_rate": 4.267380751984568e-05, |
|
"loss": 0.5786, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 2.9403341288782814, |
|
"grad_norm": 0.42655548453330994, |
|
"learning_rate": 4.2343368764246e-05, |
|
"loss": 0.5977, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 2.9498806682577565, |
|
"grad_norm": 0.45397818088531494, |
|
"learning_rate": 4.201327209846065e-05, |
|
"loss": 0.6592, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 2.9594272076372317, |
|
"grad_norm": 0.5159749984741211, |
|
"learning_rate": 4.1683532270843504e-05, |
|
"loss": 0.7322, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 2.9689737470167064, |
|
"grad_norm": 0.4518764615058899, |
|
"learning_rate": 4.135416401380535e-05, |
|
"loss": 0.796, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 2.9785202863961815, |
|
"grad_norm": 0.47323092818260193, |
|
"learning_rate": 4.102518204315555e-05, |
|
"loss": 0.7456, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 2.9785202863961815, |
|
"eval_loss": 0.7645925879478455, |
|
"eval_runtime": 13.0059, |
|
"eval_samples_per_second": 13.609, |
|
"eval_steps_per_second": 1.768, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 2.988066825775656, |
|
"grad_norm": 0.49233347177505493, |
|
"learning_rate": 4.069660105744469e-05, |
|
"loss": 0.6878, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 2.9976133651551313, |
|
"grad_norm": 0.4664769470691681, |
|
"learning_rate": 4.036843573730774e-05, |
|
"loss": 0.7379, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 3.007159904534606, |
|
"grad_norm": 0.39737001061439514, |
|
"learning_rate": 4.0040700744808204e-05, |
|
"loss": 0.6706, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 3.016706443914081, |
|
"grad_norm": 0.46519792079925537, |
|
"learning_rate": 3.971341072278302e-05, |
|
"loss": 0.6626, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 3.0262529832935563, |
|
"grad_norm": 0.4189288914203644, |
|
"learning_rate": 3.938658029418837e-05, |
|
"loss": 0.6465, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 3.035799522673031, |
|
"grad_norm": 0.5005217790603638, |
|
"learning_rate": 3.9060224061446245e-05, |
|
"loss": 0.6552, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 3.045346062052506, |
|
"grad_norm": 0.4551185965538025, |
|
"learning_rate": 3.873435660579217e-05, |
|
"loss": 0.5767, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 3.054892601431981, |
|
"grad_norm": 0.44225507974624634, |
|
"learning_rate": 3.840899248662358e-05, |
|
"loss": 0.6387, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 3.064439140811456, |
|
"grad_norm": 0.5424272418022156, |
|
"learning_rate": 3.808414624084946e-05, |
|
"loss": 0.6887, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 3.0739856801909307, |
|
"grad_norm": 0.4525783658027649, |
|
"learning_rate": 3.77598323822407e-05, |
|
"loss": 0.6612, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 3.083532219570406, |
|
"grad_norm": 0.4461750388145447, |
|
"learning_rate": 3.7436065400781774e-05, |
|
"loss": 0.6418, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 3.0930787589498805, |
|
"grad_norm": 0.524474561214447, |
|
"learning_rate": 3.7112859762023314e-05, |
|
"loss": 0.6662, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 3.1026252983293556, |
|
"grad_norm": 0.5077919960021973, |
|
"learning_rate": 3.6790229906435705e-05, |
|
"loss": 0.6368, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 3.1121718377088303, |
|
"grad_norm": 0.5111605525016785, |
|
"learning_rate": 3.646819024876406e-05, |
|
"loss": 0.6471, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 3.1217183770883055, |
|
"grad_norm": 0.47017544507980347, |
|
"learning_rate": 3.614675517738405e-05, |
|
"loss": 0.6412, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 3.1312649164677806, |
|
"grad_norm": 0.5017347931861877, |
|
"learning_rate": 3.5825939053659116e-05, |
|
"loss": 0.6719, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 3.1408114558472553, |
|
"grad_norm": 0.4742693305015564, |
|
"learning_rate": 3.550575621129878e-05, |
|
"loss": 0.6389, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 3.1503579952267304, |
|
"grad_norm": 0.48239150643348694, |
|
"learning_rate": 3.5186220955718306e-05, |
|
"loss": 0.6071, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 3.159904534606205, |
|
"grad_norm": 0.461012065410614, |
|
"learning_rate": 3.486734756339943e-05, |
|
"loss": 0.5409, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 3.1694510739856803, |
|
"grad_norm": 0.5519300103187561, |
|
"learning_rate": 3.4549150281252636e-05, |
|
"loss": 0.6617, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 3.178997613365155, |
|
"grad_norm": 0.4272817373275757, |
|
"learning_rate": 3.423164332598049e-05, |
|
"loss": 0.4943, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 3.18854415274463, |
|
"grad_norm": 0.5129975080490112, |
|
"learning_rate": 3.391484088344257e-05, |
|
"loss": 0.6569, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 3.1980906921241052, |
|
"grad_norm": 0.5243316292762756, |
|
"learning_rate": 3.3598757108021546e-05, |
|
"loss": 0.6659, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 3.20763723150358, |
|
"grad_norm": 0.4862256348133087, |
|
"learning_rate": 3.3283406121990915e-05, |
|
"loss": 0.6414, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 3.217183770883055, |
|
"grad_norm": 0.5197688937187195, |
|
"learning_rate": 3.2968802014883874e-05, |
|
"loss": 0.6751, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 3.2267303102625298, |
|
"grad_norm": 0.5474854111671448, |
|
"learning_rate": 3.265495884286397e-05, |
|
"loss": 0.7531, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 3.236276849642005, |
|
"grad_norm": 0.47088584303855896, |
|
"learning_rate": 3.234189062809695e-05, |
|
"loss": 0.6319, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 3.2458233890214796, |
|
"grad_norm": 0.4922322630882263, |
|
"learning_rate": 3.202961135812437e-05, |
|
"loss": 0.6297, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 3.2553699284009547, |
|
"grad_norm": 0.5747278332710266, |
|
"learning_rate": 3.1718134985238567e-05, |
|
"loss": 0.6269, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 3.2649164677804294, |
|
"grad_norm": 0.5492677092552185, |
|
"learning_rate": 3.1407475425859345e-05, |
|
"loss": 0.7295, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 3.2744630071599046, |
|
"grad_norm": 0.487875759601593, |
|
"learning_rate": 3.109764655991221e-05, |
|
"loss": 0.6013, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 3.2840095465393793, |
|
"grad_norm": 0.5148051381111145, |
|
"learning_rate": 3.078866223020815e-05, |
|
"loss": 0.7156, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 3.2935560859188544, |
|
"grad_norm": 0.4531029760837555, |
|
"learning_rate": 3.0480536241825263e-05, |
|
"loss": 0.581, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 3.3031026252983295, |
|
"grad_norm": 0.4673330783843994, |
|
"learning_rate": 3.0173282361491868e-05, |
|
"loss": 0.5355, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 3.3126491646778042, |
|
"grad_norm": 0.4933515191078186, |
|
"learning_rate": 2.9866914316971477e-05, |
|
"loss": 0.5584, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 3.3221957040572794, |
|
"grad_norm": 0.5033287405967712, |
|
"learning_rate": 2.9561445796449415e-05, |
|
"loss": 0.6598, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 3.331742243436754, |
|
"grad_norm": 0.5585161447525024, |
|
"learning_rate": 2.925689044792132e-05, |
|
"loss": 0.7268, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 3.341288782816229, |
|
"grad_norm": 0.4780008792877197, |
|
"learning_rate": 2.895326187858326e-05, |
|
"loss": 0.6363, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 3.350835322195704, |
|
"grad_norm": 0.5018965005874634, |
|
"learning_rate": 2.865057365422386e-05, |
|
"loss": 0.6098, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 3.360381861575179, |
|
"grad_norm": 0.4768741726875305, |
|
"learning_rate": 2.8348839298618178e-05, |
|
"loss": 0.6504, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 3.369928400954654, |
|
"grad_norm": 0.47006916999816895, |
|
"learning_rate": 2.8048072292923465e-05, |
|
"loss": 0.7122, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 3.379474940334129, |
|
"grad_norm": 0.5839781165122986, |
|
"learning_rate": 2.7748286075076835e-05, |
|
"loss": 0.688, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 3.389021479713604, |
|
"grad_norm": 0.48642516136169434, |
|
"learning_rate": 2.74494940391949e-05, |
|
"loss": 0.5968, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 3.3985680190930787, |
|
"grad_norm": 0.5563519597053528, |
|
"learning_rate": 2.7151709534975324e-05, |
|
"loss": 0.6904, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 3.408114558472554, |
|
"grad_norm": 0.489467591047287, |
|
"learning_rate": 2.685494586710038e-05, |
|
"loss": 0.6965, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 3.4176610978520285, |
|
"grad_norm": 0.5098894238471985, |
|
"learning_rate": 2.655921629464245e-05, |
|
"loss": 0.5593, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 3.4272076372315037, |
|
"grad_norm": 0.5460035800933838, |
|
"learning_rate": 2.626453403047172e-05, |
|
"loss": 0.7281, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 3.4367541766109784, |
|
"grad_norm": 0.5174149870872498, |
|
"learning_rate": 2.5970912240665813e-05, |
|
"loss": 0.6505, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 3.4463007159904535, |
|
"grad_norm": 0.5922746658325195, |
|
"learning_rate": 2.5678364043921504e-05, |
|
"loss": 0.7072, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 3.455847255369928, |
|
"grad_norm": 0.501711368560791, |
|
"learning_rate": 2.5386902510968625e-05, |
|
"loss": 0.7021, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 3.4653937947494033, |
|
"grad_norm": 0.551275372505188, |
|
"learning_rate": 2.5096540663986067e-05, |
|
"loss": 0.6635, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 3.4749403341288785, |
|
"grad_norm": 0.510543942451477, |
|
"learning_rate": 2.4807291476019995e-05, |
|
"loss": 0.6037, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 3.4749403341288785, |
|
"eval_loss": 0.7706080079078674, |
|
"eval_runtime": 12.9841, |
|
"eval_samples_per_second": 13.632, |
|
"eval_steps_per_second": 1.771, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 3.484486873508353, |
|
"grad_norm": 0.5204513072967529, |
|
"learning_rate": 2.4519167870404125e-05, |
|
"loss": 0.6473, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 3.4940334128878283, |
|
"grad_norm": 0.5144868493080139, |
|
"learning_rate": 2.4232182720182522e-05, |
|
"loss": 0.5738, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 3.503579952267303, |
|
"grad_norm": 0.4935360848903656, |
|
"learning_rate": 2.3946348847534194e-05, |
|
"loss": 0.6041, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 3.513126491646778, |
|
"grad_norm": 0.5450278520584106, |
|
"learning_rate": 2.3661679023200422e-05, |
|
"loss": 0.6976, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 3.522673031026253, |
|
"grad_norm": 0.5161455869674683, |
|
"learning_rate": 2.337818596591408e-05, |
|
"loss": 0.7566, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 3.532219570405728, |
|
"grad_norm": 0.47960028052330017, |
|
"learning_rate": 2.3095882341831372e-05, |
|
"loss": 0.6354, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 3.541766109785203, |
|
"grad_norm": 0.46854427456855774, |
|
"learning_rate": 2.281478076396596e-05, |
|
"loss": 0.6636, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 3.551312649164678, |
|
"grad_norm": 0.49230995774269104, |
|
"learning_rate": 2.2534893791625406e-05, |
|
"loss": 0.5336, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 3.5608591885441525, |
|
"grad_norm": 0.475827693939209, |
|
"learning_rate": 2.2256233929850044e-05, |
|
"loss": 0.6397, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 3.5704057279236276, |
|
"grad_norm": 0.5518814921379089, |
|
"learning_rate": 2.197881362885426e-05, |
|
"loss": 0.6751, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 3.579952267303103, |
|
"grad_norm": 0.5297905802726746, |
|
"learning_rate": 2.1702645283470236e-05, |
|
"loss": 0.6379, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 3.5894988066825775, |
|
"grad_norm": 0.5582184195518494, |
|
"learning_rate": 2.1427741232594184e-05, |
|
"loss": 0.6456, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 3.5990453460620526, |
|
"grad_norm": 0.6382442712783813, |
|
"learning_rate": 2.115411375863497e-05, |
|
"loss": 0.7669, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 3.6085918854415273, |
|
"grad_norm": 0.5114546418190002, |
|
"learning_rate": 2.0881775086965495e-05, |
|
"loss": 0.6209, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 3.6181384248210025, |
|
"grad_norm": 0.5362977981567383, |
|
"learning_rate": 2.061073738537635e-05, |
|
"loss": 0.5689, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 3.627684964200477, |
|
"grad_norm": 0.5577702522277832, |
|
"learning_rate": 2.0341012763532243e-05, |
|
"loss": 0.68, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 3.6372315035799523, |
|
"grad_norm": 0.4712672233581543, |
|
"learning_rate": 2.0072613272430923e-05, |
|
"loss": 0.6012, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 3.6467780429594274, |
|
"grad_norm": 0.5355048179626465, |
|
"learning_rate": 1.9805550903864774e-05, |
|
"loss": 0.6409, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 3.656324582338902, |
|
"grad_norm": 0.5459690093994141, |
|
"learning_rate": 1.9539837589885024e-05, |
|
"loss": 0.615, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 3.665871121718377, |
|
"grad_norm": 0.5169499516487122, |
|
"learning_rate": 1.9275485202268572e-05, |
|
"loss": 0.5431, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 3.675417661097852, |
|
"grad_norm": 0.4908435344696045, |
|
"learning_rate": 1.9012505551987765e-05, |
|
"loss": 0.5995, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 3.684964200477327, |
|
"grad_norm": 0.5475894808769226, |
|
"learning_rate": 1.875091038868243e-05, |
|
"loss": 0.5709, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 3.694510739856802, |
|
"grad_norm": 0.4647465944290161, |
|
"learning_rate": 1.8490711400135118e-05, |
|
"loss": 0.6245, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 3.704057279236277, |
|
"grad_norm": 0.5150587558746338, |
|
"learning_rate": 1.823192021174882e-05, |
|
"loss": 0.6999, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 3.713603818615752, |
|
"grad_norm": 0.5281825661659241, |
|
"learning_rate": 1.7974548386027585e-05, |
|
"loss": 0.6953, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 3.7231503579952268, |
|
"grad_norm": 0.5862619876861572, |
|
"learning_rate": 1.771860742205988e-05, |
|
"loss": 0.7626, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 3.7326968973747015, |
|
"grad_norm": 0.5895105600357056, |
|
"learning_rate": 1.746410875500488e-05, |
|
"loss": 0.7215, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 3.7422434367541766, |
|
"grad_norm": 0.5397690534591675, |
|
"learning_rate": 1.7211063755581525e-05, |
|
"loss": 0.6534, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 3.7517899761336517, |
|
"grad_norm": 0.5140255689620972, |
|
"learning_rate": 1.695948372956047e-05, |
|
"loss": 0.6316, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 3.7613365155131264, |
|
"grad_norm": 0.555587887763977, |
|
"learning_rate": 1.6709379917259028e-05, |
|
"loss": 0.5928, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 3.7708830548926016, |
|
"grad_norm": 0.5376729965209961, |
|
"learning_rate": 1.646076349303884e-05, |
|
"loss": 0.7066, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 3.7804295942720763, |
|
"grad_norm": 0.5037944912910461, |
|
"learning_rate": 1.621364556480675e-05, |
|
"loss": 0.7138, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 3.7899761336515514, |
|
"grad_norm": 0.5480453372001648, |
|
"learning_rate": 1.596803717351845e-05, |
|
"loss": 0.6835, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 3.799522673031026, |
|
"grad_norm": 0.5427130460739136, |
|
"learning_rate": 1.5723949292685192e-05, |
|
"loss": 0.645, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 3.8090692124105012, |
|
"grad_norm": 0.5332582592964172, |
|
"learning_rate": 1.548139282788349e-05, |
|
"loss": 0.6536, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 3.8186157517899764, |
|
"grad_norm": 0.45773863792419434, |
|
"learning_rate": 1.5240378616267886e-05, |
|
"loss": 0.4132, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 3.828162291169451, |
|
"grad_norm": 0.5269774794578552, |
|
"learning_rate": 1.5000917426086768e-05, |
|
"loss": 0.709, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 3.8377088305489258, |
|
"grad_norm": 0.5353370904922485, |
|
"learning_rate": 1.4763019956201252e-05, |
|
"loss": 0.6245, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 3.847255369928401, |
|
"grad_norm": 0.5651338696479797, |
|
"learning_rate": 1.452669683560709e-05, |
|
"loss": 0.7354, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 3.856801909307876, |
|
"grad_norm": 0.5225863456726074, |
|
"learning_rate": 1.4291958622959973e-05, |
|
"loss": 0.6269, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 3.8663484486873507, |
|
"grad_norm": 0.4741249680519104, |
|
"learning_rate": 1.4058815806103542e-05, |
|
"loss": 0.6367, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 3.875894988066826, |
|
"grad_norm": 0.4927036166191101, |
|
"learning_rate": 1.3827278801600979e-05, |
|
"loss": 0.5917, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 3.8854415274463006, |
|
"grad_norm": 0.528948187828064, |
|
"learning_rate": 1.3597357954269535e-05, |
|
"loss": 0.6245, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 3.8949880668257757, |
|
"grad_norm": 0.499723881483078, |
|
"learning_rate": 1.3369063536718345e-05, |
|
"loss": 0.6612, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 3.9045346062052504, |
|
"grad_norm": 0.5520623326301575, |
|
"learning_rate": 1.3142405748889457e-05, |
|
"loss": 0.6276, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 3.9140811455847255, |
|
"grad_norm": 0.519705593585968, |
|
"learning_rate": 1.2917394717602121e-05, |
|
"loss": 0.6639, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 3.9236276849642007, |
|
"grad_norm": 0.5106028318405151, |
|
"learning_rate": 1.2694040496100318e-05, |
|
"loss": 0.6402, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 3.9331742243436754, |
|
"grad_norm": 0.5074647665023804, |
|
"learning_rate": 1.2472353063603625e-05, |
|
"loss": 0.6424, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 3.9427207637231505, |
|
"grad_norm": 0.594458281993866, |
|
"learning_rate": 1.2252342324861272e-05, |
|
"loss": 0.6677, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 3.952267303102625, |
|
"grad_norm": 0.5148235559463501, |
|
"learning_rate": 1.2034018109709716e-05, |
|
"loss": 0.5508, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 3.9618138424821003, |
|
"grad_norm": 0.5786213278770447, |
|
"learning_rate": 1.1817390172633403e-05, |
|
"loss": 0.5669, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 3.971360381861575, |
|
"grad_norm": 0.5178529024124146, |
|
"learning_rate": 1.1602468192328936e-05, |
|
"loss": 0.6335, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 3.971360381861575, |
|
"eval_loss": 0.7703084945678711, |
|
"eval_runtime": 13.1275, |
|
"eval_samples_per_second": 13.483, |
|
"eval_steps_per_second": 1.752, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 3.98090692124105, |
|
"grad_norm": 0.6362994313240051, |
|
"learning_rate": 1.1389261771272663e-05, |
|
"loss": 0.667, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 3.9904534606205253, |
|
"grad_norm": 0.46682727336883545, |
|
"learning_rate": 1.117778043529164e-05, |
|
"loss": 0.5675, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.5248848795890808, |
|
"learning_rate": 1.096803363313803e-05, |
|
"loss": 0.6431, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 4.009546539379475, |
|
"grad_norm": 0.4764450192451477, |
|
"learning_rate": 1.0760030736066951e-05, |
|
"loss": 0.5765, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 4.01909307875895, |
|
"grad_norm": 0.5138590335845947, |
|
"learning_rate": 1.055378103741777e-05, |
|
"loss": 0.5938, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 4.028639618138425, |
|
"grad_norm": 0.560528039932251, |
|
"learning_rate": 1.034929375219884e-05, |
|
"loss": 0.7372, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 4.0381861575179, |
|
"grad_norm": 0.5144191384315491, |
|
"learning_rate": 1.0146578016675934e-05, |
|
"loss": 0.6935, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 4.047732696897374, |
|
"grad_norm": 0.4730660915374756, |
|
"learning_rate": 9.945642887963841e-06, |
|
"loss": 0.5619, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 4.05727923627685, |
|
"grad_norm": 0.5379086136817932, |
|
"learning_rate": 9.746497343621857e-06, |
|
"loss": 0.5955, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 4.066825775656325, |
|
"grad_norm": 0.46496352553367615, |
|
"learning_rate": 9.549150281252633e-06, |
|
"loss": 0.5427, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 4.076372315035799, |
|
"grad_norm": 0.4938548803329468, |
|
"learning_rate": 9.353610518104611e-06, |
|
"loss": 0.6966, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 4.085918854415274, |
|
"grad_norm": 0.5221308469772339, |
|
"learning_rate": 9.159886790678124e-06, |
|
"loss": 0.6781, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 4.09546539379475, |
|
"grad_norm": 0.4903409779071808, |
|
"learning_rate": 8.967987754335022e-06, |
|
"loss": 0.6055, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 4.105011933174224, |
|
"grad_norm": 0.5302107930183411, |
|
"learning_rate": 8.777921982911996e-06, |
|
"loss": 0.5903, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 4.114558472553699, |
|
"grad_norm": 0.4461494982242584, |
|
"learning_rate": 8.589697968337446e-06, |
|
"loss": 0.5499, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 4.124105011933175, |
|
"grad_norm": 0.6021783351898193, |
|
"learning_rate": 8.40332412025216e-06, |
|
"loss": 0.6363, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 4.133651551312649, |
|
"grad_norm": 0.5821279883384705, |
|
"learning_rate": 8.218808765633512e-06, |
|
"loss": 0.5479, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 4.143198090692124, |
|
"grad_norm": 0.5231468677520752, |
|
"learning_rate": 8.036160148423449e-06, |
|
"loss": 0.6066, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 4.152744630071599, |
|
"grad_norm": 0.5418100357055664, |
|
"learning_rate": 7.85538642916015e-06, |
|
"loss": 0.6153, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 4.162291169451074, |
|
"grad_norm": 0.5310184359550476, |
|
"learning_rate": 7.676495684613432e-06, |
|
"loss": 0.541, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 4.171837708830549, |
|
"grad_norm": 0.5160778164863586, |
|
"learning_rate": 7.499495907423887e-06, |
|
"loss": 0.6041, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 4.181384248210024, |
|
"grad_norm": 0.5340070128440857, |
|
"learning_rate": 7.324395005745771e-06, |
|
"loss": 0.6006, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 4.190930787589499, |
|
"grad_norm": 0.5116534233093262, |
|
"learning_rate": 7.151200802893682e-06, |
|
"loss": 0.6093, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 4.200477326968974, |
|
"grad_norm": 0.5754937529563904, |
|
"learning_rate": 6.979921036993042e-06, |
|
"loss": 0.6219, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 4.210023866348449, |
|
"grad_norm": 0.6205330491065979, |
|
"learning_rate": 6.810563360634298e-06, |
|
"loss": 0.6091, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 4.219570405727923, |
|
"grad_norm": 0.47774946689605713, |
|
"learning_rate": 6.643135340531137e-06, |
|
"loss": 0.5943, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 4.229116945107399, |
|
"grad_norm": 0.5198448896408081, |
|
"learning_rate": 6.477644457182275e-06, |
|
"loss": 0.711, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 4.238663484486874, |
|
"grad_norm": 0.548129141330719, |
|
"learning_rate": 6.314098104537325e-06, |
|
"loss": 0.5444, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 4.248210023866348, |
|
"grad_norm": 0.549915611743927, |
|
"learning_rate": 6.152503589666425e-06, |
|
"loss": 0.6128, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 4.257756563245824, |
|
"grad_norm": 0.473899245262146, |
|
"learning_rate": 5.992868132433754e-06, |
|
"loss": 0.5761, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 4.2673031026252985, |
|
"grad_norm": 0.6241645216941833, |
|
"learning_rate": 5.835198865174956e-06, |
|
"loss": 0.6581, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 4.276849642004773, |
|
"grad_norm": 0.5191392302513123, |
|
"learning_rate": 5.679502832378497e-06, |
|
"loss": 0.6428, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 4.286396181384248, |
|
"grad_norm": 0.5194590091705322, |
|
"learning_rate": 5.5257869903709015e-06, |
|
"loss": 0.6437, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 4.2959427207637235, |
|
"grad_norm": 0.565933108329773, |
|
"learning_rate": 5.374058207005944e-06, |
|
"loss": 0.6357, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 4.305489260143198, |
|
"grad_norm": 0.5410757064819336, |
|
"learning_rate": 5.224323261357844e-06, |
|
"loss": 0.6178, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 4.315035799522673, |
|
"grad_norm": 0.5989289283752441, |
|
"learning_rate": 5.0765888434183454e-06, |
|
"loss": 0.6333, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 4.324582338902148, |
|
"grad_norm": 0.48727279901504517, |
|
"learning_rate": 4.930861553797822e-06, |
|
"loss": 0.5238, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 4.334128878281623, |
|
"grad_norm": 0.5582761764526367, |
|
"learning_rate": 4.7871479034303835e-06, |
|
"loss": 0.5644, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 4.343675417661098, |
|
"grad_norm": 0.5856003165245056, |
|
"learning_rate": 4.645454313282965e-06, |
|
"loss": 0.7528, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 4.353221957040573, |
|
"grad_norm": 0.5278400778770447, |
|
"learning_rate": 4.505787114068432e-06, |
|
"loss": 0.5257, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 4.362768496420047, |
|
"grad_norm": 0.555724024772644, |
|
"learning_rate": 4.3681525459627614e-06, |
|
"loss": 0.6286, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 4.372315035799523, |
|
"grad_norm": 0.5980591773986816, |
|
"learning_rate": 4.232556758326212e-06, |
|
"loss": 0.7107, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 4.3818615751789975, |
|
"grad_norm": 0.5622268915176392, |
|
"learning_rate": 4.099005809428596e-06, |
|
"loss": 0.6305, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 4.391408114558472, |
|
"grad_norm": 0.5708304643630981, |
|
"learning_rate": 3.967505666178556e-06, |
|
"loss": 0.5354, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 4.400954653937948, |
|
"grad_norm": 0.5751599073410034, |
|
"learning_rate": 3.838062203857074e-06, |
|
"loss": 0.5867, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 4.4105011933174225, |
|
"grad_norm": 0.5867375135421753, |
|
"learning_rate": 3.7106812058548377e-06, |
|
"loss": 0.5848, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 4.420047732696897, |
|
"grad_norm": 0.5069125294685364, |
|
"learning_rate": 3.5853683634139434e-06, |
|
"loss": 0.5737, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 4.429594272076372, |
|
"grad_norm": 0.5603401064872742, |
|
"learning_rate": 3.462129275373577e-06, |
|
"loss": 0.6771, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 4.4391408114558475, |
|
"grad_norm": 0.4407503306865692, |
|
"learning_rate": 3.340969447919873e-06, |
|
"loss": 0.4828, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 4.448687350835322, |
|
"grad_norm": 0.5868078470230103, |
|
"learning_rate": 3.2218942943399112e-06, |
|
"loss": 0.6198, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 4.458233890214797, |
|
"grad_norm": 0.5815831422805786, |
|
"learning_rate": 3.104909134779821e-06, |
|
"loss": 0.6983, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 4.4677804295942725, |
|
"grad_norm": 0.5096269845962524, |
|
"learning_rate": 2.9900191960071544e-06, |
|
"loss": 0.5835, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 4.4677804295942725, |
|
"eval_loss": 0.774871826171875, |
|
"eval_runtime": 13.0266, |
|
"eval_samples_per_second": 13.588, |
|
"eval_steps_per_second": 1.766, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 4.477326968973747, |
|
"grad_norm": 0.5428768396377563, |
|
"learning_rate": 2.877229611177268e-06, |
|
"loss": 0.6312, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 4.486873508353222, |
|
"grad_norm": 0.6652801632881165, |
|
"learning_rate": 2.7665454196040664e-06, |
|
"loss": 0.564, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 4.4964200477326965, |
|
"grad_norm": 0.6429077982902527, |
|
"learning_rate": 2.6579715665347893e-06, |
|
"loss": 0.6679, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 4.505966587112172, |
|
"grad_norm": 0.5690730810165405, |
|
"learning_rate": 2.5515129029290986e-06, |
|
"loss": 0.615, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 4.515513126491647, |
|
"grad_norm": 0.6044589877128601, |
|
"learning_rate": 2.4471741852423237e-06, |
|
"loss": 0.6089, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 4.5250596658711215, |
|
"grad_norm": 0.5761138200759888, |
|
"learning_rate": 2.34496007521296e-06, |
|
"loss": 0.6444, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 4.534606205250597, |
|
"grad_norm": 0.6119807958602905, |
|
"learning_rate": 2.2448751396543787e-06, |
|
"loss": 0.7381, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 4.544152744630072, |
|
"grad_norm": 0.5442492961883545, |
|
"learning_rate": 2.1469238502507925e-06, |
|
"loss": 0.6084, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 4.5536992840095465, |
|
"grad_norm": 0.5115097761154175, |
|
"learning_rate": 2.0511105833574683e-06, |
|
"loss": 0.5036, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 4.563245823389021, |
|
"grad_norm": 0.5502949953079224, |
|
"learning_rate": 1.957439619805196e-06, |
|
"loss": 0.6587, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 4.572792362768497, |
|
"grad_norm": 0.5296341776847839, |
|
"learning_rate": 1.865915144708985e-06, |
|
"loss": 0.6043, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 4.5823389021479715, |
|
"grad_norm": 0.5244660377502441, |
|
"learning_rate": 1.7765412472811771e-06, |
|
"loss": 0.7407, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 4.591885441527446, |
|
"grad_norm": 0.4937599301338196, |
|
"learning_rate": 1.6893219206486233e-06, |
|
"loss": 0.653, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 4.601431980906922, |
|
"grad_norm": 0.5165035128593445, |
|
"learning_rate": 1.6042610616743781e-06, |
|
"loss": 0.6745, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 4.610978520286396, |
|
"grad_norm": 0.5757156014442444, |
|
"learning_rate": 1.5213624707835273e-06, |
|
"loss": 0.7117, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 4.620525059665871, |
|
"grad_norm": 0.5235234498977661, |
|
"learning_rate": 1.4406298517934069e-06, |
|
"loss": 0.5699, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 4.630071599045346, |
|
"grad_norm": 0.5536040663719177, |
|
"learning_rate": 1.3620668117481472e-06, |
|
"loss": 0.6527, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 4.6396181384248205, |
|
"grad_norm": 0.5768601894378662, |
|
"learning_rate": 1.2856768607574566e-06, |
|
"loss": 0.7039, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 4.649164677804296, |
|
"grad_norm": 0.5293567776679993, |
|
"learning_rate": 1.2114634118398638e-06, |
|
"loss": 0.5016, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 4.658711217183771, |
|
"grad_norm": 0.4922030568122864, |
|
"learning_rate": 1.1394297807701737e-06, |
|
"loss": 0.6378, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 4.6682577565632455, |
|
"grad_norm": 0.5279848575592041, |
|
"learning_rate": 1.0695791859313298e-06, |
|
"loss": 0.5854, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 4.677804295942721, |
|
"grad_norm": 0.5893853902816772, |
|
"learning_rate": 1.0019147481706625e-06, |
|
"loss": 0.6481, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 4.687350835322196, |
|
"grad_norm": 0.5379951000213623, |
|
"learning_rate": 9.364394906603901e-07, |
|
"loss": 0.5932, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 4.6968973747016705, |
|
"grad_norm": 0.5492742657661438, |
|
"learning_rate": 8.731563387626096e-07, |
|
"loss": 0.6475, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 4.706443914081145, |
|
"grad_norm": 0.596684455871582, |
|
"learning_rate": 8.120681198985292e-07, |
|
"loss": 0.6257, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 4.715990453460621, |
|
"grad_norm": 0.5613247156143188, |
|
"learning_rate": 7.531775634222138e-07, |
|
"loss": 0.6233, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 4.725536992840095, |
|
"grad_norm": 0.5937960743904114, |
|
"learning_rate": 6.964873004985717e-07, |
|
"loss": 0.6734, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 4.73508353221957, |
|
"grad_norm": 0.5525433421134949, |
|
"learning_rate": 6.419998639858538e-07, |
|
"loss": 0.5601, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 4.744630071599046, |
|
"grad_norm": 0.5674152374267578, |
|
"learning_rate": 5.897176883224442e-07, |
|
"loss": 0.6594, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 4.75417661097852, |
|
"grad_norm": 0.5498500466346741, |
|
"learning_rate": 5.396431094181198e-07, |
|
"loss": 0.616, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 4.763723150357995, |
|
"grad_norm": 0.5305196046829224, |
|
"learning_rate": 4.917783645496888e-07, |
|
"loss": 0.4995, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 4.77326968973747, |
|
"grad_norm": 0.5790063738822937, |
|
"learning_rate": 4.461255922609986e-07, |
|
"loss": 0.5518, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 4.782816229116945, |
|
"grad_norm": 0.548302412033081, |
|
"learning_rate": 4.0268683226741265e-07, |
|
"loss": 0.6202, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 4.79236276849642, |
|
"grad_norm": 0.548893928527832, |
|
"learning_rate": 3.6146402536468283e-07, |
|
"loss": 0.6218, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 4.801909307875895, |
|
"grad_norm": 0.5890070199966431, |
|
"learning_rate": 3.2245901334221895e-07, |
|
"loss": 0.6638, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 4.81145584725537, |
|
"grad_norm": 0.4871584475040436, |
|
"learning_rate": 2.856735389008269e-07, |
|
"loss": 0.6107, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 4.821002386634845, |
|
"grad_norm": 0.5432624816894531, |
|
"learning_rate": 2.511092455747932e-07, |
|
"loss": 0.583, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 4.83054892601432, |
|
"grad_norm": 0.5359986424446106, |
|
"learning_rate": 2.1876767765853234e-07, |
|
"loss": 0.5368, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 4.840095465393794, |
|
"grad_norm": 0.5359886288642883, |
|
"learning_rate": 1.8865028013751452e-07, |
|
"loss": 0.6259, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 4.84964200477327, |
|
"grad_norm": 0.5111921429634094, |
|
"learning_rate": 1.6075839862374488e-07, |
|
"loss": 0.5609, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 4.859188544152745, |
|
"grad_norm": 0.6437258124351501, |
|
"learning_rate": 1.3509327929563942e-07, |
|
"loss": 0.6395, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 4.868735083532219, |
|
"grad_norm": 0.5992398262023926, |
|
"learning_rate": 1.1165606884234181e-07, |
|
"loss": 0.6546, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 4.878281622911695, |
|
"grad_norm": 0.5831811428070068, |
|
"learning_rate": 9.044781441249207e-08, |
|
"loss": 0.609, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 4.88782816229117, |
|
"grad_norm": 0.5561614632606506, |
|
"learning_rate": 7.146946356743067e-08, |
|
"loss": 0.6699, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 4.897374701670644, |
|
"grad_norm": 0.5337750911712646, |
|
"learning_rate": 5.472186423889358e-08, |
|
"loss": 0.5614, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 4.906921241050119, |
|
"grad_norm": 0.41179969906806946, |
|
"learning_rate": 4.020576469108139e-08, |
|
"loss": 0.4451, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 4.916467780429595, |
|
"grad_norm": 0.4820442795753479, |
|
"learning_rate": 2.792181348726941e-08, |
|
"loss": 0.5897, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 4.926014319809069, |
|
"grad_norm": 0.5927594900131226, |
|
"learning_rate": 1.7870559460814173e-08, |
|
"loss": 0.6788, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 4.935560859188544, |
|
"grad_norm": 0.5302107334136963, |
|
"learning_rate": 1.0052451690617527e-08, |
|
"loss": 0.6105, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 4.945107398568019, |
|
"grad_norm": 0.5596168041229248, |
|
"learning_rate": 4.46783948109819e-09, |
|
"loss": 0.5537, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 4.954653937947494, |
|
"grad_norm": 0.5655501484870911, |
|
"learning_rate": 1.1169723465487281e-09, |
|
"loss": 0.6915, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 4.964200477326969, |
|
"grad_norm": 0.5537259578704834, |
|
"learning_rate": 0.0, |
|
"loss": 0.7157, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 4.964200477326969, |
|
"eval_loss": 0.7753176689147949, |
|
"eval_runtime": 12.9676, |
|
"eval_samples_per_second": 13.649, |
|
"eval_steps_per_second": 1.774, |
|
"step": 520 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 520, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 20, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.0257258893869056e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|