{
  "best_metric": 1.151181697845459,
  "best_model_checkpoint": "miner_id_besimray/checkpoint-80",
  "epoch": 5.894736842105263,
  "eval_steps": 20,
  "global_step": 280,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.021052631578947368,
      "grad_norm": 0.25488582253456116,
      "learning_rate": 2e-05,
      "loss": 1.2983,
      "step": 1
    },
    {
      "epoch": 0.021052631578947368,
      "eval_loss": 1.2585705518722534,
      "eval_runtime": 14.9372,
      "eval_samples_per_second": 6.695,
      "eval_steps_per_second": 0.669,
      "step": 1
    },
    {
      "epoch": 0.042105263157894736,
      "grad_norm": 0.2551250755786896,
      "learning_rate": 4e-05,
      "loss": 1.4576,
      "step": 2
    },
    {
      "epoch": 0.06315789473684211,
      "grad_norm": 0.22965364158153534,
      "learning_rate": 6e-05,
      "loss": 1.2758,
      "step": 3
    },
    {
      "epoch": 0.08421052631578947,
      "grad_norm": 0.25596627593040466,
      "learning_rate": 8e-05,
      "loss": 1.4291,
      "step": 4
    },
    {
      "epoch": 0.10526315789473684,
      "grad_norm": 0.21169574558734894,
      "learning_rate": 0.0001,
      "loss": 1.2482,
      "step": 5
    },
    {
      "epoch": 0.12631578947368421,
      "grad_norm": 0.2537442147731781,
      "learning_rate": 0.00012,
      "loss": 1.4111,
      "step": 6
    },
    {
      "epoch": 0.14736842105263157,
      "grad_norm": 0.22837992012500763,
      "learning_rate": 0.00014,
      "loss": 1.338,
      "step": 7
    },
    {
      "epoch": 0.16842105263157894,
      "grad_norm": 0.16500858962535858,
      "learning_rate": 0.00016,
      "loss": 1.2243,
      "step": 8
    },
    {
      "epoch": 0.18947368421052632,
      "grad_norm": 0.2048870176076889,
      "learning_rate": 0.00018,
      "loss": 1.2117,
      "step": 9
    },
    {
      "epoch": 0.21052631578947367,
      "grad_norm": 0.22877398133277893,
      "learning_rate": 0.0002,
      "loss": 1.3922,
      "step": 10
    },
    {
      "epoch": 0.23157894736842105,
      "grad_norm": 0.25871542096138,
      "learning_rate": 0.0001999999780359183,
      "loss": 1.2073,
      "step": 11
    },
    {
      "epoch": 0.25263157894736843,
      "grad_norm": 0.18334853649139404,
      "learning_rate": 0.00019999991214368284,
      "loss": 1.2268,
      "step": 12
    },
    {
      "epoch": 0.2736842105263158,
      "grad_norm": 0.1385767012834549,
      "learning_rate": 0.0001999998023233226,
      "loss": 1.0728,
      "step": 13
    },
    {
      "epoch": 0.29473684210526313,
      "grad_norm": 0.12759718298912048,
      "learning_rate": 0.0001999996485748858,
      "loss": 1.0786,
      "step": 14
    },
    {
      "epoch": 0.3157894736842105,
      "grad_norm": 0.13563676178455353,
      "learning_rate": 0.00019999945089843994,
      "loss": 1.0021,
      "step": 15
    },
    {
      "epoch": 0.3368421052631579,
      "grad_norm": 0.12118455767631531,
      "learning_rate": 0.0001999992092940719,
      "loss": 1.2168,
      "step": 16
    },
    {
      "epoch": 0.35789473684210527,
      "grad_norm": 0.17731498181819916,
      "learning_rate": 0.00019999892376188782,
      "loss": 1.2883,
      "step": 17
    },
    {
      "epoch": 0.37894736842105264,
      "grad_norm": 0.1495707631111145,
      "learning_rate": 0.0001999985943020131,
      "loss": 1.2307,
      "step": 18
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.14801433682441711,
      "learning_rate": 0.00019999822091459248,
      "loss": 1.2703,
      "step": 19
    },
    {
      "epoch": 0.42105263157894735,
      "grad_norm": 0.15497420728206635,
      "learning_rate": 0.00019999780359979,
      "loss": 1.3601,
      "step": 20
    },
    {
      "epoch": 0.42105263157894735,
      "eval_loss": 1.1756575107574463,
      "eval_runtime": 15.1128,
      "eval_samples_per_second": 6.617,
      "eval_steps_per_second": 0.662,
      "step": 20
    },
    {
      "epoch": 0.4421052631578947,
      "grad_norm": 0.154701828956604,
      "learning_rate": 0.00019999734235778894,
      "loss": 1.0972,
      "step": 21
    },
    {
      "epoch": 0.4631578947368421,
      "grad_norm": 0.12257974594831467,
      "learning_rate": 0.00019999683718879195,
      "loss": 1.1799,
      "step": 22
    },
    {
      "epoch": 0.4842105263157895,
      "grad_norm": 0.14035549759864807,
      "learning_rate": 0.0001999962880930209,
      "loss": 1.2342,
      "step": 23
    },
    {
      "epoch": 0.5052631578947369,
      "grad_norm": 0.1992308348417282,
      "learning_rate": 0.00019999569507071706,
      "loss": 1.2253,
      "step": 24
    },
    {
      "epoch": 0.5263157894736842,
      "grad_norm": 0.1479017287492752,
      "learning_rate": 0.00019999505812214085,
      "loss": 1.2039,
      "step": 25
    },
    {
      "epoch": 0.5473684210526316,
      "grad_norm": 0.16983704268932343,
      "learning_rate": 0.00019999437724757218,
      "loss": 1.2489,
      "step": 26
    },
    {
      "epoch": 0.5684210526315789,
      "grad_norm": 0.1630581021308899,
      "learning_rate": 0.00019999365244731,
      "loss": 1.2576,
      "step": 27
    },
    {
      "epoch": 0.5894736842105263,
      "grad_norm": 0.15563088655471802,
      "learning_rate": 0.00019999288372167287,
      "loss": 1.231,
      "step": 28
    },
    {
      "epoch": 0.6105263157894737,
      "grad_norm": 0.15619848668575287,
      "learning_rate": 0.00019999207107099834,
      "loss": 1.3292,
      "step": 29
    },
    {
      "epoch": 0.631578947368421,
      "grad_norm": 0.1286384016275406,
      "learning_rate": 0.00019999121449564347,
      "loss": 1.1393,
      "step": 30
    },
    {
      "epoch": 0.6526315789473685,
      "grad_norm": 0.13140268623828888,
      "learning_rate": 0.0001999903139959845,
      "loss": 1.0837,
      "step": 31
    },
    {
      "epoch": 0.6736842105263158,
      "grad_norm": 0.13603107631206512,
      "learning_rate": 0.000199989369572417,
      "loss": 1.1516,
      "step": 32
    },
    {
      "epoch": 0.6947368421052632,
      "grad_norm": 0.1404484510421753,
      "learning_rate": 0.00019998838122535585,
      "loss": 1.235,
      "step": 33
    },
    {
      "epoch": 0.7157894736842105,
      "grad_norm": 0.18797928094863892,
      "learning_rate": 0.00019998734895523525,
      "loss": 1.1864,
      "step": 34
    },
    {
      "epoch": 0.7368421052631579,
      "grad_norm": 0.1484900712966919,
      "learning_rate": 0.00019998627276250858,
      "loss": 1.1262,
      "step": 35
    },
    {
      "epoch": 0.7578947368421053,
      "grad_norm": 0.14105963706970215,
      "learning_rate": 0.0001999851526476487,
      "loss": 1.238,
      "step": 36
    },
    {
      "epoch": 0.7789473684210526,
      "grad_norm": 0.1366693079471588,
      "learning_rate": 0.00019998398861114752,
      "loss": 1.2757,
      "step": 37
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.16491512954235077,
      "learning_rate": 0.00019998278065351646,
      "loss": 1.2666,
      "step": 38
    },
    {
      "epoch": 0.8210526315789474,
      "grad_norm": 0.13820528984069824,
      "learning_rate": 0.0001999815287752862,
      "loss": 1.1497,
      "step": 39
    },
    {
      "epoch": 0.8421052631578947,
      "grad_norm": 0.15959441661834717,
      "learning_rate": 0.00019998023297700658,
      "loss": 1.2034,
      "step": 40
    },
    {
      "epoch": 0.8421052631578947,
      "eval_loss": 1.1566897630691528,
      "eval_runtime": 15.1159,
      "eval_samples_per_second": 6.616,
      "eval_steps_per_second": 0.662,
      "step": 40
    },
    {
      "epoch": 0.8631578947368421,
      "grad_norm": 0.18244534730911255,
      "learning_rate": 0.00019997889325924683,
      "loss": 1.2506,
      "step": 41
    },
    {
      "epoch": 0.8842105263157894,
      "grad_norm": 0.13269871473312378,
      "learning_rate": 0.0001999775096225955,
      "loss": 1.1754,
      "step": 42
    },
    {
      "epoch": 0.9052631578947369,
      "grad_norm": 0.14465025067329407,
      "learning_rate": 0.00019997608206766038,
      "loss": 1.1623,
      "step": 43
    },
    {
      "epoch": 0.9263157894736842,
      "grad_norm": 0.14105112850666046,
      "learning_rate": 0.00019997461059506857,
      "loss": 1.15,
      "step": 44
    },
    {
      "epoch": 0.9473684210526315,
      "grad_norm": 0.14860253036022186,
      "learning_rate": 0.00019997309520546647,
      "loss": 1.2272,
      "step": 45
    },
    {
      "epoch": 0.968421052631579,
      "grad_norm": 0.14539220929145813,
      "learning_rate": 0.00019997153589951973,
      "loss": 1.1894,
      "step": 46
    },
    {
      "epoch": 0.9894736842105263,
      "grad_norm": 0.1519959717988968,
      "learning_rate": 0.00019996993267791337,
      "loss": 1.0394,
      "step": 47
    },
    {
      "epoch": 1.0105263157894737,
      "grad_norm": 0.14171597361564636,
      "learning_rate": 0.00019996828554135162,
      "loss": 1.3091,
      "step": 48
    },
    {
      "epoch": 1.0315789473684212,
      "grad_norm": 0.1611461490392685,
      "learning_rate": 0.0001999665944905581,
      "loss": 1.1096,
      "step": 49
    },
    {
      "epoch": 1.0526315789473684,
      "grad_norm": 0.18341802060604095,
      "learning_rate": 0.00019996485952627552,
      "loss": 1.1828,
      "step": 50
    },
    {
      "epoch": 1.0736842105263158,
      "grad_norm": 0.15702421963214874,
      "learning_rate": 0.00019996308064926615,
      "loss": 1.1569,
      "step": 51
    },
    {
      "epoch": 1.0947368421052632,
      "grad_norm": 0.16117393970489502,
      "learning_rate": 0.00019996125786031138,
      "loss": 1.1337,
      "step": 52
    },
    {
      "epoch": 1.1157894736842104,
      "grad_norm": 0.16280895471572876,
      "learning_rate": 0.00019995939116021193,
      "loss": 1.2093,
      "step": 53
    },
    {
      "epoch": 1.1368421052631579,
      "grad_norm": 0.16143617033958435,
      "learning_rate": 0.00019995748054978777,
      "loss": 1.1394,
      "step": 54
    },
    {
      "epoch": 1.1578947368421053,
      "grad_norm": 0.1600114107131958,
      "learning_rate": 0.00019995552602987827,
      "loss": 1.0491,
      "step": 55
    },
    {
      "epoch": 1.1789473684210527,
      "grad_norm": 0.148758202791214,
      "learning_rate": 0.00019995352760134193,
      "loss": 1.0947,
      "step": 56
    },
    {
      "epoch": 1.2,
      "grad_norm": 0.18547968566417694,
      "learning_rate": 0.00019995148526505665,
      "loss": 1.0401,
      "step": 57
    },
    {
      "epoch": 1.2210526315789474,
      "grad_norm": 0.16114592552185059,
      "learning_rate": 0.00019994939902191964,
      "loss": 1.0556,
      "step": 58
    },
    {
      "epoch": 1.2421052631578948,
      "grad_norm": 0.21020422875881195,
      "learning_rate": 0.0001999472688728473,
      "loss": 1.1656,
      "step": 59
    },
    {
      "epoch": 1.263157894736842,
      "grad_norm": 0.16495747864246368,
      "learning_rate": 0.00019994509481877537,
      "loss": 1.1302,
      "step": 60
    },
    {
      "epoch": 1.263157894736842,
      "eval_loss": 1.1534416675567627,
      "eval_runtime": 15.0862,
      "eval_samples_per_second": 6.629,
      "eval_steps_per_second": 0.663,
      "step": 60
    },
    {
      "epoch": 1.2842105263157895,
      "grad_norm": 0.18857495486736298,
      "learning_rate": 0.00019994287686065886,
      "loss": 1.0208,
      "step": 61
    },
    {
      "epoch": 1.305263157894737,
      "grad_norm": 0.15040083229541779,
      "learning_rate": 0.00019994061499947212,
      "loss": 1.0189,
      "step": 62
    },
    {
      "epoch": 1.3263157894736843,
      "grad_norm": 0.18986788392066956,
      "learning_rate": 0.00019993830923620872,
      "loss": 1.0854,
      "step": 63
    },
    {
      "epoch": 1.3473684210526315,
      "grad_norm": 0.18154074251651764,
      "learning_rate": 0.00019993595957188152,
      "loss": 1.1744,
      "step": 64
    },
    {
      "epoch": 1.368421052631579,
      "grad_norm": 0.1574270874261856,
      "learning_rate": 0.00019993356600752276,
      "loss": 1.1356,
      "step": 65
    },
    {
      "epoch": 1.3894736842105262,
      "grad_norm": 0.19574891030788422,
      "learning_rate": 0.0001999311285441838,
      "loss": 1.0907,
      "step": 66
    },
    {
      "epoch": 1.4105263157894736,
      "grad_norm": 0.20714887976646423,
      "learning_rate": 0.0001999286471829354,
      "loss": 1.1834,
      "step": 67
    },
    {
      "epoch": 1.431578947368421,
      "grad_norm": 0.21967269480228424,
      "learning_rate": 0.0001999261219248676,
      "loss": 1.2089,
      "step": 68
    },
    {
      "epoch": 1.4526315789473685,
      "grad_norm": 0.21682047843933105,
      "learning_rate": 0.00019992355277108966,
      "loss": 1.1635,
      "step": 69
    },
    {
      "epoch": 1.4736842105263157,
      "grad_norm": 0.17350642383098602,
      "learning_rate": 0.00019992093972273018,
      "loss": 1.015,
      "step": 70
    },
    {
      "epoch": 1.4947368421052631,
      "grad_norm": 0.2019474357366562,
      "learning_rate": 0.00019991828278093706,
      "loss": 1.2032,
      "step": 71
    },
    {
      "epoch": 1.5157894736842106,
      "grad_norm": 0.2160518765449524,
      "learning_rate": 0.0001999155819468774,
      "loss": 1.0798,
      "step": 72
    },
    {
      "epoch": 1.5368421052631578,
      "grad_norm": 0.17687132954597473,
      "learning_rate": 0.00019991283722173764,
      "loss": 1.1356,
      "step": 73
    },
    {
      "epoch": 1.5578947368421052,
      "grad_norm": 0.16519969701766968,
      "learning_rate": 0.0001999100486067235,
      "loss": 1.0129,
      "step": 74
    },
    {
      "epoch": 1.5789473684210527,
      "grad_norm": 0.15248972177505493,
      "learning_rate": 0.00019990721610305996,
      "loss": 1.0204,
      "step": 75
    },
    {
      "epoch": 1.6,
      "grad_norm": 0.1959000527858734,
      "learning_rate": 0.00019990433971199125,
      "loss": 1.026,
      "step": 76
    },
    {
      "epoch": 1.6210526315789475,
      "grad_norm": 0.20230266451835632,
      "learning_rate": 0.00019990141943478098,
      "loss": 1.0012,
      "step": 77
    },
    {
      "epoch": 1.6421052631578947,
      "grad_norm": 0.18247124552726746,
      "learning_rate": 0.00019989845527271195,
      "loss": 1.1552,
      "step": 78
    },
    {
      "epoch": 1.663157894736842,
      "grad_norm": 0.21317291259765625,
      "learning_rate": 0.00019989544722708618,
      "loss": 1.1461,
      "step": 79
    },
    {
      "epoch": 1.6842105263157894,
      "grad_norm": 0.2219182848930359,
      "learning_rate": 0.0001998923952992252,
      "loss": 1.0958,
      "step": 80
    },
    {
      "epoch": 1.6842105263157894,
      "eval_loss": 1.151181697845459,
      "eval_runtime": 15.09,
      "eval_samples_per_second": 6.627,
      "eval_steps_per_second": 0.663,
      "step": 80
    },
    {
      "epoch": 1.7052631578947368,
      "grad_norm": 0.20883385837078094,
      "learning_rate": 0.00019988929949046958,
      "loss": 1.1361,
      "step": 81
    },
    {
      "epoch": 1.7263157894736842,
      "grad_norm": 0.23448118567466736,
      "learning_rate": 0.00019988615980217925,
      "loss": 1.1583,
      "step": 82
    },
    {
      "epoch": 1.7473684210526317,
      "grad_norm": 0.19605010747909546,
      "learning_rate": 0.00019988297623573344,
      "loss": 1.2165,
      "step": 83
    },
    {
      "epoch": 1.768421052631579,
      "grad_norm": 0.2519710063934326,
      "learning_rate": 0.0001998797487925306,
      "loss": 1.188,
      "step": 84
    },
    {
      "epoch": 1.7894736842105263,
      "grad_norm": 0.2063812017440796,
      "learning_rate": 0.00019987647747398852,
      "loss": 1.2146,
      "step": 85
    },
    {
      "epoch": 1.8105263157894735,
      "grad_norm": 0.18549425899982452,
      "learning_rate": 0.00019987316228154423,
      "loss": 1.1634,
      "step": 86
    },
    {
      "epoch": 1.831578947368421,
      "grad_norm": 0.22328101098537445,
      "learning_rate": 0.00019986980321665403,
      "loss": 1.4089,
      "step": 87
    },
    {
      "epoch": 1.8526315789473684,
      "grad_norm": 0.23256567120552063,
      "learning_rate": 0.00019986640028079347,
      "loss": 0.9991,
      "step": 88
    },
    {
      "epoch": 1.8736842105263158,
      "grad_norm": 0.23284031450748444,
      "learning_rate": 0.0001998629534754574,
      "loss": 1.1857,
      "step": 89
    },
    {
      "epoch": 1.8947368421052633,
      "grad_norm": 0.25720322132110596,
      "learning_rate": 0.00019985946280215994,
      "loss": 1.1692,
      "step": 90
    },
    {
      "epoch": 1.9157894736842105,
      "grad_norm": 0.25529375672340393,
      "learning_rate": 0.00019985592826243453,
      "loss": 1.1913,
      "step": 91
    },
    {
      "epoch": 1.936842105263158,
      "grad_norm": 0.21553778648376465,
      "learning_rate": 0.0001998523498578338,
      "loss": 1.0827,
      "step": 92
    },
    {
      "epoch": 1.9578947368421051,
      "grad_norm": 0.22560931742191315,
      "learning_rate": 0.00019984872758992963,
      "loss": 1.2614,
      "step": 93
    },
    {
      "epoch": 1.9789473684210526,
      "grad_norm": 0.21169520914554596,
      "learning_rate": 0.00019984506146031325,
      "loss": 1.1164,
      "step": 94
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.22062988579273224,
      "learning_rate": 0.00019984135147059514,
      "loss": 1.1638,
      "step": 95
    },
    {
      "epoch": 2.0210526315789474,
      "grad_norm": 0.21019504964351654,
      "learning_rate": 0.00019983759762240503,
      "loss": 1.0949,
      "step": 96
    },
    {
      "epoch": 2.042105263157895,
      "grad_norm": 0.19634267687797546,
      "learning_rate": 0.00019983379991739188,
      "loss": 1.0106,
      "step": 97
    },
    {
      "epoch": 2.0631578947368423,
      "grad_norm": 0.24126608669757843,
      "learning_rate": 0.00019982995835722398,
      "loss": 0.9002,
      "step": 98
    },
    {
      "epoch": 2.0842105263157893,
      "grad_norm": 0.22683700919151306,
      "learning_rate": 0.0001998260729435889,
      "loss": 1.0105,
      "step": 99
    },
    {
      "epoch": 2.1052631578947367,
      "grad_norm": 0.27286496758461,
      "learning_rate": 0.00019982214367819328,
      "loss": 1.0285,
      "step": 100
    },
    {
      "epoch": 2.1052631578947367,
      "eval_loss": 1.1652569770812988,
      "eval_runtime": 15.0915,
      "eval_samples_per_second": 6.626,
      "eval_steps_per_second": 0.663,
      "step": 100
    },
    {
      "epoch": 2.126315789473684,
      "grad_norm": 0.24511495232582092,
      "learning_rate": 0.00019981817056276337,
      "loss": 1.0178,
      "step": 101
    },
    {
      "epoch": 2.1473684210526316,
      "grad_norm": 0.23129835724830627,
      "learning_rate": 0.00019981415359904435,
      "loss": 0.9067,
      "step": 102
    },
    {
      "epoch": 2.168421052631579,
      "grad_norm": 0.25192540884017944,
      "learning_rate": 0.00019981009278880087,
      "loss": 1.0198,
      "step": 103
    },
    {
      "epoch": 2.1894736842105265,
      "grad_norm": 0.28074589371681213,
      "learning_rate": 0.0001998059881338167,
      "loss": 0.9983,
      "step": 104
    },
    {
      "epoch": 2.2105263157894735,
      "grad_norm": 0.22504180669784546,
      "learning_rate": 0.00019980183963589504,
      "loss": 0.9203,
      "step": 105
    },
    {
      "epoch": 2.231578947368421,
      "grad_norm": 0.25999030470848083,
      "learning_rate": 0.00019979764729685813,
      "loss": 1.1073,
      "step": 106
    },
    {
      "epoch": 2.2526315789473683,
      "grad_norm": 0.2536907196044922,
      "learning_rate": 0.00019979341111854768,
      "loss": 1.1423,
      "step": 107
    },
    {
      "epoch": 2.2736842105263158,
      "grad_norm": 0.24934524297714233,
      "learning_rate": 0.0001997891311028245,
      "loss": 0.854,
      "step": 108
    },
    {
      "epoch": 2.294736842105263,
      "grad_norm": 0.24719196557998657,
      "learning_rate": 0.0001997848072515688,
      "loss": 1.0318,
      "step": 109
    },
    {
      "epoch": 2.3157894736842106,
      "grad_norm": 0.2829945385456085,
      "learning_rate": 0.0001997804395666799,
      "loss": 0.9709,
      "step": 110
    },
    {
      "epoch": 2.336842105263158,
      "grad_norm": 0.29261600971221924,
      "learning_rate": 0.00019977602805007648,
      "loss": 0.942,
      "step": 111
    },
    {
      "epoch": 2.3578947368421055,
      "grad_norm": 0.26871809363365173,
      "learning_rate": 0.0001997715727036964,
      "loss": 1.1248,
      "step": 112
    },
    {
      "epoch": 2.3789473684210525,
      "grad_norm": 0.2754141390323639,
      "learning_rate": 0.00019976707352949684,
      "loss": 1.1282,
      "step": 113
    },
    {
      "epoch": 2.4,
      "grad_norm": 0.26692995429039,
      "learning_rate": 0.00019976253052945425,
      "loss": 0.8413,
      "step": 114
    },
    {
      "epoch": 2.4210526315789473,
      "grad_norm": 0.30057746171951294,
      "learning_rate": 0.00019975794370556417,
      "loss": 1.118,
      "step": 115
    },
    {
      "epoch": 2.442105263157895,
      "grad_norm": 0.2569466531276703,
      "learning_rate": 0.0001997533130598416,
      "loss": 1.0404,
      "step": 116
    },
    {
      "epoch": 2.463157894736842,
      "grad_norm": 0.30639445781707764,
      "learning_rate": 0.00019974863859432068,
      "loss": 1.0001,
      "step": 117
    },
    {
      "epoch": 2.4842105263157896,
      "grad_norm": 0.26039522886276245,
      "learning_rate": 0.00019974392031105482,
      "loss": 1.1001,
      "step": 118
    },
    {
      "epoch": 2.5052631578947366,
      "grad_norm": 0.31316864490509033,
      "learning_rate": 0.00019973915821211666,
      "loss": 0.9833,
      "step": 119
    },
    {
      "epoch": 2.526315789473684,
      "grad_norm": 0.28356632590293884,
      "learning_rate": 0.00019973435229959813,
      "loss": 1.1265,
      "step": 120
    },
    {
      "epoch": 2.526315789473684,
      "eval_loss": 1.1785136461257935,
      "eval_runtime": 15.0968,
      "eval_samples_per_second": 6.624,
      "eval_steps_per_second": 0.662,
      "step": 120
    },
    {
      "epoch": 2.5473684210526315,
      "grad_norm": 0.3286098837852478,
      "learning_rate": 0.0001997295025756103,
      "loss": 0.9399,
      "step": 121
    },
    {
      "epoch": 2.568421052631579,
      "grad_norm": 0.29396986961364746,
      "learning_rate": 0.00019972460904228365,
      "loss": 0.9961,
      "step": 122
    },
    {
      "epoch": 2.5894736842105264,
      "grad_norm": 0.31531810760498047,
      "learning_rate": 0.0001997196717017678,
      "loss": 1.0899,
      "step": 123
    },
    {
      "epoch": 2.610526315789474,
      "grad_norm": 0.2522255480289459,
      "learning_rate": 0.00019971469055623162,
      "loss": 0.94,
      "step": 124
    },
    {
      "epoch": 2.6315789473684212,
      "grad_norm": 0.2962912917137146,
      "learning_rate": 0.00019970966560786324,
      "loss": 0.976,
      "step": 125
    },
    {
      "epoch": 2.6526315789473687,
      "grad_norm": 0.2782577872276306,
      "learning_rate": 0.00019970459685887,
      "loss": 0.985,
      "step": 126
    },
    {
      "epoch": 2.6736842105263157,
      "grad_norm": 0.31023719906806946,
      "learning_rate": 0.00019969948431147858,
      "loss": 0.9228,
      "step": 127
    },
    {
      "epoch": 2.694736842105263,
      "grad_norm": 0.35043251514434814,
      "learning_rate": 0.00019969432796793478,
      "loss": 1.0626,
      "step": 128
    },
    {
      "epoch": 2.7157894736842105,
      "grad_norm": 0.28269392251968384,
      "learning_rate": 0.00019968912783050366,
      "loss": 0.9771,
      "step": 129
    },
    {
      "epoch": 2.736842105263158,
      "grad_norm": 0.3904775381088257,
      "learning_rate": 0.0001996838839014696,
      "loss": 1.0986,
      "step": 130
    },
    {
      "epoch": 2.7578947368421054,
      "grad_norm": 0.3603789806365967,
      "learning_rate": 0.00019967859618313612,
      "loss": 0.8981,
      "step": 131
    },
    {
      "epoch": 2.7789473684210524,
      "grad_norm": 0.3876775801181793,
      "learning_rate": 0.00019967326467782605,
      "loss": 0.999,
      "step": 132
    },
    {
      "epoch": 2.8,
      "grad_norm": 0.33544307947158813,
      "learning_rate": 0.0001996678893878814,
      "loss": 0.9868,
      "step": 133
    },
    {
      "epoch": 2.8210526315789473,
      "grad_norm": 0.27033519744873047,
      "learning_rate": 0.00019966247031566345,
      "loss": 0.969,
      "step": 134
    },
    {
      "epoch": 2.8421052631578947,
      "grad_norm": 0.29576611518859863,
      "learning_rate": 0.0001996570074635527,
      "loss": 1.1384,
      "step": 135
    },
    {
      "epoch": 2.863157894736842,
      "grad_norm": 0.2918352484703064,
      "learning_rate": 0.00019965150083394885,
      "loss": 0.9385,
      "step": 136
    },
    {
      "epoch": 2.8842105263157896,
      "grad_norm": 0.26381438970565796,
      "learning_rate": 0.00019964595042927088,
      "loss": 0.9742,
      "step": 137
    },
    {
      "epoch": 2.905263157894737,
      "grad_norm": 0.3624316155910492,
      "learning_rate": 0.000199640356251957,
      "loss": 1.1089,
      "step": 138
    },
    {
      "epoch": 2.9263157894736844,
      "grad_norm": 0.3607443869113922,
      "learning_rate": 0.00019963471830446462,
      "loss": 1.0243,
      "step": 139
    },
    {
      "epoch": 2.9473684210526314,
      "grad_norm": 0.3374713063240051,
      "learning_rate": 0.00019962903658927037,
      "loss": 1.0215,
      "step": 140
    },
    {
      "epoch": 2.9473684210526314,
      "eval_loss": 1.1921439170837402,
      "eval_runtime": 15.1014,
      "eval_samples_per_second": 6.622,
      "eval_steps_per_second": 0.662,
      "step": 140
    },
    {
      "epoch": 2.968421052631579,
      "grad_norm": 0.2549107074737549,
      "learning_rate": 0.00019962331110887015,
      "loss": 0.9862,
      "step": 141
    },
    {
      "epoch": 2.9894736842105263,
      "grad_norm": 0.28837814927101135,
      "learning_rate": 0.00019961754186577902,
      "loss": 1.0686,
      "step": 142
    },
    {
      "epoch": 3.0105263157894737,
      "grad_norm": 0.2922016978263855,
      "learning_rate": 0.00019961172886253135,
      "loss": 0.9724,
      "step": 143
    },
    {
      "epoch": 3.031578947368421,
      "grad_norm": 0.3212421238422394,
      "learning_rate": 0.00019960587210168064,
      "loss": 0.9277,
      "step": 144
    },
    {
      "epoch": 3.0526315789473686,
      "grad_norm": 0.2664543390274048,
      "learning_rate": 0.00019959997158579967,
      "loss": 0.851,
      "step": 145
    },
    {
      "epoch": 3.0736842105263156,
      "grad_norm": 0.25021764636039734,
      "learning_rate": 0.00019959402731748046,
      "loss": 0.8162,
      "step": 146
    },
    {
      "epoch": 3.094736842105263,
      "grad_norm": 0.2933099567890167,
      "learning_rate": 0.00019958803929933421,
      "loss": 0.8726,
      "step": 147
    },
    {
      "epoch": 3.1157894736842104,
      "grad_norm": 0.2778937518596649,
      "learning_rate": 0.0001995820075339913,
      "loss": 0.8583,
      "step": 148
    },
    {
      "epoch": 3.136842105263158,
      "grad_norm": 0.3159140646457672,
      "learning_rate": 0.0001995759320241014,
      "loss": 0.9162,
      "step": 149
    },
    {
      "epoch": 3.1578947368421053,
      "grad_norm": 0.3638457655906677,
      "learning_rate": 0.0001995698127723334,
      "loss": 0.8698,
      "step": 150
    },
    {
      "epoch": 3.1789473684210527,
      "grad_norm": 0.3314703702926636,
      "learning_rate": 0.00019956364978137534,
      "loss": 0.7974,
      "step": 151
    },
    {
      "epoch": 3.2,
      "grad_norm": 0.3441965878009796,
      "learning_rate": 0.00019955744305393452,
      "loss": 0.8227,
      "step": 152
    },
    {
      "epoch": 3.221052631578947,
      "grad_norm": 0.3249785304069519,
      "learning_rate": 0.00019955119259273743,
      "loss": 1.0359,
      "step": 153
    },
    {
      "epoch": 3.2421052631578946,
      "grad_norm": 0.3665757477283478,
      "learning_rate": 0.0001995448984005298,
      "loss": 0.8252,
      "step": 154
    },
    {
      "epoch": 3.263157894736842,
      "grad_norm": 0.36660316586494446,
      "learning_rate": 0.00019953856048007652,
      "loss": 0.951,
      "step": 155
    },
    {
      "epoch": 3.2842105263157895,
      "grad_norm": 0.3821450173854828,
      "learning_rate": 0.0001995321788341618,
      "loss": 0.8429,
      "step": 156
    },
    {
      "epoch": 3.305263157894737,
      "grad_norm": 0.3389227092266083,
      "learning_rate": 0.0001995257534655889,
      "loss": 0.8443,
      "step": 157
    },
    {
      "epoch": 3.3263157894736843,
      "grad_norm": 0.37333276867866516,
      "learning_rate": 0.00019951928437718039,
      "loss": 0.8121,
      "step": 158
    },
    {
      "epoch": 3.3473684210526318,
      "grad_norm": 0.3941514194011688,
      "learning_rate": 0.000199512771571778,
      "loss": 0.7905,
      "step": 159
    },
    {
      "epoch": 3.3684210526315788,
      "grad_norm": 0.36983492970466614,
      "learning_rate": 0.00019950621505224273,
      "loss": 0.8495,
      "step": 160
    },
    {
      "epoch": 3.3684210526315788,
      "eval_loss": 1.2672936916351318,
      "eval_runtime": 15.1025,
      "eval_samples_per_second": 6.621,
      "eval_steps_per_second": 0.662,
      "step": 160
    },
    {
      "epoch": 3.389473684210526,
      "grad_norm": 0.3236539363861084,
      "learning_rate": 0.00019949961482145474,
      "loss": 0.7451,
      "step": 161
    },
    {
      "epoch": 3.4105263157894736,
      "grad_norm": 0.3834153413772583,
      "learning_rate": 0.00019949297088231335,
      "loss": 0.8501,
      "step": 162
    },
    {
      "epoch": 3.431578947368421,
      "grad_norm": 0.40946483612060547,
      "learning_rate": 0.00019948628323773716,
      "loss": 0.7928,
      "step": 163
    },
    {
      "epoch": 3.4526315789473685,
      "grad_norm": 0.3998974859714508,
      "learning_rate": 0.00019947955189066388,
      "loss": 0.8213,
      "step": 164
    },
    {
      "epoch": 3.473684210526316,
      "grad_norm": 0.3378157317638397,
      "learning_rate": 0.00019947277684405056,
      "loss": 0.8236,
      "step": 165
    },
    {
      "epoch": 3.4947368421052634,
      "grad_norm": 0.3661404848098755,
      "learning_rate": 0.00019946595810087323,
      "loss": 0.8427,
      "step": 166
    },
    {
      "epoch": 3.515789473684211,
      "grad_norm": 0.3834976255893707,
      "learning_rate": 0.0001994590956641273,
      "loss": 0.7935,
      "step": 167
    },
    {
      "epoch": 3.536842105263158,
      "grad_norm": 0.38265261054039,
      "learning_rate": 0.00019945218953682734,
      "loss": 0.8425,
      "step": 168
    },
    {
      "epoch": 3.557894736842105,
      "grad_norm": 0.3965548276901245,
      "learning_rate": 0.00019944523972200705,
      "loss": 0.7629,
      "step": 169
    },
    {
      "epoch": 3.5789473684210527,
      "grad_norm": 0.33844998478889465,
      "learning_rate": 0.00019943824622271935,
      "loss": 0.8272,
      "step": 170
    },
    {
      "epoch": 3.6,
      "grad_norm": 0.3963635563850403,
      "learning_rate": 0.0001994312090420364,
      "loss": 0.8769,
      "step": 171
    },
    {
      "epoch": 3.6210526315789475,
      "grad_norm": 0.4465744197368622,
      "learning_rate": 0.00019942412818304943,
      "loss": 0.8587,
      "step": 172
    },
    {
      "epoch": 3.6421052631578945,
      "grad_norm": 0.4285814166069031,
      "learning_rate": 0.00019941700364886899,
      "loss": 0.8129,
      "step": 173
    },
    {
      "epoch": 3.663157894736842,
      "grad_norm": 0.4269882142543793,
      "learning_rate": 0.00019940983544262472,
      "loss": 1.0085,
      "step": 174
    },
    {
      "epoch": 3.6842105263157894,
      "grad_norm": 0.4019848108291626,
      "learning_rate": 0.00019940262356746554,
      "loss": 0.8352,
      "step": 175
    },
    {
      "epoch": 3.705263157894737,
      "grad_norm": 0.3888452351093292,
      "learning_rate": 0.00019939536802655945,
      "loss": 0.9718,
      "step": 176
    },
    {
      "epoch": 3.7263157894736842,
      "grad_norm": 0.4387454688549042,
      "learning_rate": 0.00019938806882309368,
      "loss": 1.1492,
      "step": 177
    },
    {
      "epoch": 3.7473684210526317,
      "grad_norm": 0.47144615650177,
      "learning_rate": 0.00019938072596027462,
      "loss": 0.8351,
      "step": 178
    },
    {
      "epoch": 3.768421052631579,
      "grad_norm": 0.37782907485961914,
      "learning_rate": 0.0001993733394413279,
      "loss": 0.8221,
      "step": 179
    },
    {
      "epoch": 3.7894736842105265,
      "grad_norm": 0.3969016969203949,
      "learning_rate": 0.0001993659092694982,
      "loss": 0.901,
      "step": 180
    },
    {
      "epoch": 3.7894736842105265,
      "eval_loss": 1.2610888481140137,
      "eval_runtime": 15.0998,
      "eval_samples_per_second": 6.623,
      "eval_steps_per_second": 0.662,
      "step": 180
    },
    {
      "epoch": 3.8105263157894735,
      "grad_norm": 0.34853246808052063,
      "learning_rate": 0.00019935843544804956,
      "loss": 0.9159,
      "step": 181
    },
    {
      "epoch": 3.831578947368421,
      "grad_norm": 0.3776082396507263,
      "learning_rate": 0.00019935091798026507,
      "loss": 0.9256,
      "step": 182
    },
    {
      "epoch": 3.8526315789473684,
      "grad_norm": 0.42846596240997314,
      "learning_rate": 0.00019934335686944694,
      "loss": 0.9343,
      "step": 183
    },
    {
      "epoch": 3.873684210526316,
      "grad_norm": 0.38637641072273254,
      "learning_rate": 0.0001993357521189167,
      "loss": 0.9058,
      "step": 184
    },
    {
      "epoch": 3.8947368421052633,
      "grad_norm": 0.35184672474861145,
      "learning_rate": 0.00019932810373201495,
      "loss": 0.8512,
      "step": 185
    },
    {
      "epoch": 3.9157894736842103,
      "grad_norm": 0.3937157690525055,
      "learning_rate": 0.00019932041171210151,
      "loss": 0.926,
      "step": 186
    },
    {
      "epoch": 3.9368421052631577,
      "grad_norm": 0.4368656277656555,
      "learning_rate": 0.0001993126760625553,
      "loss": 1.0166,
      "step": 187
    },
    {
      "epoch": 3.957894736842105,
      "grad_norm": 0.3807603418827057,
      "learning_rate": 0.0001993048967867745,
      "loss": 0.9002,
      "step": 188
    },
    {
      "epoch": 3.9789473684210526,
      "grad_norm": 0.3753417432308197,
      "learning_rate": 0.00019929707388817637,
      "loss": 0.9097,
      "step": 189
    },
    {
      "epoch": 4.0,
      "grad_norm": 0.4108405113220215,
      "learning_rate": 0.00019928920737019733,
      "loss": 0.9339,
      "step": 190
    },
    {
      "epoch": 4.021052631578947,
      "grad_norm": 0.3922579288482666,
      "learning_rate": 0.0001992812972362931,
      "loss": 0.7981,
      "step": 191
    },
    {
      "epoch": 4.042105263157895,
      "grad_norm": 0.377907931804657,
      "learning_rate": 0.00019927334348993837,
      "loss": 0.7216,
      "step": 192
    },
    {
      "epoch": 4.063157894736842,
      "grad_norm": 0.3310043513774872,
      "learning_rate": 0.00019926534613462707,
      "loss": 0.6362,
      "step": 193
    },
    {
      "epoch": 4.08421052631579,
      "grad_norm": 0.38383597135543823,
      "learning_rate": 0.00019925730517387239,
      "loss": 0.6881,
      "step": 194
    },
    {
      "epoch": 4.105263157894737,
      "grad_norm": 0.4816322326660156,
      "learning_rate": 0.00019924922061120644,
      "loss": 0.6735,
      "step": 195
    },
    {
      "epoch": 4.126315789473685,
      "grad_norm": 0.4419063329696655,
      "learning_rate": 0.00019924109245018072,
      "loss": 0.6995,
      "step": 196
    },
    {
      "epoch": 4.147368421052631,
      "grad_norm": 0.4181762933731079,
      "learning_rate": 0.00019923292069436578,
      "loss": 0.7638,
      "step": 197
    },
    {
      "epoch": 4.168421052631579,
      "grad_norm": 0.4450121521949768,
      "learning_rate": 0.00019922470534735123,
      "loss": 0.6745,
      "step": 198
    },
    {
      "epoch": 4.189473684210526,
      "grad_norm": 0.4250476658344269,
      "learning_rate": 0.000199216446412746,
      "loss": 0.8314,
      "step": 199
    },
    {
      "epoch": 4.2105263157894735,
      "grad_norm": 0.4420212507247925,
      "learning_rate": 0.0001992081438941781,
      "loss": 0.7058,
      "step": 200
    },
    {
      "epoch": 4.2105263157894735,
      "eval_loss": 1.3737410306930542,
      "eval_runtime": 15.0784,
      "eval_samples_per_second": 6.632,
      "eval_steps_per_second": 0.663,
      "step": 200
    },
    {
      "epoch": 4.231578947368421,
      "grad_norm": 0.4359537363052368,
      "learning_rate": 0.00019919979779529462,
      "loss": 0.7697,
      "step": 201
    },
    {
      "epoch": 4.252631578947368,
      "grad_norm": 0.44620540738105774,
      "learning_rate": 0.0001991914081197619,
      "loss": 0.6539,
      "step": 202
    },
    {
      "epoch": 4.273684210526316,
      "grad_norm": 0.4683506488800049,
      "learning_rate": 0.0001991829748712653,
      "loss": 0.6729,
      "step": 203
    },
    {
      "epoch": 4.294736842105263,
      "grad_norm": 0.45054468512535095,
      "learning_rate": 0.00019917449805350947,
      "loss": 0.7936,
      "step": 204
    },
    {
      "epoch": 4.315789473684211,
      "grad_norm": 0.3688434958457947,
      "learning_rate": 0.00019916597767021807,
      "loss": 0.7257,
      "step": 205
    },
    {
      "epoch": 4.336842105263158,
      "grad_norm": 0.44829288125038147,
      "learning_rate": 0.00019915741372513398,
      "loss": 0.8966,
      "step": 206
    },
    {
      "epoch": 4.3578947368421055,
      "grad_norm": 0.5000961422920227,
      "learning_rate": 0.00019914880622201912,
      "loss": 0.6876,
      "step": 207
    },
    {
      "epoch": 4.378947368421053,
      "grad_norm": 0.4218606948852539,
      "learning_rate": 0.0001991401551646547,
      "loss": 0.6577,
      "step": 208
    },
    {
      "epoch": 4.4,
      "grad_norm": 0.41971078515052795,
      "learning_rate": 0.00019913146055684092,
      "loss": 0.6466,
      "step": 209
    },
    {
      "epoch": 4.421052631578947,
      "grad_norm": 0.5077884793281555,
      "learning_rate": 0.00019912272240239716,
      "loss": 0.6927,
      "step": 210
    },
    {
      "epoch": 4.442105263157894,
      "grad_norm": 0.4335264265537262,
      "learning_rate": 0.00019911394070516194,
      "loss": 0.6726,
      "step": 211
    },
    {
      "epoch": 4.463157894736842,
      "grad_norm": 0.4315473139286041,
      "learning_rate": 0.0001991051154689929,
      "loss": 0.6704,
      "step": 212
    },
    {
      "epoch": 4.484210526315789,
      "grad_norm": 0.41848331689834595,
      "learning_rate": 0.0001990962466977668,
      "loss": 0.7625,
      "step": 213
    },
    {
      "epoch": 4.505263157894737,
      "grad_norm": 0.43444666266441345,
      "learning_rate": 0.0001990873343953795,
      "loss": 0.6806,
      "step": 214
    },
    {
      "epoch": 4.526315789473684,
      "grad_norm": 0.45425352454185486,
      "learning_rate": 0.00019907837856574607,
      "loss": 0.6422,
      "step": 215
    },
    {
      "epoch": 4.5473684210526315,
      "grad_norm": 0.39107388257980347,
      "learning_rate": 0.0001990693792128006,
      "loss": 0.6979,
      "step": 216
    },
    {
      "epoch": 4.568421052631579,
      "grad_norm": 0.5015237927436829,
      "learning_rate": 0.00019906033634049637,
      "loss": 0.76,
      "step": 217
    },
    {
      "epoch": 4.589473684210526,
      "grad_norm": 0.5905852317810059,
      "learning_rate": 0.00019905124995280572,
      "loss": 0.6172,
      "step": 218
    },
    {
      "epoch": 4.610526315789474,
      "grad_norm": 0.4459174573421478,
      "learning_rate": 0.0001990421200537201,
      "loss": 0.6808,
      "step": 219
    },
    {
      "epoch": 4.631578947368421,
      "grad_norm": 0.4470289647579193,
      "learning_rate": 0.0001990329466472502,
      "loss": 0.7428,
      "step": 220
    },
    {
      "epoch": 4.631578947368421,
      "eval_loss": 1.3823609352111816,
      "eval_runtime": 15.1002,
      "eval_samples_per_second": 6.622,
      "eval_steps_per_second": 0.662,
      "step": 220
    },
    {
      "epoch": 4.652631578947369,
      "grad_norm": 0.5562801361083984,
      "learning_rate": 0.00019902372973742565,
      "loss": 0.6417,
      "step": 221
    },
    {
      "epoch": 4.673684210526316,
      "grad_norm": 0.4418918788433075,
      "learning_rate": 0.00019901446932829532,
      "loss": 0.6833,
      "step": 222
    },
    {
      "epoch": 4.6947368421052635,
      "grad_norm": 0.4580042064189911,
      "learning_rate": 0.00019900516542392712,
      "loss": 0.7247,
      "step": 223
    },
    {
      "epoch": 4.715789473684211,
      "grad_norm": 0.4451601207256317,
      "learning_rate": 0.00019899581802840802,
      "loss": 0.6519,
      "step": 224
    },
    {
      "epoch": 4.7368421052631575,
      "grad_norm": 0.46985530853271484,
      "learning_rate": 0.00019898642714584428,
      "loss": 0.6464,
      "step": 225
    },
    {
      "epoch": 4.757894736842105,
      "grad_norm": 0.4769364893436432,
      "learning_rate": 0.00019897699278036108,
      "loss": 0.6946,
      "step": 226
    },
    {
      "epoch": 4.778947368421052,
      "grad_norm": 0.4480087459087372,
      "learning_rate": 0.00019896751493610272,
      "loss": 0.5716,
      "step": 227
    },
    {
      "epoch": 4.8,
      "grad_norm": 0.4228164851665497,
      "learning_rate": 0.00019895799361723272,
      "loss": 0.6828,
      "step": 228
    },
    {
      "epoch": 4.821052631578947,
      "grad_norm": 0.488390177488327,
      "learning_rate": 0.00019894842882793362,
      "loss": 0.7358,
      "step": 229
    },
    {
      "epoch": 4.842105263157895,
      "grad_norm": 0.4492754340171814,
      "learning_rate": 0.000198938820572407,
      "loss": 0.6468,
      "step": 230
    },
    {
      "epoch": 4.863157894736842,
      "grad_norm": 0.48078247904777527,
      "learning_rate": 0.00019892916885487356,
      "loss": 0.7425,
      "step": 231
    },
    {
      "epoch": 4.88421052631579,
      "grad_norm": 0.3910196125507355,
      "learning_rate": 0.00019891947367957322,
      "loss": 0.7077,
      "step": 232
    },
    {
      "epoch": 4.905263157894737,
      "grad_norm": 0.5010913610458374,
      "learning_rate": 0.00019890973505076485,
      "loss": 0.791,
      "step": 233
    },
    {
      "epoch": 4.926315789473684,
      "grad_norm": 0.3905860483646393,
      "learning_rate": 0.00019889995297272647,
      "loss": 0.6987,
      "step": 234
    },
    {
      "epoch": 4.947368421052632,
      "grad_norm": 0.5714398622512817,
      "learning_rate": 0.00019889012744975508,
      "loss": 0.6656,
      "step": 235
    },
    {
      "epoch": 4.968421052631579,
      "grad_norm": 0.47835731506347656,
      "learning_rate": 0.00019888025848616695,
      "loss": 0.6948,
      "step": 236
    },
    {
      "epoch": 4.989473684210527,
      "grad_norm": 0.4666154086589813,
      "learning_rate": 0.00019887034608629734,
      "loss": 0.7628,
      "step": 237
    },
    {
      "epoch": 5.010526315789473,
      "grad_norm": 0.470474511384964,
      "learning_rate": 0.0001988603902545005,
      "loss": 0.529,
      "step": 238
    },
    {
      "epoch": 5.031578947368421,
      "grad_norm": 0.4203038513660431,
      "learning_rate": 0.00019885039099514992,
      "loss": 0.529,
      "step": 239
    },
    {
      "epoch": 5.052631578947368,
      "grad_norm": 0.46336621046066284,
      "learning_rate": 0.00019884034831263808,
      "loss": 0.4866,
      "step": 240
    },
    {
      "epoch": 5.052631578947368,
      "eval_loss": 1.4474551677703857,
      "eval_runtime": 15.0991,
      "eval_samples_per_second": 6.623,
      "eval_steps_per_second": 0.662,
      "step": 240
    },
    {
      "epoch": 5.073684210526316,
      "grad_norm": 0.4180259704589844,
      "learning_rate": 0.00019883026221137652,
      "loss": 0.526,
      "step": 241
    },
    {
      "epoch": 5.094736842105263,
      "grad_norm": 0.42054420709609985,
      "learning_rate": 0.00019882013269579584,
      "loss": 0.4412,
      "step": 242
    },
    {
      "epoch": 5.11578947368421,
      "grad_norm": 0.5846607089042664,
      "learning_rate": 0.00019880995977034584,
      "loss": 0.5306,
      "step": 243
    },
    {
      "epoch": 5.136842105263158,
      "grad_norm": 0.6321561932563782,
      "learning_rate": 0.00019879974343949526,
      "loss": 0.575,
      "step": 244
    },
    {
      "epoch": 5.157894736842105,
      "grad_norm": 0.48956233263015747,
      "learning_rate": 0.00019878948370773193,
      "loss": 0.4667,
      "step": 245
    },
    {
      "epoch": 5.178947368421053,
      "grad_norm": 0.49197542667388916,
      "learning_rate": 0.00019877918057956278,
      "loss": 0.473,
      "step": 246
    },
    {
      "epoch": 5.2,
      "grad_norm": 0.5268818736076355,
      "learning_rate": 0.00019876883405951377,
      "loss": 0.6249,
      "step": 247
    },
    {
      "epoch": 5.221052631578948,
      "grad_norm": 0.4883573651313782,
      "learning_rate": 0.00019875844415212997,
      "loss": 0.5239,
      "step": 248
    },
    {
      "epoch": 5.242105263157895,
      "grad_norm": 0.45860010385513306,
      "learning_rate": 0.00019874801086197544,
      "loss": 0.5462,
      "step": 249
    },
    {
      "epoch": 5.2631578947368425,
      "grad_norm": 0.41302675008773804,
      "learning_rate": 0.00019873753419363336,
      "loss": 0.5144,
      "step": 250
    },
    {
      "epoch": 5.284210526315789,
      "grad_norm": 0.550791323184967,
      "learning_rate": 0.00019872701415170593,
      "loss": 0.5071,
      "step": 251
    },
    {
      "epoch": 5.3052631578947365,
      "grad_norm": 0.4419604539871216,
      "learning_rate": 0.00019871645074081434,
      "loss": 0.4598,
      "step": 252
    },
    {
      "epoch": 5.326315789473684,
      "grad_norm": 0.5271047353744507,
      "learning_rate": 0.00019870584396559902,
      "loss": 0.5444,
      "step": 253
    },
    {
      "epoch": 5.347368421052631,
      "grad_norm": 0.4978967308998108,
      "learning_rate": 0.00019869519383071928,
      "loss": 0.5829,
      "step": 254
    },
    {
      "epoch": 5.368421052631579,
      "grad_norm": 0.5046519041061401,
      "learning_rate": 0.00019868450034085352,
      "loss": 0.5343,
      "step": 255
    },
    {
      "epoch": 5.389473684210526,
      "grad_norm": 0.5924373865127563,
      "learning_rate": 0.0001986737635006992,
      "loss": 0.514,
      "step": 256
    },
    {
      "epoch": 5.410526315789474,
      "grad_norm": 0.47235432267189026,
      "learning_rate": 0.00019866298331497283,
      "loss": 0.4899,
      "step": 257
    },
    {
      "epoch": 5.431578947368421,
      "grad_norm": 0.49679791927337646,
      "learning_rate": 0.0001986521597884099,
      "loss": 0.5483,
      "step": 258
    },
    {
      "epoch": 5.4526315789473685,
      "grad_norm": 0.4871433973312378,
      "learning_rate": 0.00019864129292576505,
      "loss": 0.5544,
      "step": 259
    },
    {
      "epoch": 5.473684210526316,
      "grad_norm": 0.5678947567939758,
      "learning_rate": 0.00019863038273181186,
      "loss": 0.5298,
      "step": 260
    },
    {
      "epoch": 5.473684210526316,
      "eval_loss": 1.5484461784362793,
      "eval_runtime": 15.0951,
      "eval_samples_per_second": 6.625,
      "eval_steps_per_second": 0.662,
      "step": 260
    },
    {
      "epoch": 5.494736842105263,
      "grad_norm": 0.541333019733429,
      "learning_rate": 0.00019861942921134298,
      "loss": 0.5321,
      "step": 261
    },
    {
      "epoch": 5.515789473684211,
      "grad_norm": 0.45463690161705017,
      "learning_rate": 0.0001986084323691701,
      "loss": 0.5239,
      "step": 262
    },
    {
      "epoch": 5.536842105263158,
      "grad_norm": 0.5732460618019104,
      "learning_rate": 0.0001985973922101239,
      "loss": 0.4861,
      "step": 263
    },
    {
      "epoch": 5.557894736842105,
      "grad_norm": 0.4361143112182617,
      "learning_rate": 0.00019858630873905418,
      "loss": 0.5427,
      "step": 264
    },
    {
      "epoch": 5.578947368421053,
      "grad_norm": 0.48954471945762634,
      "learning_rate": 0.00019857518196082964,
      "loss": 0.5614,
      "step": 265
    },
    {
      "epoch": 5.6,
      "grad_norm": 0.5832586884498596,
      "learning_rate": 0.0001985640118803381,
      "loss": 0.4603,
      "step": 266
    },
    {
      "epoch": 5.621052631578947,
      "grad_norm": 0.5026202201843262,
      "learning_rate": 0.0001985527985024864,
      "loss": 0.6399,
      "step": 267
    },
    {
      "epoch": 5.6421052631578945,
      "grad_norm": 0.4579145908355713,
      "learning_rate": 0.0001985415418322003,
      "loss": 0.5354,
      "step": 268
    },
    {
      "epoch": 5.663157894736842,
      "grad_norm": 0.545054018497467,
      "learning_rate": 0.00019853024187442472,
      "loss": 0.5158,
      "step": 269
    },
    {
      "epoch": 5.684210526315789,
      "grad_norm": 0.48174452781677246,
      "learning_rate": 0.00019851889863412345,
      "loss": 0.5014,
      "step": 270
    },
    {
      "epoch": 5.705263157894737,
      "grad_norm": 0.5417779684066772,
      "learning_rate": 0.00019850751211627945,
      "loss": 0.54,
      "step": 271
    },
    {
      "epoch": 5.726315789473684,
      "grad_norm": 0.46869099140167236,
      "learning_rate": 0.00019849608232589457,
      "loss": 0.5416,
      "step": 272
    },
    {
      "epoch": 5.747368421052632,
      "grad_norm": 0.6471317410469055,
      "learning_rate": 0.00019848460926798968,
      "loss": 0.5962,
      "step": 273
    },
    {
      "epoch": 5.768421052631579,
      "grad_norm": 0.5855197310447693,
      "learning_rate": 0.00019847309294760473,
      "loss": 0.6314,
      "step": 274
    },
    {
      "epoch": 5.7894736842105265,
      "grad_norm": 0.5380208492279053,
      "learning_rate": 0.00019846153336979856,
      "loss": 0.5651,
      "step": 275
    },
    {
      "epoch": 5.810526315789474,
      "grad_norm": 0.46017733216285706,
      "learning_rate": 0.00019844993053964917,
      "loss": 0.5575,
      "step": 276
    },
    {
      "epoch": 5.831578947368421,
      "grad_norm": 0.49735313653945923,
      "learning_rate": 0.00019843828446225342,
      "loss": 0.5628,
      "step": 277
    },
    {
      "epoch": 5.852631578947369,
      "grad_norm": 0.5164270401000977,
      "learning_rate": 0.0001984265951427272,
      "loss": 0.5026,
      "step": 278
    },
    {
      "epoch": 5.873684210526315,
      "grad_norm": 0.5263252258300781,
      "learning_rate": 0.00019841486258620545,
      "loss": 0.5588,
      "step": 279
    },
    {
      "epoch": 5.894736842105263,
      "grad_norm": 0.47757405042648315,
      "learning_rate": 0.00019840308679784207,
      "loss": 0.5671,
      "step": 280
    },
    {
      "epoch": 5.894736842105263,
      "eval_loss": 1.5009753704071045,
      "eval_runtime": 15.1017,
      "eval_samples_per_second": 6.622,
      "eval_steps_per_second": 0.662,
      "step": 280
    }
  ],
  "logging_steps": 1,
  "max_steps": 4750,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 102,
  "save_steps": 20,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 10,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 10
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.740650038525952e+17,
  "train_batch_size": 10,
  "trial_name": null,
  "trial_params": null
}