|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9996519317786287, |
|
"eval_steps": 180, |
|
"global_step": 718, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 0.08854816108942032, |
|
"learning_rate": 2e-05, |
|
"loss": 1.1956, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"eval_loss": 1.1464052200317383, |
|
"eval_runtime": 96.6552, |
|
"eval_samples_per_second": 51.73, |
|
"eval_steps_per_second": 12.933, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 0.10053814947605133, |
|
"learning_rate": 4e-05, |
|
"loss": 1.1428, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 0.09509150683879852, |
|
"learning_rate": 6e-05, |
|
"loss": 1.1963, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.0889890268445015, |
|
"learning_rate": 8e-05, |
|
"loss": 1.2185, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.0988571047782898, |
|
"learning_rate": 0.0001, |
|
"loss": 1.1157, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.089688241481781, |
|
"learning_rate": 0.00012, |
|
"loss": 1.2244, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.08509726822376251, |
|
"learning_rate": 0.00014, |
|
"loss": 1.2232, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.07318675518035889, |
|
"learning_rate": 0.00016, |
|
"loss": 1.131, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.07840697467327118, |
|
"learning_rate": 0.00018, |
|
"loss": 1.2006, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.08998328447341919, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0947, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.110952228307724, |
|
"learning_rate": 0.0001999998926455355, |
|
"loss": 1.0462, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.08632127940654755, |
|
"learning_rate": 0.0001999995705823725, |
|
"loss": 1.1094, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.06929520517587662, |
|
"learning_rate": 0.00019999903381120245, |
|
"loss": 1.1388, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.058088913559913635, |
|
"learning_rate": 0.0001999982823331779, |
|
"loss": 1.0518, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.05951884761452675, |
|
"learning_rate": 0.0001999973161499123, |
|
"loss": 1.1296, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.06432141363620758, |
|
"learning_rate": 0.00019999613526348019, |
|
"loss": 1.2231, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.05998155102133751, |
|
"learning_rate": 0.00019999473967641696, |
|
"loss": 1.1548, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.06434385478496552, |
|
"learning_rate": 0.00019999312939171914, |
|
"loss": 1.2023, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.059631019830703735, |
|
"learning_rate": 0.00019999130441284408, |
|
"loss": 1.176, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.06641547381877899, |
|
"learning_rate": 0.00019998926474371022, |
|
"loss": 1.1081, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.05724109336733818, |
|
"learning_rate": 0.0001999870103886969, |
|
"loss": 1.1644, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.05426528677344322, |
|
"learning_rate": 0.00019998454135264444, |
|
"loss": 1.0424, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.06212550774216652, |
|
"learning_rate": 0.00019998185764085404, |
|
"loss": 1.1966, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.05805288627743721, |
|
"learning_rate": 0.0001999789592590879, |
|
"loss": 1.0189, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.06322558224201202, |
|
"learning_rate": 0.00019997584621356907, |
|
"loss": 1.1605, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.05367085337638855, |
|
"learning_rate": 0.0001999725185109816, |
|
"loss": 1.1008, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.06441543996334076, |
|
"learning_rate": 0.00019996897615847032, |
|
"loss": 1.1974, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.06023159250617027, |
|
"learning_rate": 0.00019996521916364096, |
|
"loss": 1.0898, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.05423330143094063, |
|
"learning_rate": 0.00019996124753456017, |
|
"loss": 1.1021, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.05826232209801674, |
|
"learning_rate": 0.00019995706127975537, |
|
"loss": 1.1573, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.06136321276426315, |
|
"learning_rate": 0.00019995266040821483, |
|
"loss": 1.0987, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.057079821825027466, |
|
"learning_rate": 0.0001999480449293876, |
|
"loss": 1.1706, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.0606551319360733, |
|
"learning_rate": 0.00019994321485318354, |
|
"loss": 1.1549, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.059561554342508316, |
|
"learning_rate": 0.00019993817018997323, |
|
"loss": 1.1983, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.05861816182732582, |
|
"learning_rate": 0.00019993291095058804, |
|
"loss": 1.1644, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.05660370737314224, |
|
"learning_rate": 0.00019992743714632, |
|
"loss": 1.1671, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.05508609116077423, |
|
"learning_rate": 0.00019992174878892186, |
|
"loss": 1.2054, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.058200910687446594, |
|
"learning_rate": 0.0001999158458906071, |
|
"loss": 1.1693, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.05510352924466133, |
|
"learning_rate": 0.00019990972846404967, |
|
"loss": 1.0654, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.05602727457880974, |
|
"learning_rate": 0.0001999033965223843, |
|
"loss": 1.0574, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.06220879405736923, |
|
"learning_rate": 0.0001998968500792062, |
|
"loss": 1.1197, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.0568472184240818, |
|
"learning_rate": 0.00019989008914857116, |
|
"loss": 1.0929, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.06510251760482788, |
|
"learning_rate": 0.0001998831137449955, |
|
"loss": 1.1372, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.05792449042201042, |
|
"learning_rate": 0.00019987592388345611, |
|
"loss": 1.1506, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.05337774381041527, |
|
"learning_rate": 0.00019986851957939016, |
|
"loss": 1.1419, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.05266134813427925, |
|
"learning_rate": 0.00019986090084869545, |
|
"loss": 1.1284, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.05520001798868179, |
|
"learning_rate": 0.00019985306770773, |
|
"loss": 1.1934, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.056133098900318146, |
|
"learning_rate": 0.00019984502017331225, |
|
"loss": 1.1348, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.05897597223520279, |
|
"learning_rate": 0.00019983675826272106, |
|
"loss": 1.0986, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.0529349111020565, |
|
"learning_rate": 0.00019982828199369541, |
|
"loss": 1.0987, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.0615835078060627, |
|
"learning_rate": 0.00019981959138443467, |
|
"loss": 1.1104, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.059136223047971725, |
|
"learning_rate": 0.0001998106864535983, |
|
"loss": 1.1611, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.05952458456158638, |
|
"learning_rate": 0.000199801567220306, |
|
"loss": 1.0697, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.059737298637628555, |
|
"learning_rate": 0.00019979223370413763, |
|
"loss": 1.1294, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.05678507685661316, |
|
"learning_rate": 0.00019978268592513296, |
|
"loss": 1.1584, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.05472412705421448, |
|
"learning_rate": 0.00019977292390379207, |
|
"loss": 1.1416, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.05907364934682846, |
|
"learning_rate": 0.00019976294766107478, |
|
"loss": 1.141, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.05549243837594986, |
|
"learning_rate": 0.00019975275721840103, |
|
"loss": 1.0759, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.062153417617082596, |
|
"learning_rate": 0.0001997423525976506, |
|
"loss": 1.1715, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.05978319048881531, |
|
"learning_rate": 0.0001997317338211631, |
|
"loss": 1.1452, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.053922947496175766, |
|
"learning_rate": 0.00019972090091173805, |
|
"loss": 1.204, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.055929865688085556, |
|
"learning_rate": 0.00019970985389263467, |
|
"loss": 1.1136, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.05876065790653229, |
|
"learning_rate": 0.00019969859278757185, |
|
"loss": 1.1364, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.06039357930421829, |
|
"learning_rate": 0.0001996871176207282, |
|
"loss": 1.0784, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.055668413639068604, |
|
"learning_rate": 0.00019967542841674196, |
|
"loss": 1.0948, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.054827943444252014, |
|
"learning_rate": 0.0001996635252007109, |
|
"loss": 1.1803, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.05121610313653946, |
|
"learning_rate": 0.00019965140799819224, |
|
"loss": 1.0376, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.05753542110323906, |
|
"learning_rate": 0.00019963907683520274, |
|
"loss": 1.1206, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.055351294577121735, |
|
"learning_rate": 0.00019962653173821843, |
|
"loss": 1.1842, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.061469532549381256, |
|
"learning_rate": 0.00019961377273417487, |
|
"loss": 1.0909, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.054115746170282364, |
|
"learning_rate": 0.0001996007998504667, |
|
"loss": 1.0843, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.05452101677656174, |
|
"learning_rate": 0.0001995876131149479, |
|
"loss": 1.1006, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.055213477462530136, |
|
"learning_rate": 0.00019957421255593154, |
|
"loss": 1.1685, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.055633626878261566, |
|
"learning_rate": 0.00019956059820218982, |
|
"loss": 1.1772, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.05437927320599556, |
|
"learning_rate": 0.000199546770082954, |
|
"loss": 1.1043, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.058048781007528305, |
|
"learning_rate": 0.00019953272822791424, |
|
"loss": 1.1045, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.06041900813579559, |
|
"learning_rate": 0.0001995184726672197, |
|
"loss": 1.1975, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.060865532606840134, |
|
"learning_rate": 0.00019950400343147833, |
|
"loss": 1.1731, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.056755781173706055, |
|
"learning_rate": 0.00019948932055175686, |
|
"loss": 1.0745, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.058669883757829666, |
|
"learning_rate": 0.00019947442405958074, |
|
"loss": 1.0734, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.058248650282621384, |
|
"learning_rate": 0.00019945931398693408, |
|
"loss": 1.1543, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.060298267751932144, |
|
"learning_rate": 0.0001994439903662596, |
|
"loss": 1.1762, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.05813004449009895, |
|
"learning_rate": 0.00019942845323045837, |
|
"loss": 1.1652, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.056509047746658325, |
|
"learning_rate": 0.00019941270261289012, |
|
"loss": 1.1391, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.05784199759364128, |
|
"learning_rate": 0.00019939673854737277, |
|
"loss": 1.1592, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.05908602103590965, |
|
"learning_rate": 0.00019938056106818261, |
|
"loss": 1.1867, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.05213363468647003, |
|
"learning_rate": 0.00019936417021005414, |
|
"loss": 1.0747, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.0627497211098671, |
|
"learning_rate": 0.00019934756600817997, |
|
"loss": 1.1086, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.05405977740883827, |
|
"learning_rate": 0.00019933074849821084, |
|
"loss": 0.9786, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.057349707931280136, |
|
"learning_rate": 0.00019931371771625544, |
|
"loss": 1.1537, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.05519171431660652, |
|
"learning_rate": 0.00019929647369888034, |
|
"loss": 1.1859, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.05836858972907066, |
|
"learning_rate": 0.00019927901648311003, |
|
"loss": 1.1702, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.058726366609334946, |
|
"learning_rate": 0.00019926134610642667, |
|
"loss": 1.0712, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.06261308491230011, |
|
"learning_rate": 0.0001992434626067702, |
|
"loss": 1.0476, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.05999467894434929, |
|
"learning_rate": 0.00019922536602253802, |
|
"loss": 1.0721, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.06016852334141731, |
|
"learning_rate": 0.00019920705639258517, |
|
"loss": 1.1634, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.055001530796289444, |
|
"learning_rate": 0.00019918853375622402, |
|
"loss": 1.1175, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.05680612102150917, |
|
"learning_rate": 0.00019916979815322433, |
|
"loss": 1.1508, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.061409611254930496, |
|
"learning_rate": 0.0001991508496238131, |
|
"loss": 1.0693, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.06438402086496353, |
|
"learning_rate": 0.00019913168820867458, |
|
"loss": 1.1216, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.06238175556063652, |
|
"learning_rate": 0.00019911231394894997, |
|
"loss": 1.1668, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.057234276086091995, |
|
"learning_rate": 0.00019909272688623756, |
|
"loss": 1.1387, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.07677511870861053, |
|
"learning_rate": 0.00019907292706259253, |
|
"loss": 1.2115, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.059833280742168427, |
|
"learning_rate": 0.00019905291452052687, |
|
"loss": 1.1444, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.05804947763681412, |
|
"learning_rate": 0.00019903268930300926, |
|
"loss": 1.1475, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.06504326313734055, |
|
"learning_rate": 0.0001990122514534651, |
|
"loss": 1.1432, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.057522453367710114, |
|
"learning_rate": 0.00019899160101577624, |
|
"loss": 1.1549, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.06902632117271423, |
|
"learning_rate": 0.00019897073803428104, |
|
"loss": 1.1719, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.05860123783349991, |
|
"learning_rate": 0.00019894966255377416, |
|
"loss": 1.0437, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.05878465995192528, |
|
"learning_rate": 0.00019892837461950652, |
|
"loss": 1.1757, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.05310744047164917, |
|
"learning_rate": 0.00019890687427718528, |
|
"loss": 1.0382, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.05617301166057587, |
|
"learning_rate": 0.00019888516157297358, |
|
"loss": 1.0004, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.05957025662064552, |
|
"learning_rate": 0.00019886323655349047, |
|
"loss": 1.0967, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.05660326033830643, |
|
"learning_rate": 0.00019884109926581096, |
|
"loss": 1.1104, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.0532040037214756, |
|
"learning_rate": 0.00019881874975746582, |
|
"loss": 1.1462, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.054993633180856705, |
|
"learning_rate": 0.00019879618807644138, |
|
"loss": 1.1964, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.06195160746574402, |
|
"learning_rate": 0.00019877341427117966, |
|
"loss": 1.1926, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.0648837760090828, |
|
"learning_rate": 0.00019875042839057798, |
|
"loss": 1.1438, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.055000726133584976, |
|
"learning_rate": 0.00019872723048398912, |
|
"loss": 1.1134, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.06383106857538223, |
|
"learning_rate": 0.000198703820601221, |
|
"loss": 1.1825, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.059399884194135666, |
|
"learning_rate": 0.00019868019879253684, |
|
"loss": 1.1175, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.061064597219228745, |
|
"learning_rate": 0.00019865636510865467, |
|
"loss": 1.1299, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.05886513367295265, |
|
"learning_rate": 0.00019863231960074758, |
|
"loss": 1.1096, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.05704960227012634, |
|
"learning_rate": 0.00019860806232044337, |
|
"loss": 1.1557, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.05626309663057327, |
|
"learning_rate": 0.00019858359331982467, |
|
"loss": 1.1348, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.05755053088068962, |
|
"learning_rate": 0.0001985589126514286, |
|
"loss": 1.2132, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.05787818506360054, |
|
"learning_rate": 0.0001985340203682467, |
|
"loss": 1.146, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.05755281075835228, |
|
"learning_rate": 0.0001985089165237249, |
|
"loss": 1.1248, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.05296372249722481, |
|
"learning_rate": 0.00019848360117176352, |
|
"loss": 1.1296, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.05723942816257477, |
|
"learning_rate": 0.0001984580743667168, |
|
"loss": 1.0936, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.057743217796087265, |
|
"learning_rate": 0.00019843233616339306, |
|
"loss": 1.1184, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.05883733928203583, |
|
"learning_rate": 0.00019840638661705454, |
|
"loss": 1.1413, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.06587976962327957, |
|
"learning_rate": 0.00019838022578341725, |
|
"loss": 1.1704, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.053919676691293716, |
|
"learning_rate": 0.0001983538537186508, |
|
"loss": 1.08, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.05607760325074196, |
|
"learning_rate": 0.0001983272704793784, |
|
"loss": 1.095, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.055869605392217636, |
|
"learning_rate": 0.00019830047612267663, |
|
"loss": 1.1441, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.058360978960990906, |
|
"learning_rate": 0.00019827347070607536, |
|
"loss": 1.1664, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.0630207508802414, |
|
"learning_rate": 0.0001982462542875576, |
|
"loss": 1.2234, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.05601505562663078, |
|
"learning_rate": 0.0001982188269255595, |
|
"loss": 1.1665, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.06406079232692719, |
|
"learning_rate": 0.00019819118867897003, |
|
"loss": 1.21, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.05720619857311249, |
|
"learning_rate": 0.00019816333960713094, |
|
"loss": 1.0835, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.05414717271924019, |
|
"learning_rate": 0.0001981352797698367, |
|
"loss": 1.0968, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.060023292899131775, |
|
"learning_rate": 0.00019810700922733428, |
|
"loss": 1.12, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.06230263411998749, |
|
"learning_rate": 0.00019807852804032305, |
|
"loss": 1.2331, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.05766064673662186, |
|
"learning_rate": 0.00019804983626995468, |
|
"loss": 1.1895, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.06487365067005157, |
|
"learning_rate": 0.00019802093397783296, |
|
"loss": 1.116, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.05351267755031586, |
|
"learning_rate": 0.0001979918212260137, |
|
"loss": 1.1481, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.06439248472452164, |
|
"learning_rate": 0.00019796249807700457, |
|
"loss": 1.2206, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.06001568213105202, |
|
"learning_rate": 0.00019793296459376494, |
|
"loss": 1.1055, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.055722612887620926, |
|
"learning_rate": 0.0001979032208397059, |
|
"loss": 1.0998, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.06338882446289062, |
|
"learning_rate": 0.00019787326687868993, |
|
"loss": 1.0981, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.053316015750169754, |
|
"learning_rate": 0.00019784310277503085, |
|
"loss": 1.0689, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.055426474660634995, |
|
"learning_rate": 0.00019781272859349368, |
|
"loss": 1.1028, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.08009687811136246, |
|
"learning_rate": 0.00019778214439929452, |
|
"loss": 1.1934, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.06281601637601852, |
|
"learning_rate": 0.00019775135025810035, |
|
"loss": 1.1871, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.05907437577843666, |
|
"learning_rate": 0.00019772034623602894, |
|
"loss": 1.1801, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.05395223945379257, |
|
"learning_rate": 0.00019768913239964872, |
|
"loss": 1.1755, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.05565226450562477, |
|
"learning_rate": 0.00019765770881597855, |
|
"loss": 1.1147, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.06353943049907684, |
|
"learning_rate": 0.0001976260755524877, |
|
"loss": 1.148, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.056039877235889435, |
|
"learning_rate": 0.00019759423267709555, |
|
"loss": 1.1285, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.06256404519081116, |
|
"learning_rate": 0.00019756218025817165, |
|
"loss": 1.1148, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.06320231407880783, |
|
"learning_rate": 0.00019752991836453543, |
|
"loss": 1.1897, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.06373140960931778, |
|
"learning_rate": 0.00019749744706545598, |
|
"loss": 1.147, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.061453696340322495, |
|
"learning_rate": 0.00019746476643065216, |
|
"loss": 1.1333, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.06347192078828812, |
|
"learning_rate": 0.00019743187653029214, |
|
"loss": 1.1834, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.058654770255088806, |
|
"learning_rate": 0.00019739877743499352, |
|
"loss": 1.1774, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.05696241557598114, |
|
"learning_rate": 0.000197365469215823, |
|
"loss": 1.1321, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.057179030030965805, |
|
"learning_rate": 0.00019733195194429628, |
|
"loss": 1.0855, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.058308880776166916, |
|
"learning_rate": 0.00019729822569237797, |
|
"loss": 1.0362, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.06666514277458191, |
|
"learning_rate": 0.0001972642905324813, |
|
"loss": 1.1118, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.05894763022661209, |
|
"learning_rate": 0.00019723014653746815, |
|
"loss": 1.1454, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.06241476908326149, |
|
"learning_rate": 0.00019719579378064869, |
|
"loss": 1.1657, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.05953037366271019, |
|
"learning_rate": 0.00019716123233578134, |
|
"loss": 1.1378, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.05746786668896675, |
|
"learning_rate": 0.00019712646227707263, |
|
"loss": 1.1115, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.05923418328166008, |
|
"learning_rate": 0.00019709148367917697, |
|
"loss": 1.0896, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.05635738745331764, |
|
"learning_rate": 0.00019705629661719652, |
|
"loss": 1.1049, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.05885479226708412, |
|
"learning_rate": 0.0001970209011666811, |
|
"loss": 1.1252, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.057710710912942886, |
|
"learning_rate": 0.00019698529740362785, |
|
"loss": 1.1097, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.06445460021495819, |
|
"learning_rate": 0.00019694948540448123, |
|
"loss": 1.2045, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.05737968161702156, |
|
"learning_rate": 0.00019691346524613286, |
|
"loss": 1.1544, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"eval_loss": 1.0905169248580933, |
|
"eval_runtime": 97.1012, |
|
"eval_samples_per_second": 51.493, |
|
"eval_steps_per_second": 12.873, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.06000959873199463, |
|
"learning_rate": 0.00019687723700592116, |
|
"loss": 1.14, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.06186012923717499, |
|
"learning_rate": 0.00019684080076163142, |
|
"loss": 1.1284, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.061752576380968094, |
|
"learning_rate": 0.00019680415659149554, |
|
"loss": 1.1719, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.05509311705827713, |
|
"learning_rate": 0.00019676730457419178, |
|
"loss": 1.1155, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.06091266870498657, |
|
"learning_rate": 0.00019673024478884473, |
|
"loss": 1.0099, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.065452940762043, |
|
"learning_rate": 0.00019669297731502507, |
|
"loss": 1.1272, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.05998782813549042, |
|
"learning_rate": 0.00019665550223274937, |
|
"loss": 1.0957, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.06403583288192749, |
|
"learning_rate": 0.00019661781962248003, |
|
"loss": 1.137, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.06362204998731613, |
|
"learning_rate": 0.0001965799295651249, |
|
"loss": 1.1025, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.06712298840284348, |
|
"learning_rate": 0.0001965418321420374, |
|
"loss": 1.1069, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.059146981686353683, |
|
"learning_rate": 0.00019650352743501604, |
|
"loss": 1.1502, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.0600997731089592, |
|
"learning_rate": 0.00019646501552630444, |
|
"loss": 1.1693, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.06209671124815941, |
|
"learning_rate": 0.0001964262964985912, |
|
"loss": 1.1384, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.061546314507722855, |
|
"learning_rate": 0.0001963873704350094, |
|
"loss": 1.1485, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.06550047546625137, |
|
"learning_rate": 0.00019634823741913687, |
|
"loss": 1.1136, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.05800144001841545, |
|
"learning_rate": 0.0001963088975349956, |
|
"loss": 1.1783, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.06252887099981308, |
|
"learning_rate": 0.00019626935086705194, |
|
"loss": 1.1337, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.06455714255571365, |
|
"learning_rate": 0.00019622959750021605, |
|
"loss": 1.1107, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.06395559012889862, |
|
"learning_rate": 0.00019618963751984195, |
|
"loss": 1.0746, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.05962133780121803, |
|
"learning_rate": 0.00019614947101172732, |
|
"loss": 1.2018, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.06970924139022827, |
|
"learning_rate": 0.00019610909806211323, |
|
"loss": 1.2098, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.06366313248872757, |
|
"learning_rate": 0.000196068518757684, |
|
"loss": 1.0914, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.05959942191839218, |
|
"learning_rate": 0.00019602773318556707, |
|
"loss": 1.1557, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.06882765144109726, |
|
"learning_rate": 0.00019598674143333263, |
|
"loss": 1.2375, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.06210785359144211, |
|
"learning_rate": 0.00019594554358899366, |
|
"loss": 1.1748, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.06589924544095993, |
|
"learning_rate": 0.0001959041397410056, |
|
"loss": 1.1242, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.05691949278116226, |
|
"learning_rate": 0.00019586252997826626, |
|
"loss": 1.1532, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.061684492975473404, |
|
"learning_rate": 0.00019582071439011546, |
|
"loss": 1.1318, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.06065653637051582, |
|
"learning_rate": 0.00019577869306633503, |
|
"loss": 1.1281, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.057913534343242645, |
|
"learning_rate": 0.0001957364660971485, |
|
"loss": 1.1318, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.06302294880151749, |
|
"learning_rate": 0.0001956940335732209, |
|
"loss": 1.1971, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.0630715936422348, |
|
"learning_rate": 0.0001956513955856587, |
|
"loss": 1.215, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.067079097032547, |
|
"learning_rate": 0.00019560855222600947, |
|
"loss": 1.1484, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.05915733054280281, |
|
"learning_rate": 0.0001955655035862617, |
|
"loss": 1.0748, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.07278767973184586, |
|
"learning_rate": 0.00019552224975884466, |
|
"loss": 1.0966, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.05778686702251434, |
|
"learning_rate": 0.00019547879083662819, |
|
"loss": 1.0561, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.06521858274936676, |
|
"learning_rate": 0.00019543512691292251, |
|
"loss": 1.1249, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.058512818068265915, |
|
"learning_rate": 0.0001953912580814779, |
|
"loss": 1.0463, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.06136133521795273, |
|
"learning_rate": 0.00019534718443648473, |
|
"loss": 1.1914, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.05871862918138504, |
|
"learning_rate": 0.000195302906072573, |
|
"loss": 1.0887, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.05922750383615494, |
|
"learning_rate": 0.00019525842308481227, |
|
"loss": 1.0888, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.06273704022169113, |
|
"learning_rate": 0.0001952137355687116, |
|
"loss": 1.1218, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.05759681016206741, |
|
"learning_rate": 0.000195168843620219, |
|
"loss": 1.1278, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.060312505811452866, |
|
"learning_rate": 0.00019512374733572153, |
|
"loss": 1.1688, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.06145263463258743, |
|
"learning_rate": 0.0001950784468120449, |
|
"loss": 1.1055, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.06581958383321762, |
|
"learning_rate": 0.00019503294214645337, |
|
"loss": 1.2683, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.06561966240406036, |
|
"learning_rate": 0.00019498723343664955, |
|
"loss": 1.2038, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.06790629774332047, |
|
"learning_rate": 0.00019494132078077414, |
|
"loss": 1.1307, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.06045442819595337, |
|
"learning_rate": 0.0001948952042774057, |
|
"loss": 1.1119, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.06775711476802826, |
|
"learning_rate": 0.00019484888402556045, |
|
"loss": 1.1636, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.06277008354663849, |
|
"learning_rate": 0.00019480236012469213, |
|
"loss": 1.1908, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.0631483793258667, |
|
"learning_rate": 0.00019475563267469173, |
|
"loss": 1.1828, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.06697548180818558, |
|
"learning_rate": 0.0001947087017758872, |
|
"loss": 1.1107, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.06013527512550354, |
|
"learning_rate": 0.00019466156752904343, |
|
"loss": 1.1176, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.06709557771682739, |
|
"learning_rate": 0.00019461423003536182, |
|
"loss": 1.1575, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.057948894798755646, |
|
"learning_rate": 0.0001945666893964802, |
|
"loss": 1.0455, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.057657647877931595, |
|
"learning_rate": 0.00019451894571447258, |
|
"loss": 1.1221, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.06317263096570969, |
|
"learning_rate": 0.0001944709990918489, |
|
"loss": 1.1305, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.06375168263912201, |
|
"learning_rate": 0.00019442284963155487, |
|
"loss": 1.206, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.06475285440683365, |
|
"learning_rate": 0.00019437449743697164, |
|
"loss": 1.204, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.057945527136325836, |
|
"learning_rate": 0.00019432594261191568, |
|
"loss": 0.9143, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.05976686626672745, |
|
"learning_rate": 0.00019427718526063856, |
|
"loss": 1.1177, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.059271346777677536, |
|
"learning_rate": 0.00019422822548782666, |
|
"loss": 1.1089, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.05958615615963936, |
|
"learning_rate": 0.00019417906339860098, |
|
"loss": 1.1451, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.06455380469560623, |
|
"learning_rate": 0.00019412969909851695, |
|
"loss": 1.2423, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.0598178505897522, |
|
"learning_rate": 0.00019408013269356408, |
|
"loss": 1.081, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.06671278178691864, |
|
"learning_rate": 0.0001940303642901659, |
|
"loss": 1.1094, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.06148644909262657, |
|
"learning_rate": 0.0001939803939951796, |
|
"loss": 1.1045, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.06747152656316757, |
|
"learning_rate": 0.00019393022191589588, |
|
"loss": 1.1443, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.060876958072185516, |
|
"learning_rate": 0.00019387984816003867, |
|
"loss": 1.1341, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.05667860805988312, |
|
"learning_rate": 0.00019382927283576488, |
|
"loss": 1.2032, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.06507498770952225, |
|
"learning_rate": 0.0001937784960516643, |
|
"loss": 1.1355, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.05810081586241722, |
|
"learning_rate": 0.0001937275179167592, |
|
"loss": 1.0911, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.06210170313715935, |
|
"learning_rate": 0.00019367633854050422, |
|
"loss": 1.2247, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.060430753976106644, |
|
"learning_rate": 0.00019362495803278598, |
|
"loss": 1.1633, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.0607365183532238, |
|
"learning_rate": 0.0001935733765039231, |
|
"loss": 1.1253, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.062377460300922394, |
|
"learning_rate": 0.00019352159406466564, |
|
"loss": 1.1525, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.06079326942563057, |
|
"learning_rate": 0.00019346961082619522, |
|
"loss": 1.067, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.06413824111223221, |
|
"learning_rate": 0.00019341742690012442, |
|
"loss": 1.0536, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.06453590095043182, |
|
"learning_rate": 0.00019336504239849677, |
|
"loss": 1.1118, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.06608355045318604, |
|
"learning_rate": 0.00019331245743378658, |
|
"loss": 1.2306, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.06034080311655998, |
|
"learning_rate": 0.00019325967211889834, |
|
"loss": 1.0643, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.06985899806022644, |
|
"learning_rate": 0.0001932066865671669, |
|
"loss": 1.1907, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.06518577039241791, |
|
"learning_rate": 0.000193153500892357, |
|
"loss": 1.0499, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.05966721102595329, |
|
"learning_rate": 0.00019310011520866298, |
|
"loss": 1.2111, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.0628088116645813, |
|
"learning_rate": 0.0001930465296307087, |
|
"loss": 1.2063, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.06214950606226921, |
|
"learning_rate": 0.00019299274427354713, |
|
"loss": 1.1141, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.05994017794728279, |
|
"learning_rate": 0.00019293875925266028, |
|
"loss": 1.0661, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.0646560862660408, |
|
"learning_rate": 0.0001928845746839588, |
|
"loss": 1.1208, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.0596354603767395, |
|
"learning_rate": 0.00019283019068378182, |
|
"loss": 1.0839, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.06729655712842941, |
|
"learning_rate": 0.00019277560736889656, |
|
"loss": 1.1983, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.060334715992212296, |
|
"learning_rate": 0.0001927208248564984, |
|
"loss": 1.1733, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.06451458483934402, |
|
"learning_rate": 0.0001926658432642102, |
|
"loss": 1.1861, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.0642094537615776, |
|
"learning_rate": 0.00019261066271008235, |
|
"loss": 1.2082, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.06414438039064407, |
|
"learning_rate": 0.00019255528331259245, |
|
"loss": 1.2229, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.06006895750761032, |
|
"learning_rate": 0.00019249970519064503, |
|
"loss": 1.1456, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.06294744461774826, |
|
"learning_rate": 0.00019244392846357123, |
|
"loss": 1.072, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.06234186887741089, |
|
"learning_rate": 0.0001923879532511287, |
|
"loss": 1.0956, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.06028866767883301, |
|
"learning_rate": 0.00019233177967350117, |
|
"loss": 1.1874, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.06322412937879562, |
|
"learning_rate": 0.0001922754078512984, |
|
"loss": 1.0697, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.059378113597631454, |
|
"learning_rate": 0.00019221883790555567, |
|
"loss": 1.1663, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.062111396342515945, |
|
"learning_rate": 0.00019216206995773373, |
|
"loss": 1.1077, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.06687445938587189, |
|
"learning_rate": 0.00019210510412971844, |
|
"loss": 1.193, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.06198667362332344, |
|
"learning_rate": 0.00019204794054382052, |
|
"loss": 1.2258, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.05913461372256279, |
|
"learning_rate": 0.00019199057932277525, |
|
"loss": 1.141, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.06464271247386932, |
|
"learning_rate": 0.00019193302058974232, |
|
"loss": 1.0863, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.05979037657380104, |
|
"learning_rate": 0.0001918752644683055, |
|
"loss": 1.1686, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.06707989424467087, |
|
"learning_rate": 0.00019181731108247228, |
|
"loss": 1.2413, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.06815943866968155, |
|
"learning_rate": 0.0001917591605566738, |
|
"loss": 1.2022, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.06593851000070572, |
|
"learning_rate": 0.00019170081301576444, |
|
"loss": 1.0464, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.06178038939833641, |
|
"learning_rate": 0.00019164226858502153, |
|
"loss": 1.1011, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.06253184378147125, |
|
"learning_rate": 0.00019158352739014523, |
|
"loss": 1.0302, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.06402178853750229, |
|
"learning_rate": 0.0001915245895572581, |
|
"loss": 1.1971, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.0673704445362091, |
|
"learning_rate": 0.00019146545521290495, |
|
"loss": 1.1555, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.06606754660606384, |
|
"learning_rate": 0.00019140612448405252, |
|
"loss": 1.1057, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.060698218643665314, |
|
"learning_rate": 0.00019134659749808913, |
|
"loss": 1.1279, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.06506410986185074, |
|
"learning_rate": 0.00019128687438282457, |
|
"loss": 1.1735, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.061889030039310455, |
|
"learning_rate": 0.00019122695526648968, |
|
"loss": 1.1158, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.06519250571727753, |
|
"learning_rate": 0.0001911668402777362, |
|
"loss": 1.1934, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.06114347651600838, |
|
"learning_rate": 0.00019110652954563631, |
|
"loss": 1.1105, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.06125333905220032, |
|
"learning_rate": 0.0001910460231996826, |
|
"loss": 1.0982, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.06343477964401245, |
|
"learning_rate": 0.00019098532136978754, |
|
"loss": 1.1342, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.060326121747493744, |
|
"learning_rate": 0.00019092442418628343, |
|
"loss": 1.1897, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.06917954236268997, |
|
"learning_rate": 0.00019086333177992191, |
|
"loss": 1.0707, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.06384435296058655, |
|
"learning_rate": 0.00019080204428187388, |
|
"loss": 1.0565, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.06388357281684875, |
|
"learning_rate": 0.00019074056182372907, |
|
"loss": 1.1176, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.0598420649766922, |
|
"learning_rate": 0.00019067888453749575, |
|
"loss": 1.1335, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.05912897735834122, |
|
"learning_rate": 0.0001906170125556006, |
|
"loss": 1.0945, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.0659290999174118, |
|
"learning_rate": 0.00019055494601088834, |
|
"loss": 1.1361, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.0634288564324379, |
|
"learning_rate": 0.00019049268503662126, |
|
"loss": 1.1754, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.06105343624949455, |
|
"learning_rate": 0.00019043022976647934, |
|
"loss": 1.1168, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.06428004801273346, |
|
"learning_rate": 0.00019036758033455956, |
|
"loss": 1.0972, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.061880819499492645, |
|
"learning_rate": 0.0001903047368753759, |
|
"loss": 1.1545, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.07274632155895233, |
|
"learning_rate": 0.00019024169952385885, |
|
"loss": 1.1257, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.06646310538053513, |
|
"learning_rate": 0.00019017846841535522, |
|
"loss": 1.154, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.06078438088297844, |
|
"learning_rate": 0.00019011504368562782, |
|
"loss": 1.1303, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.07181931287050247, |
|
"learning_rate": 0.00019005142547085527, |
|
"loss": 1.1603, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.06724654138088226, |
|
"learning_rate": 0.00018998761390763154, |
|
"loss": 1.1718, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.05919702351093292, |
|
"learning_rate": 0.00018992360913296574, |
|
"loss": 1.142, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.06586825847625732, |
|
"learning_rate": 0.00018985941128428185, |
|
"loss": 1.1859, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.06922873854637146, |
|
"learning_rate": 0.00018979502049941833, |
|
"loss": 1.1, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.06558050960302353, |
|
"learning_rate": 0.00018973043691662803, |
|
"loss": 1.1329, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.06796015799045563, |
|
"learning_rate": 0.0001896656606745776, |
|
"loss": 1.1413, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.06253591179847717, |
|
"learning_rate": 0.00018960069191234746, |
|
"loss": 1.1416, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.07260395586490631, |
|
"learning_rate": 0.00018953553076943134, |
|
"loss": 1.2171, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.06394602358341217, |
|
"learning_rate": 0.000189470177385736, |
|
"loss": 1.1857, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.06450013071298599, |
|
"learning_rate": 0.000189404631901581, |
|
"loss": 1.1278, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.06406421959400177, |
|
"learning_rate": 0.00018933889445769836, |
|
"loss": 1.2068, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.06315187364816666, |
|
"learning_rate": 0.00018927296519523226, |
|
"loss": 1.0789, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.06743728369474411, |
|
"learning_rate": 0.00018920684425573865, |
|
"loss": 1.1594, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.06548736989498138, |
|
"learning_rate": 0.0001891405317811852, |
|
"loss": 1.0963, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.06221878528594971, |
|
"learning_rate": 0.00018907402791395057, |
|
"loss": 1.1802, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.06024456024169922, |
|
"learning_rate": 0.00018900733279682462, |
|
"loss": 1.1534, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.0619971826672554, |
|
"learning_rate": 0.00018894044657300765, |
|
"loss": 1.1287, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.061270926147699356, |
|
"learning_rate": 0.0001888733693861104, |
|
"loss": 1.1306, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.06278882175683975, |
|
"learning_rate": 0.00018880610138015356, |
|
"loss": 1.0748, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.061774447560310364, |
|
"learning_rate": 0.00018873864269956756, |
|
"loss": 1.083, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.06414585560560226, |
|
"learning_rate": 0.00018867099348919217, |
|
"loss": 1.1351, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.06463301926851273, |
|
"learning_rate": 0.00018860315389427633, |
|
"loss": 1.1633, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.06183488667011261, |
|
"learning_rate": 0.00018853512406047772, |
|
"loss": 1.0311, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.06121757999062538, |
|
"learning_rate": 0.00018846690413386238, |
|
"loss": 1.1787, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.06119801104068756, |
|
"learning_rate": 0.0001883984942609047, |
|
"loss": 1.1699, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.06786572933197021, |
|
"learning_rate": 0.0001883298945884867, |
|
"loss": 1.1863, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.06401295214891434, |
|
"learning_rate": 0.00018826110526389803, |
|
"loss": 1.1483, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.06333799660205841, |
|
"learning_rate": 0.0001881921264348355, |
|
"loss": 1.0793, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.06162017583847046, |
|
"learning_rate": 0.00018812295824940285, |
|
"loss": 1.1106, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.05900746211409569, |
|
"learning_rate": 0.00018805360085611032, |
|
"loss": 1.1257, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.06269056349992752, |
|
"learning_rate": 0.00018798405440387445, |
|
"loss": 1.1076, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.06586477160453796, |
|
"learning_rate": 0.00018791431904201765, |
|
"loss": 1.0705, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.07007557153701782, |
|
"learning_rate": 0.00018784439492026798, |
|
"loss": 1.2659, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.06069363281130791, |
|
"learning_rate": 0.00018777428218875878, |
|
"loss": 1.0526, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.06026845797896385, |
|
"learning_rate": 0.00018770398099802836, |
|
"loss": 1.1309, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.061220645904541016, |
|
"learning_rate": 0.0001876334914990196, |
|
"loss": 1.0999, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.06346994638442993, |
|
"learning_rate": 0.00018756281384307982, |
|
"loss": 1.0391, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.06735444813966751, |
|
"learning_rate": 0.00018749194818196022, |
|
"loss": 1.1974, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.06252043694257736, |
|
"learning_rate": 0.0001874208946678157, |
|
"loss": 1.1771, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.06052526459097862, |
|
"learning_rate": 0.0001873496534532045, |
|
"loss": 1.1878, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.06671708077192307, |
|
"learning_rate": 0.0001872782246910879, |
|
"loss": 1.1576, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.06357080489397049, |
|
"learning_rate": 0.00018720660853482977, |
|
"loss": 1.164, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.06754573434591293, |
|
"learning_rate": 0.00018713480513819644, |
|
"loss": 1.1932, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"eval_loss": 1.0889657735824585, |
|
"eval_runtime": 97.0368, |
|
"eval_samples_per_second": 51.527, |
|
"eval_steps_per_second": 12.882, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.06117720529437065, |
|
"learning_rate": 0.0001870628146553562, |
|
"loss": 1.0225, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.06431601196527481, |
|
"learning_rate": 0.00018699063724087904, |
|
"loss": 1.1954, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.06532489508390427, |
|
"learning_rate": 0.0001869182730497363, |
|
"loss": 1.1475, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.06369902938604355, |
|
"learning_rate": 0.00018684572223730045, |
|
"loss": 1.0935, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.06582259386777878, |
|
"learning_rate": 0.0001867729849593444, |
|
"loss": 1.163, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.06448414921760559, |
|
"learning_rate": 0.0001867000613720417, |
|
"loss": 1.1137, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.0670081228017807, |
|
"learning_rate": 0.00018662695163196579, |
|
"loss": 1.1921, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.06068355217576027, |
|
"learning_rate": 0.0001865536558960898, |
|
"loss": 0.9705, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.06467951089143753, |
|
"learning_rate": 0.0001864801743217862, |
|
"loss": 1.0868, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.06502547860145569, |
|
"learning_rate": 0.0001864065070668265, |
|
"loss": 1.0937, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.07280650734901428, |
|
"learning_rate": 0.00018633265428938088, |
|
"loss": 1.1701, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.05973858758807182, |
|
"learning_rate": 0.00018625861614801785, |
|
"loss": 1.0324, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.06514280289411545, |
|
"learning_rate": 0.00018618439280170392, |
|
"loss": 1.1338, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.06471429020166397, |
|
"learning_rate": 0.00018610998440980324, |
|
"loss": 1.1876, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.0652419924736023, |
|
"learning_rate": 0.00018603539113207722, |
|
"loss": 1.2204, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.06470503658056259, |
|
"learning_rate": 0.0001859606131286843, |
|
"loss": 1.2483, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.06816300749778748, |
|
"learning_rate": 0.00018588565056017965, |
|
"loss": 1.1036, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.062083158642053604, |
|
"learning_rate": 0.00018581050358751445, |
|
"loss": 1.1232, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.06199049949645996, |
|
"learning_rate": 0.00018573517237203602, |
|
"loss": 1.0623, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.06976150721311569, |
|
"learning_rate": 0.0001856596570754872, |
|
"loss": 1.0361, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.06474665552377701, |
|
"learning_rate": 0.0001855839578600061, |
|
"loss": 1.1786, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.06963939964771271, |
|
"learning_rate": 0.00018550807488812562, |
|
"loss": 1.1381, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.06306161731481552, |
|
"learning_rate": 0.00018543200832277337, |
|
"loss": 1.1783, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.06826300919055939, |
|
"learning_rate": 0.00018535575832727102, |
|
"loss": 1.1401, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.061530325561761856, |
|
"learning_rate": 0.0001852793250653341, |
|
"loss": 1.0992, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.06267255544662476, |
|
"learning_rate": 0.00018520270870107166, |
|
"loss": 1.0652, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.059738241136074066, |
|
"learning_rate": 0.0001851259093989859, |
|
"loss": 1.1092, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.062138039618730545, |
|
"learning_rate": 0.00018504892732397173, |
|
"loss": 1.0607, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.06500416994094849, |
|
"learning_rate": 0.00018497176264131656, |
|
"loss": 1.0877, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.06768455356359482, |
|
"learning_rate": 0.00018489441551669986, |
|
"loss": 1.2099, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.06814403086900711, |
|
"learning_rate": 0.00018481688611619285, |
|
"loss": 1.1743, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.06558533012866974, |
|
"learning_rate": 0.00018473917460625798, |
|
"loss": 1.0566, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.06515612453222275, |
|
"learning_rate": 0.00018466128115374888, |
|
"loss": 1.1267, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.06616493314504623, |
|
"learning_rate": 0.00018458320592590975, |
|
"loss": 1.1703, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.06906623393297195, |
|
"learning_rate": 0.00018450494909037502, |
|
"loss": 1.2327, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.06478845328092575, |
|
"learning_rate": 0.00018442651081516917, |
|
"loss": 1.1189, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.0638127252459526, |
|
"learning_rate": 0.00018434789126870612, |
|
"loss": 1.1117, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.06424181908369064, |
|
"learning_rate": 0.00018426909061978908, |
|
"loss": 1.1469, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.06596982479095459, |
|
"learning_rate": 0.0001841901090376101, |
|
"loss": 1.1003, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.06202305108308792, |
|
"learning_rate": 0.00018411094669174965, |
|
"loss": 1.0425, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.0672823116183281, |
|
"learning_rate": 0.00018403160375217637, |
|
"loss": 1.1015, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.06283591687679291, |
|
"learning_rate": 0.00018395208038924667, |
|
"loss": 1.0863, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.06623289734125137, |
|
"learning_rate": 0.00018387237677370427, |
|
"loss": 1.0828, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.06633896380662918, |
|
"learning_rate": 0.00018379249307667994, |
|
"loss": 1.145, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.06487153470516205, |
|
"learning_rate": 0.00018371242946969117, |
|
"loss": 1.1447, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.06051245704293251, |
|
"learning_rate": 0.00018363218612464158, |
|
"loss": 1.0637, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.05917385593056679, |
|
"learning_rate": 0.00018355176321382087, |
|
"loss": 1.1459, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.061461642384529114, |
|
"learning_rate": 0.00018347116090990424, |
|
"loss": 1.0601, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.06090409681200981, |
|
"learning_rate": 0.00018339037938595193, |
|
"loss": 1.0531, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.0663984939455986, |
|
"learning_rate": 0.00018330941881540915, |
|
"loss": 1.1315, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.06271735578775406, |
|
"learning_rate": 0.0001832282793721055, |
|
"loss": 1.0869, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.06272199004888535, |
|
"learning_rate": 0.00018314696123025454, |
|
"loss": 1.1414, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.06375264376401901, |
|
"learning_rate": 0.0001830654645644536, |
|
"loss": 1.0946, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.06219152733683586, |
|
"learning_rate": 0.00018298378954968337, |
|
"loss": 1.0448, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.06392863392829895, |
|
"learning_rate": 0.00018290193636130727, |
|
"loss": 1.1606, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.06729146838188171, |
|
"learning_rate": 0.00018281990517507156, |
|
"loss": 1.1532, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.0799422636628151, |
|
"learning_rate": 0.00018273769616710436, |
|
"loss": 1.1133, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.06902763992547989, |
|
"learning_rate": 0.0001826553095139159, |
|
"loss": 1.1404, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.06674634665250778, |
|
"learning_rate": 0.00018257274539239756, |
|
"loss": 1.2019, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.06378789991140366, |
|
"learning_rate": 0.00018249000397982195, |
|
"loss": 1.1069, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.06818845123052597, |
|
"learning_rate": 0.0001824070854538422, |
|
"loss": 1.1844, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.07562380284070969, |
|
"learning_rate": 0.00018232398999249192, |
|
"loss": 1.0978, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.07126368582248688, |
|
"learning_rate": 0.00018224071777418437, |
|
"loss": 1.2169, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.06599973887205124, |
|
"learning_rate": 0.00018215726897771248, |
|
"loss": 1.1339, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.06484373658895493, |
|
"learning_rate": 0.00018207364378224826, |
|
"loss": 1.1609, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.062379587441682816, |
|
"learning_rate": 0.00018198984236734246, |
|
"loss": 1.1661, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.0662432610988617, |
|
"learning_rate": 0.00018190586491292424, |
|
"loss": 1.1372, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.06816010177135468, |
|
"learning_rate": 0.00018182171159930065, |
|
"loss": 1.1678, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.0649806335568428, |
|
"learning_rate": 0.0001817373826071564, |
|
"loss": 1.1688, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.06410276144742966, |
|
"learning_rate": 0.0001816528781175533, |
|
"loss": 1.1723, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.06425531208515167, |
|
"learning_rate": 0.00018156819831193012, |
|
"loss": 1.148, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.06638933718204498, |
|
"learning_rate": 0.00018148334337210193, |
|
"loss": 1.0427, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.06576802581548691, |
|
"learning_rate": 0.00018139831348025988, |
|
"loss": 1.0814, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.06729135662317276, |
|
"learning_rate": 0.0001813131088189707, |
|
"loss": 1.2238, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.066316619515419, |
|
"learning_rate": 0.00018122772957117645, |
|
"loss": 1.124, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.0657489225268364, |
|
"learning_rate": 0.00018114217592019393, |
|
"loss": 1.1616, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.06329985707998276, |
|
"learning_rate": 0.00018105644804971454, |
|
"loss": 1.1662, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.06630551815032959, |
|
"learning_rate": 0.00018097054614380365, |
|
"loss": 1.0863, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.0653151199221611, |
|
"learning_rate": 0.00018088447038690033, |
|
"loss": 1.0568, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.06651636958122253, |
|
"learning_rate": 0.00018079822096381688, |
|
"loss": 1.1219, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.06870090961456299, |
|
"learning_rate": 0.00018071179805973856, |
|
"loss": 1.1327, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.06400321424007416, |
|
"learning_rate": 0.000180625201860223, |
|
"loss": 1.0736, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.06521004438400269, |
|
"learning_rate": 0.00018053843255120002, |
|
"loss": 1.0366, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.0785718560218811, |
|
"learning_rate": 0.0001804514903189711, |
|
"loss": 1.0535, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.0732305571436882, |
|
"learning_rate": 0.0001803643753502089, |
|
"loss": 1.0828, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.06389783322811127, |
|
"learning_rate": 0.0001802770878319571, |
|
"loss": 1.0966, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.06395277380943298, |
|
"learning_rate": 0.00018018962795162977, |
|
"loss": 1.1414, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.06490381062030792, |
|
"learning_rate": 0.00018010199589701107, |
|
"loss": 1.1405, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.06930892169475555, |
|
"learning_rate": 0.00018001419185625488, |
|
"loss": 1.0145, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.07346969097852707, |
|
"learning_rate": 0.00017992621601788428, |
|
"loss": 1.1745, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.07306042313575745, |
|
"learning_rate": 0.0001798380685707913, |
|
"loss": 1.1333, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.06085991859436035, |
|
"learning_rate": 0.00017974974970423632, |
|
"loss": 1.0801, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.0631103590130806, |
|
"learning_rate": 0.00017966125960784787, |
|
"loss": 1.1209, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.06399189680814743, |
|
"learning_rate": 0.00017957259847162205, |
|
"loss": 1.1683, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.06143182888627052, |
|
"learning_rate": 0.00017948376648592232, |
|
"loss": 1.0855, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.06295006722211838, |
|
"learning_rate": 0.00017939476384147877, |
|
"loss": 1.1398, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.06850682944059372, |
|
"learning_rate": 0.0001793055907293881, |
|
"loss": 1.1245, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.061626896262168884, |
|
"learning_rate": 0.00017921624734111292, |
|
"loss": 1.0763, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.06570126861333847, |
|
"learning_rate": 0.00017912673386848152, |
|
"loss": 1.1547, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.06566940248012543, |
|
"learning_rate": 0.00017903705050368722, |
|
"loss": 1.1218, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.06877337396144867, |
|
"learning_rate": 0.00017894719743928827, |
|
"loss": 1.2173, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.06467350572347641, |
|
"learning_rate": 0.00017885717486820722, |
|
"loss": 1.1347, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.08330734819173813, |
|
"learning_rate": 0.00017876698298373053, |
|
"loss": 1.1373, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.06419449299573898, |
|
"learning_rate": 0.0001786766219795083, |
|
"loss": 1.1285, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.07984116673469543, |
|
"learning_rate": 0.0001785860920495536, |
|
"loss": 1.1309, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.07000021636486053, |
|
"learning_rate": 0.00017849539338824231, |
|
"loss": 1.1193, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.07003198564052582, |
|
"learning_rate": 0.00017840452619031258, |
|
"loss": 1.1596, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.07209260761737823, |
|
"learning_rate": 0.00017831349065086435, |
|
"loss": 1.2199, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.06799673289060593, |
|
"learning_rate": 0.00017822228696535907, |
|
"loss": 1.1247, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.06913288682699203, |
|
"learning_rate": 0.0001781309153296192, |
|
"loss": 1.1035, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.06872755289077759, |
|
"learning_rate": 0.0001780393759398278, |
|
"loss": 1.1613, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.06622165441513062, |
|
"learning_rate": 0.00017794766899252812, |
|
"loss": 1.0869, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.06728823482990265, |
|
"learning_rate": 0.00017785579468462316, |
|
"loss": 1.1408, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.06555955857038498, |
|
"learning_rate": 0.00017776375321337521, |
|
"loss": 1.1584, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.0661938264966011, |
|
"learning_rate": 0.0001776715447764056, |
|
"loss": 1.1542, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.07007063180208206, |
|
"learning_rate": 0.00017757916957169404, |
|
"loss": 1.1424, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.06441737711429596, |
|
"learning_rate": 0.0001774866277975784, |
|
"loss": 1.1361, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.06655770540237427, |
|
"learning_rate": 0.00017739391965275404, |
|
"loss": 1.0968, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.07319090515375137, |
|
"learning_rate": 0.0001773010453362737, |
|
"loss": 1.107, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.06766431778669357, |
|
"learning_rate": 0.0001772080050475468, |
|
"loss": 1.1571, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.06634720414876938, |
|
"learning_rate": 0.00017711479898633914, |
|
"loss": 1.175, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.06589143723249435, |
|
"learning_rate": 0.00017702142735277247, |
|
"loss": 1.1475, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.06698860973119736, |
|
"learning_rate": 0.00017692789034732403, |
|
"loss": 1.1042, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.06417112797498703, |
|
"learning_rate": 0.0001768341881708261, |
|
"loss": 1.0588, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.06298335641622543, |
|
"learning_rate": 0.00017674032102446563, |
|
"loss": 0.9964, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.06435716152191162, |
|
"learning_rate": 0.00017664628910978375, |
|
"loss": 1.1126, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.06721868366003036, |
|
"learning_rate": 0.00017655209262867541, |
|
"loss": 1.1541, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.06489331275224686, |
|
"learning_rate": 0.00017645773178338886, |
|
"loss": 1.0539, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.06617634743452072, |
|
"learning_rate": 0.00017636320677652526, |
|
"loss": 1.094, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.06776611506938934, |
|
"learning_rate": 0.0001762685178110382, |
|
"loss": 1.0155, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.06376908719539642, |
|
"learning_rate": 0.00017617366509023338, |
|
"loss": 1.2292, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.06204572319984436, |
|
"learning_rate": 0.00017607864881776807, |
|
"loss": 1.0968, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.07098660618066788, |
|
"learning_rate": 0.00017598346919765067, |
|
"loss": 1.1067, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.06708233058452606, |
|
"learning_rate": 0.00017588812643424032, |
|
"loss": 1.126, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.0708741545677185, |
|
"learning_rate": 0.00017579262073224647, |
|
"loss": 1.0234, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.06812895089387894, |
|
"learning_rate": 0.00017569695229672835, |
|
"loss": 1.14, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.06947667896747589, |
|
"learning_rate": 0.00017560112133309467, |
|
"loss": 1.1888, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.06499818712472916, |
|
"learning_rate": 0.0001755051280471031, |
|
"loss": 1.0889, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.063695028424263, |
|
"learning_rate": 0.00017540897264485967, |
|
"loss": 1.0678, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.06530368328094482, |
|
"learning_rate": 0.00017531265533281872, |
|
"loss": 1.0348, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.07548119127750397, |
|
"learning_rate": 0.0001752161763177821, |
|
"loss": 1.1479, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.06523353606462479, |
|
"learning_rate": 0.00017511953580689888, |
|
"loss": 1.1491, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.06849244982004166, |
|
"learning_rate": 0.00017502273400766484, |
|
"loss": 1.143, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.06976161897182465, |
|
"learning_rate": 0.00017492577112792208, |
|
"loss": 1.1597, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.06484940648078918, |
|
"learning_rate": 0.0001748286473758586, |
|
"loss": 1.1013, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.06747712939977646, |
|
"learning_rate": 0.00017473136296000772, |
|
"loss": 1.1129, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.06816747039556503, |
|
"learning_rate": 0.0001746339180892478, |
|
"loss": 1.2297, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.06335021555423737, |
|
"learning_rate": 0.00017453631297280166, |
|
"loss": 1.0683, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.06458504498004913, |
|
"learning_rate": 0.00017443854782023624, |
|
"loss": 1.0101, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.0690096914768219, |
|
"learning_rate": 0.000174340622841462, |
|
"loss": 1.1363, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.06535279005765915, |
|
"learning_rate": 0.00017424253824673263, |
|
"loss": 1.0364, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.0651826336979866, |
|
"learning_rate": 0.00017414429424664454, |
|
"loss": 1.1102, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.06632277369499207, |
|
"learning_rate": 0.00017404589105213632, |
|
"loss": 1.1461, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.06560949981212616, |
|
"learning_rate": 0.00017394732887448847, |
|
"loss": 1.1939, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.06644757837057114, |
|
"learning_rate": 0.00017384860792532276, |
|
"loss": 1.1603, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.06334047764539719, |
|
"learning_rate": 0.00017374972841660186, |
|
"loss": 1.1102, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.061144351959228516, |
|
"learning_rate": 0.00017365069056062894, |
|
"loss": 1.0735, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.0644732415676117, |
|
"learning_rate": 0.00017355149457004709, |
|
"loss": 1.1449, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.06779829412698746, |
|
"learning_rate": 0.00017345214065783897, |
|
"loss": 1.0644, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.06311120837926865, |
|
"learning_rate": 0.00017335262903732634, |
|
"loss": 1.0737, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.070918507874012, |
|
"learning_rate": 0.0001732529599221695, |
|
"loss": 1.002, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.06275039911270142, |
|
"learning_rate": 0.0001731531335263669, |
|
"loss": 1.0902, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.06746424734592438, |
|
"learning_rate": 0.0001730531500642548, |
|
"loss": 1.1495, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.07816082239151001, |
|
"learning_rate": 0.00017295300975050658, |
|
"loss": 1.0652, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.06773856282234192, |
|
"learning_rate": 0.00017285271280013245, |
|
"loss": 1.1856, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.0698360875248909, |
|
"learning_rate": 0.0001727522594284789, |
|
"loss": 1.201, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.07427448779344559, |
|
"learning_rate": 0.0001726516498512283, |
|
"loss": 1.116, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.06394834816455841, |
|
"learning_rate": 0.00017255088428439836, |
|
"loss": 1.1401, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.06733064353466034, |
|
"learning_rate": 0.0001724499629443418, |
|
"loss": 1.1239, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.06488535553216934, |
|
"learning_rate": 0.00017234888604774574, |
|
"loss": 1.1434, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.0653986781835556, |
|
"learning_rate": 0.00017224765381163132, |
|
"loss": 1.1392, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.07161843031644821, |
|
"learning_rate": 0.00017214626645335314, |
|
"loss": 1.0808, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.06368163228034973, |
|
"learning_rate": 0.00017204472419059895, |
|
"loss": 1.1395, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.07309909909963608, |
|
"learning_rate": 0.00017194302724138903, |
|
"loss": 1.1411, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.0663491040468216, |
|
"learning_rate": 0.00017184117582407578, |
|
"loss": 1.1509, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.06098710745573044, |
|
"learning_rate": 0.00017173917015734336, |
|
"loss": 1.0276, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.0714559257030487, |
|
"learning_rate": 0.00017163701046020707, |
|
"loss": 1.2508, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.06662353128194809, |
|
"learning_rate": 0.00017153469695201277, |
|
"loss": 1.0658, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.0661638155579567, |
|
"learning_rate": 0.0001714322298524368, |
|
"loss": 1.1066, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.07075709104537964, |
|
"learning_rate": 0.00017132960938148512, |
|
"loss": 1.1827, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"eval_loss": 1.0872660875320435, |
|
"eval_runtime": 96.8426, |
|
"eval_samples_per_second": 51.63, |
|
"eval_steps_per_second": 12.908, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.07708127051591873, |
|
"learning_rate": 0.00017122683575949307, |
|
"loss": 1.2037, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.06289788335561752, |
|
"learning_rate": 0.0001711239092071248, |
|
"loss": 1.0755, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.0677611231803894, |
|
"learning_rate": 0.00017102082994537282, |
|
"loss": 1.1438, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.06381526589393616, |
|
"learning_rate": 0.00017091759819555744, |
|
"loss": 1.1003, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.06795594096183777, |
|
"learning_rate": 0.00017081421417932652, |
|
"loss": 1.1063, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.06619235128164291, |
|
"learning_rate": 0.00017071067811865476, |
|
"loss": 1.1499, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.06958078593015671, |
|
"learning_rate": 0.0001706069902358433, |
|
"loss": 1.0918, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.06839771568775177, |
|
"learning_rate": 0.0001705031507535193, |
|
"loss": 1.0789, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.0677015408873558, |
|
"learning_rate": 0.00017039915989463538, |
|
"loss": 1.1419, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.0648641586303711, |
|
"learning_rate": 0.00017029501788246924, |
|
"loss": 1.1461, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.06705692410469055, |
|
"learning_rate": 0.00017019072494062304, |
|
"loss": 1.1288, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.06862998753786087, |
|
"learning_rate": 0.00017008628129302307, |
|
"loss": 1.107, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.07801419496536255, |
|
"learning_rate": 0.00016998168716391915, |
|
"loss": 1.097, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.07198420912027359, |
|
"learning_rate": 0.00016987694277788417, |
|
"loss": 1.1328, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.06974530965089798, |
|
"learning_rate": 0.00016977204835981374, |
|
"loss": 1.233, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.06750082969665527, |
|
"learning_rate": 0.00016966700413492556, |
|
"loss": 1.1373, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.06531986594200134, |
|
"learning_rate": 0.00016956181032875894, |
|
"loss": 1.0542, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.06485234200954437, |
|
"learning_rate": 0.0001694564671671743, |
|
"loss": 1.0888, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.06844603270292282, |
|
"learning_rate": 0.0001693509748763529, |
|
"loss": 1.1358, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.06846935302019119, |
|
"learning_rate": 0.00016924533368279607, |
|
"loss": 1.186, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.07385078817605972, |
|
"learning_rate": 0.00016913954381332492, |
|
"loss": 1.137, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.06992754340171814, |
|
"learning_rate": 0.0001690336054950797, |
|
"loss": 1.1749, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.06728782504796982, |
|
"learning_rate": 0.0001689275189555195, |
|
"loss": 1.041, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.0680394098162651, |
|
"learning_rate": 0.00016882128442242156, |
|
"loss": 1.171, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.06516265124082565, |
|
"learning_rate": 0.00016871490212388087, |
|
"loss": 1.126, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.06674471497535706, |
|
"learning_rate": 0.00016860837228830974, |
|
"loss": 1.1474, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.06331098824739456, |
|
"learning_rate": 0.00016850169514443728, |
|
"loss": 1.1349, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.06607010960578918, |
|
"learning_rate": 0.00016839487092130883, |
|
"loss": 1.1848, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.06833096593618393, |
|
"learning_rate": 0.0001682878998482855, |
|
"loss": 1.1646, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.06246986240148544, |
|
"learning_rate": 0.0001681807821550438, |
|
"loss": 1.0467, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.06146036833524704, |
|
"learning_rate": 0.0001680735180715749, |
|
"loss": 0.9574, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.07327894866466522, |
|
"learning_rate": 0.00016796610782818442, |
|
"loss": 1.1735, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.06582464277744293, |
|
"learning_rate": 0.00016785855165549175, |
|
"loss": 1.0437, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.06511417776346207, |
|
"learning_rate": 0.00016775084978442955, |
|
"loss": 1.0813, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.06808367371559143, |
|
"learning_rate": 0.00016764300244624338, |
|
"loss": 1.0771, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.06845704466104507, |
|
"learning_rate": 0.0001675350098724911, |
|
"loss": 1.1645, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.0673985630273819, |
|
"learning_rate": 0.00016742687229504246, |
|
"loss": 1.0358, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.0681719034910202, |
|
"learning_rate": 0.00016731858994607838, |
|
"loss": 1.0971, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.06640625, |
|
"learning_rate": 0.00016721016305809084, |
|
"loss": 1.242, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.07134019583463669, |
|
"learning_rate": 0.00016710159186388203, |
|
"loss": 1.0486, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.07230845093727112, |
|
"learning_rate": 0.00016699287659656395, |
|
"loss": 1.1678, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.06907068192958832, |
|
"learning_rate": 0.00016688401748955802, |
|
"loss": 1.1183, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.06549103558063507, |
|
"learning_rate": 0.00016677501477659445, |
|
"loss": 1.1073, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.06864264607429504, |
|
"learning_rate": 0.0001666658686917118, |
|
"loss": 1.1528, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.06810259073972702, |
|
"learning_rate": 0.00016655657946925646, |
|
"loss": 1.1308, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.06780505925416946, |
|
"learning_rate": 0.00016644714734388217, |
|
"loss": 1.16, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.06738951057195663, |
|
"learning_rate": 0.00016633757255054947, |
|
"loss": 1.1706, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.06679335236549377, |
|
"learning_rate": 0.0001662278553245252, |
|
"loss": 1.1704, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.06979308277368546, |
|
"learning_rate": 0.00016611799590138203, |
|
"loss": 1.1482, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.0626835972070694, |
|
"learning_rate": 0.00016600799451699802, |
|
"loss": 1.1115, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.06404287368059158, |
|
"learning_rate": 0.0001658978514075559, |
|
"loss": 1.175, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.07459414005279541, |
|
"learning_rate": 0.00016578756680954277, |
|
"loss": 1.1302, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.0666908472776413, |
|
"learning_rate": 0.00016567714095974952, |
|
"loss": 1.1636, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.06724884361028671, |
|
"learning_rate": 0.0001655665740952703, |
|
"loss": 1.1355, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.0689745619893074, |
|
"learning_rate": 0.00016545586645350205, |
|
"loss": 1.1162, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.06672396510839462, |
|
"learning_rate": 0.000165345018272144, |
|
"loss": 1.0862, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.06393066048622131, |
|
"learning_rate": 0.00016523402978919704, |
|
"loss": 1.1315, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.07766008377075195, |
|
"learning_rate": 0.00016512290124296336, |
|
"loss": 1.1024, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.07702287286520004, |
|
"learning_rate": 0.00016501163287204588, |
|
"loss": 1.1509, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.06490621715784073, |
|
"learning_rate": 0.00016490022491534768, |
|
"loss": 1.0167, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.06604059785604477, |
|
"learning_rate": 0.0001647886776120717, |
|
"loss": 1.106, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.06731244921684265, |
|
"learning_rate": 0.00016467699120171987, |
|
"loss": 1.0898, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.07821813970804214, |
|
"learning_rate": 0.00016456516592409294, |
|
"loss": 1.1176, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.06816477328538895, |
|
"learning_rate": 0.0001644532020192897, |
|
"loss": 1.1786, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.06874791532754898, |
|
"learning_rate": 0.0001643410997277067, |
|
"loss": 1.0926, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.06932558119297028, |
|
"learning_rate": 0.00016422885929003758, |
|
"loss": 1.1679, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.06875893473625183, |
|
"learning_rate": 0.00016411648094727253, |
|
"loss": 1.1935, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.06305741518735886, |
|
"learning_rate": 0.00016400396494069792, |
|
"loss": 1.0654, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.06925616413354874, |
|
"learning_rate": 0.00016389131151189567, |
|
"loss": 1.1218, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.07191671431064606, |
|
"learning_rate": 0.00016377852090274276, |
|
"loss": 1.1875, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.06721782684326172, |
|
"learning_rate": 0.00016366559335541066, |
|
"loss": 1.2297, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.07050056755542755, |
|
"learning_rate": 0.00016355252911236492, |
|
"loss": 1.2071, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.0658205896615982, |
|
"learning_rate": 0.00016343932841636456, |
|
"loss": 1.1652, |
|
"step": 613 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.07456327974796295, |
|
"learning_rate": 0.0001633259915104616, |
|
"loss": 1.0894, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.06406401097774506, |
|
"learning_rate": 0.00016321251863800045, |
|
"loss": 1.0864, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.06928929686546326, |
|
"learning_rate": 0.00016309891004261755, |
|
"loss": 1.0215, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.06436607241630554, |
|
"learning_rate": 0.00016298516596824068, |
|
"loss": 1.1855, |
|
"step": 617 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.06957562267780304, |
|
"learning_rate": 0.0001628712866590885, |
|
"loss": 1.2085, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.07085030525922775, |
|
"learning_rate": 0.0001627572723596701, |
|
"loss": 1.1079, |
|
"step": 619 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.06608451902866364, |
|
"learning_rate": 0.0001626431233147843, |
|
"loss": 1.1456, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.0686989277601242, |
|
"learning_rate": 0.00016252883976951938, |
|
"loss": 1.1112, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.06871567666530609, |
|
"learning_rate": 0.00016241442196925223, |
|
"loss": 1.1612, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.06559213250875473, |
|
"learning_rate": 0.00016229987015964814, |
|
"loss": 1.1588, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.06943194568157196, |
|
"learning_rate": 0.00016218518458666008, |
|
"loss": 1.1108, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.06725773215293884, |
|
"learning_rate": 0.00016207036549652814, |
|
"loss": 1.1942, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.06677956134080887, |
|
"learning_rate": 0.00016195541313577923, |
|
"loss": 1.129, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.06949102878570557, |
|
"learning_rate": 0.00016184032775122631, |
|
"loss": 1.0889, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.06555166095495224, |
|
"learning_rate": 0.00016172510958996795, |
|
"loss": 1.1484, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.07066816091537476, |
|
"learning_rate": 0.0001616097588993879, |
|
"loss": 1.0807, |
|
"step": 629 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.06895067542791367, |
|
"learning_rate": 0.00016149427592715432, |
|
"loss": 1.1586, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.07003293186426163, |
|
"learning_rate": 0.00016137866092121953, |
|
"loss": 1.1059, |
|
"step": 631 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.06466325372457504, |
|
"learning_rate": 0.0001612629141298192, |
|
"loss": 1.1759, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.06705661863088608, |
|
"learning_rate": 0.00016114703580147202, |
|
"loss": 1.1766, |
|
"step": 633 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.07274851948022842, |
|
"learning_rate": 0.00016103102618497922, |
|
"loss": 1.211, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.06761591881513596, |
|
"learning_rate": 0.0001609148855294237, |
|
"loss": 1.1049, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.06395487487316132, |
|
"learning_rate": 0.00016079861408416985, |
|
"loss": 1.0404, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.07156820595264435, |
|
"learning_rate": 0.00016068221209886288, |
|
"loss": 1.1569, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.06414105743169785, |
|
"learning_rate": 0.00016056567982342817, |
|
"loss": 1.1568, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.06608510762453079, |
|
"learning_rate": 0.00016044901750807098, |
|
"loss": 1.1529, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.06756897270679474, |
|
"learning_rate": 0.00016033222540327567, |
|
"loss": 1.0845, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.06797973066568375, |
|
"learning_rate": 0.00016021530375980535, |
|
"loss": 1.0768, |
|
"step": 641 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.06994897872209549, |
|
"learning_rate": 0.00016009825282870126, |
|
"loss": 1.1013, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.06616318970918655, |
|
"learning_rate": 0.00015998107286128215, |
|
"loss": 1.0895, |
|
"step": 643 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.07222306728363037, |
|
"learning_rate": 0.00015986376410914388, |
|
"loss": 1.1811, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.06701509654521942, |
|
"learning_rate": 0.00015974632682415885, |
|
"loss": 1.1747, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.0668727234005928, |
|
"learning_rate": 0.00015962876125847535, |
|
"loss": 1.1241, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.06632225960493088, |
|
"learning_rate": 0.00015951106766451715, |
|
"loss": 1.1301, |
|
"step": 647 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.06712299585342407, |
|
"learning_rate": 0.00015939324629498294, |
|
"loss": 1.1646, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.06921354681253433, |
|
"learning_rate": 0.0001592752974028457, |
|
"loss": 1.1075, |
|
"step": 649 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.06251943111419678, |
|
"learning_rate": 0.00015915722124135227, |
|
"loss": 1.139, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.06602328270673752, |
|
"learning_rate": 0.00015903901806402264, |
|
"loss": 1.1455, |
|
"step": 651 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.06831764429807663, |
|
"learning_rate": 0.00015892068812464963, |
|
"loss": 1.1503, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.0706949532032013, |
|
"learning_rate": 0.00015880223167729818, |
|
"loss": 1.1477, |
|
"step": 653 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.07188212126493454, |
|
"learning_rate": 0.0001586836489763049, |
|
"loss": 1.0946, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.0642421543598175, |
|
"learning_rate": 0.00015856494027627734, |
|
"loss": 1.1365, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.07110840082168579, |
|
"learning_rate": 0.00015844610583209373, |
|
"loss": 1.1358, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.06591419875621796, |
|
"learning_rate": 0.0001583271458989023, |
|
"loss": 1.1176, |
|
"step": 657 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.0683559700846672, |
|
"learning_rate": 0.00015820806073212055, |
|
"loss": 1.0383, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.06749717891216278, |
|
"learning_rate": 0.00015808885058743498, |
|
"loss": 1.1389, |
|
"step": 659 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.07007017731666565, |
|
"learning_rate": 0.00015796951572080047, |
|
"loss": 1.1519, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.06702763587236404, |
|
"learning_rate": 0.00015785005638843956, |
|
"loss": 1.1068, |
|
"step": 661 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.07349605113267899, |
|
"learning_rate": 0.0001577304728468422, |
|
"loss": 1.2148, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.06870365887880325, |
|
"learning_rate": 0.00015761076535276486, |
|
"loss": 1.1951, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.07068511843681335, |
|
"learning_rate": 0.00015749093416323024, |
|
"loss": 1.1536, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.0644054263830185, |
|
"learning_rate": 0.00015737097953552658, |
|
"loss": 1.1183, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.06498972326517105, |
|
"learning_rate": 0.0001572509017272072, |
|
"loss": 1.0997, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.06673012673854828, |
|
"learning_rate": 0.00015713070099608988, |
|
"loss": 1.1471, |
|
"step": 667 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.06598661094903946, |
|
"learning_rate": 0.0001570103776002563, |
|
"loss": 1.0986, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.06478138267993927, |
|
"learning_rate": 0.00015688993179805154, |
|
"loss": 1.0817, |
|
"step": 669 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.06719911098480225, |
|
"learning_rate": 0.00015676936384808354, |
|
"loss": 1.1287, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.06300705671310425, |
|
"learning_rate": 0.00015664867400922239, |
|
"loss": 1.0937, |
|
"step": 671 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.06910447776317596, |
|
"learning_rate": 0.00015652786254059998, |
|
"loss": 1.0682, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.06596586108207703, |
|
"learning_rate": 0.0001564069297016093, |
|
"loss": 1.0551, |
|
"step": 673 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.06589323282241821, |
|
"learning_rate": 0.00015628587575190395, |
|
"loss": 1.1609, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.07057927548885345, |
|
"learning_rate": 0.00015616470095139762, |
|
"loss": 1.1117, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.07004386186599731, |
|
"learning_rate": 0.0001560434055602634, |
|
"loss": 1.1485, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.06499215215444565, |
|
"learning_rate": 0.00015592198983893329, |
|
"loss": 1.0118, |
|
"step": 677 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.06642549484968185, |
|
"learning_rate": 0.00015580045404809772, |
|
"loss": 1.1556, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.06729692965745926, |
|
"learning_rate": 0.0001556787984487049, |
|
"loss": 1.1552, |
|
"step": 679 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.069004587829113, |
|
"learning_rate": 0.00015555702330196023, |
|
"loss": 1.1373, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.07328642904758453, |
|
"learning_rate": 0.00015543512886932582, |
|
"loss": 1.1266, |
|
"step": 681 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.06573385000228882, |
|
"learning_rate": 0.00015531311541251995, |
|
"loss": 1.1094, |
|
"step": 682 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.0658872127532959, |
|
"learning_rate": 0.00015519098319351636, |
|
"loss": 1.087, |
|
"step": 683 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.0687699168920517, |
|
"learning_rate": 0.00015506873247454384, |
|
"loss": 1.0914, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.06617342680692673, |
|
"learning_rate": 0.00015494636351808563, |
|
"loss": 1.1377, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.06719239801168442, |
|
"learning_rate": 0.00015482387658687875, |
|
"loss": 1.2187, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.0662359967827797, |
|
"learning_rate": 0.0001547012719439136, |
|
"loss": 1.0325, |
|
"step": 687 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.0648040920495987, |
|
"learning_rate": 0.0001545785498524333, |
|
"loss": 1.0903, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.06930810958147049, |
|
"learning_rate": 0.00015445571057593315, |
|
"loss": 1.128, |
|
"step": 689 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.06874281167984009, |
|
"learning_rate": 0.00015433275437816004, |
|
"loss": 1.2245, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.06701396405696869, |
|
"learning_rate": 0.0001542096815231119, |
|
"loss": 1.1042, |
|
"step": 691 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.06753288954496384, |
|
"learning_rate": 0.00015408649227503714, |
|
"loss": 1.1593, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.06974251568317413, |
|
"learning_rate": 0.00015396318689843408, |
|
"loss": 1.1662, |
|
"step": 693 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.07186184823513031, |
|
"learning_rate": 0.00015383976565805035, |
|
"loss": 1.2292, |
|
"step": 694 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.06782180815935135, |
|
"learning_rate": 0.0001537162288188824, |
|
"loss": 1.1006, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.06984351575374603, |
|
"learning_rate": 0.00015359257664617485, |
|
"loss": 1.0839, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.0696905329823494, |
|
"learning_rate": 0.00015346880940541997, |
|
"loss": 1.0847, |
|
"step": 697 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.06678467988967896, |
|
"learning_rate": 0.00015334492736235705, |
|
"loss": 1.145, |
|
"step": 698 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.06899551302194595, |
|
"learning_rate": 0.0001532209307829719, |
|
"loss": 1.0262, |
|
"step": 699 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.06538818776607513, |
|
"learning_rate": 0.00015309681993349626, |
|
"loss": 1.1179, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.0666823759675026, |
|
"learning_rate": 0.00015297259508040718, |
|
"loss": 1.1654, |
|
"step": 701 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.0722886174917221, |
|
"learning_rate": 0.00015284825649042655, |
|
"loss": 1.1446, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.0693565309047699, |
|
"learning_rate": 0.0001527238044305204, |
|
"loss": 1.2072, |
|
"step": 703 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.07026789337396622, |
|
"learning_rate": 0.00015259923916789844, |
|
"loss": 1.1308, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.07177083194255829, |
|
"learning_rate": 0.00015247456097001338, |
|
"loss": 1.1002, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.0645269900560379, |
|
"learning_rate": 0.00015234977010456047, |
|
"loss": 1.1355, |
|
"step": 706 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.0640435442328453, |
|
"learning_rate": 0.00015222486683947678, |
|
"loss": 1.1648, |
|
"step": 707 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.06458823382854462, |
|
"learning_rate": 0.0001520998514429409, |
|
"loss": 1.0977, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.06381873786449432, |
|
"learning_rate": 0.0001519747241833719, |
|
"loss": 1.0494, |
|
"step": 709 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.07047054916620255, |
|
"learning_rate": 0.00015184948532942928, |
|
"loss": 1.0674, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.07138516753911972, |
|
"learning_rate": 0.000151724135150012, |
|
"loss": 1.2869, |
|
"step": 711 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.06848011165857315, |
|
"learning_rate": 0.0001515986739142581, |
|
"loss": 1.1813, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.0663144439458847, |
|
"learning_rate": 0.00015147310189154406, |
|
"loss": 1.2019, |
|
"step": 713 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.07065600156784058, |
|
"learning_rate": 0.0001513474193514842, |
|
"loss": 1.1387, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.06766260415315628, |
|
"learning_rate": 0.0001512216265639302, |
|
"loss": 1.1266, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.06984108686447144, |
|
"learning_rate": 0.00015109572379897035, |
|
"loss": 1.2299, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.06617981195449829, |
|
"learning_rate": 0.00015096971132692917, |
|
"loss": 1.1855, |
|
"step": 717 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.07162600755691528, |
|
"learning_rate": 0.0001508435894183667, |
|
"loss": 1.0786, |
|
"step": 718 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 2154, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 718, |
|
"total_flos": 2.1425420186167542e+18, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |