{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 1000, "global_step": 1145, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004366812227074236, "grad_norm": 0.49208763779092174, "learning_rate": 4.347826086956522e-08, "logits/chosen": -1.130352258682251, "logits/rejected": -0.9433857798576355, "logps/chosen": -272.3143005371094, "logps/rejected": -290.848388671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.043668122270742356, "grad_norm": 0.5373741627747036, "learning_rate": 4.347826086956522e-07, "logits/chosen": -1.0119584798812866, "logits/rejected": -1.054663062095642, "logps/chosen": -300.1561279296875, "logps/rejected": -261.20513916015625, "loss": 0.6928, "rewards/accuracies": 0.4722222089767456, "rewards/chosen": 0.0015893004601821303, "rewards/margins": 0.001330445520579815, "rewards/rejected": 0.0002588547649793327, "step": 10 }, { "epoch": 0.08733624454148471, "grad_norm": 0.518120805736266, "learning_rate": 8.695652173913044e-07, "logits/chosen": -1.1330419778823853, "logits/rejected": -1.0284273624420166, "logps/chosen": -283.1369934082031, "logps/rejected": -323.2412109375, "loss": 0.6924, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.0005519447731785476, "rewards/margins": 0.0006424236344173551, "rewards/rejected": -0.0011943683493882418, "step": 20 }, { "epoch": 0.13100436681222707, "grad_norm": 0.6136347394482697, "learning_rate": 1.3043478260869566e-06, "logits/chosen": -1.0650444030761719, "logits/rejected": -1.107439637184143, "logps/chosen": -290.29168701171875, "logps/rejected": -270.15887451171875, "loss": 0.6899, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.0012172337155789137, "rewards/margins": 0.0061761606484651566, "rewards/rejected": -0.007393394596874714, "step": 30 }, { "epoch": 0.17467248908296942, "grad_norm": 0.5452485335548993, "learning_rate": 1.7391304347826088e-06, "logits/chosen": -1.0981855392456055, "logits/rejected": -1.1204359531402588, "logps/chosen": -288.19488525390625, "logps/rejected": -279.60968017578125, "loss": 0.6822, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.01171443797647953, "rewards/margins": 0.023151502013206482, "rewards/rejected": -0.03486593812704086, "step": 40 }, { "epoch": 0.2183406113537118, "grad_norm": 0.5716667587131498, "learning_rate": 2.173913043478261e-06, "logits/chosen": -1.1297352313995361, "logits/rejected": -1.0235356092453003, "logps/chosen": -267.85931396484375, "logps/rejected": -315.6715393066406, "loss": 0.6681, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.03383685275912285, "rewards/margins": 0.050773195922374725, "rewards/rejected": -0.08461005985736847, "step": 50 }, { "epoch": 0.26200873362445415, "grad_norm": 0.7397729774530086, "learning_rate": 2.6086956521739132e-06, "logits/chosen": -1.0748536586761475, "logits/rejected": -1.096380352973938, "logps/chosen": -298.35894775390625, "logps/rejected": -278.65167236328125, "loss": 0.6372, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.06509696692228317, "rewards/margins": 0.13354261219501495, "rewards/rejected": -0.19863960146903992, "step": 60 }, { "epoch": 0.3056768558951965, "grad_norm": 0.9490199807243085, "learning_rate": 3.043478260869566e-06, "logits/chosen": -1.1220731735229492, "logits/rejected": -1.1358802318572998, "logps/chosen": -287.66119384765625, "logps/rejected": -300.89215087890625, "loss": 0.566, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.05601518228650093, "rewards/margins": 0.2802619934082031, "rewards/rejected": -0.33627718687057495, "step": 70 }, { "epoch": 0.34934497816593885, "grad_norm": 1.1366581667233986, "learning_rate": 3.4782608695652175e-06, "logits/chosen": -1.1631652116775513, "logits/rejected": -1.0547783374786377, "logps/chosen": -273.6378173828125, "logps/rejected": -378.0707702636719, "loss": 0.4241, "rewards/accuracies": 1.0, "rewards/chosen": -0.0387813039124012, "rewards/margins": 0.6967908143997192, "rewards/rejected": -0.7355720400810242, "step": 80 }, { "epoch": 0.3930131004366812, "grad_norm": 0.699317306849894, "learning_rate": 3.91304347826087e-06, "logits/chosen": -1.13710355758667, "logits/rejected": -1.1224480867385864, "logps/chosen": -286.89630126953125, "logps/rejected": -425.4947204589844, "loss": 0.1966, "rewards/accuracies": 1.0, "rewards/chosen": 0.010465627536177635, "rewards/margins": 1.6259124279022217, "rewards/rejected": -1.6154468059539795, "step": 90 }, { "epoch": 0.4366812227074236, "grad_norm": 0.20225227555832068, "learning_rate": 4.347826086956522e-06, "logits/chosen": -1.162099003791809, "logits/rejected": -1.0710939168930054, "logps/chosen": -254.20108032226562, "logps/rejected": -621.758544921875, "loss": 0.0422, "rewards/accuracies": 1.0, "rewards/chosen": 0.06020994111895561, "rewards/margins": 3.4915261268615723, "rewards/rejected": -3.431316375732422, "step": 100 }, { "epoch": 0.48034934497816595, "grad_norm": 0.09010131899726684, "learning_rate": 4.782608695652174e-06, "logits/chosen": -1.0987495183944702, "logits/rejected": -1.049612283706665, "logps/chosen": -281.94390869140625, "logps/rejected": -799.0904541015625, "loss": 0.01, "rewards/accuracies": 1.0, "rewards/chosen": 0.1645246148109436, "rewards/margins": 5.45929479598999, "rewards/rejected": -5.29477071762085, "step": 110 }, { "epoch": 0.5240174672489083, "grad_norm": 0.030005828505106115, "learning_rate": 4.999709285361594e-06, "logits/chosen": -1.1363656520843506, "logits/rejected": -1.0334757566452026, "logps/chosen": -270.98944091796875, "logps/rejected": -936.4185791015625, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": 0.26095929741859436, "rewards/margins": 6.916310787200928, "rewards/rejected": -6.655351161956787, "step": 120 }, { "epoch": 0.5676855895196506, "grad_norm": 0.019361577478786484, "learning_rate": 4.997383973910631e-06, "logits/chosen": -1.0980539321899414, "logits/rejected": -0.9340838193893433, "logps/chosen": -266.50250244140625, "logps/rejected": -1020.0845947265625, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": 0.24470993876457214, "rewards/margins": 7.702264308929443, "rewards/rejected": -7.457553863525391, "step": 130 }, { "epoch": 0.611353711790393, "grad_norm": 0.012282637611773778, "learning_rate": 4.9927355140895775e-06, "logits/chosen": -1.1392152309417725, "logits/rejected": -0.8957275152206421, "logps/chosen": -242.6056365966797, "logps/rejected": -1173.87158203125, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 0.2704493999481201, "rewards/margins": 8.908761978149414, "rewards/rejected": -8.638312339782715, "step": 140 }, { "epoch": 0.6550218340611353, "grad_norm": 0.04700706955431712, "learning_rate": 4.985768230048011e-06, "logits/chosen": -1.1246144771575928, "logits/rejected": -0.9621224403381348, "logps/chosen": -273.8687438964844, "logps/rejected": -1090.0859375, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": 0.17607423663139343, "rewards/margins": 8.613072395324707, "rewards/rejected": -8.43699836730957, "step": 150 }, { "epoch": 0.6986899563318777, "grad_norm": 0.01943896178498091, "learning_rate": 4.976488602981748e-06, "logits/chosen": -1.145508885383606, "logits/rejected": -0.9405732154846191, "logps/chosen": -249.9766387939453, "logps/rejected": -1222.8212890625, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 0.33843302726745605, "rewards/margins": 9.41877269744873, "rewards/rejected": -9.080339431762695, "step": 160 }, { "epoch": 0.74235807860262, "grad_norm": 0.019746462175002255, "learning_rate": 4.9649052651038255e-06, "logits/chosen": -1.0897105932235718, "logits/rejected": -0.9191669225692749, "logps/chosen": -267.56549072265625, "logps/rejected": -1116.246826171875, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 0.28829699754714966, "rewards/margins": 8.88953971862793, "rewards/rejected": -8.601243019104004, "step": 170 }, { "epoch": 0.7860262008733624, "grad_norm": 0.014841671172711, "learning_rate": 4.9510289916145295e-06, "logits/chosen": -1.0843827724456787, "logits/rejected": -0.8706234693527222, "logps/chosen": -254.61972045898438, "logps/rejected": -1236.5650634765625, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 0.22450891137123108, "rewards/margins": 9.739435195922852, "rewards/rejected": -9.514925956726074, "step": 180 }, { "epoch": 0.8296943231441049, "grad_norm": 0.012319918167579146, "learning_rate": 4.934872690677953e-06, "logits/chosen": -1.0835936069488525, "logits/rejected": -0.8566627502441406, "logps/chosen": -267.7489013671875, "logps/rejected": -1272.4853515625, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 0.1923988163471222, "rewards/margins": 9.936999320983887, "rewards/rejected": -9.744600296020508, "step": 190 }, { "epoch": 0.8733624454148472, "grad_norm": 0.005430948770812638, "learning_rate": 4.9164513914144005e-06, "logits/chosen": -1.0162463188171387, "logits/rejected": -0.7294031381607056, "logps/chosen": -249.8229522705078, "logps/rejected": -1294.401611328125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 0.20589938759803772, "rewards/margins": 10.359135627746582, "rewards/rejected": -10.153237342834473, "step": 200 }, { "epoch": 0.9170305676855895, "grad_norm": 0.009500725575344962, "learning_rate": 4.8957822299198045e-06, "logits/chosen": -1.1154823303222656, "logits/rejected": -0.8606049418449402, "logps/chosen": -262.8265686035156, "logps/rejected": -1394.8551025390625, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 0.19020363688468933, "rewards/margins": 11.131672859191895, "rewards/rejected": -10.941469192504883, "step": 210 }, { "epoch": 0.9606986899563319, "grad_norm": 0.02542499541949849, "learning_rate": 4.872884433325169e-06, "logits/chosen": -1.0547072887420654, "logits/rejected": -0.8354827165603638, "logps/chosen": -282.1262512207031, "logps/rejected": -1277.065673828125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 0.10952471196651459, "rewards/margins": 10.300054550170898, "rewards/rejected": -10.190529823303223, "step": 220 }, { "epoch": 1.0043668122270741, "grad_norm": 0.019502802530769667, "learning_rate": 4.847779301910868e-06, "logits/chosen": -1.088416576385498, "logits/rejected": -0.8197258114814758, "logps/chosen": -240.4183807373047, "logps/rejected": -1398.536376953125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 0.14008644223213196, "rewards/margins": 11.213750839233398, "rewards/rejected": -11.073664665222168, "step": 230 }, { "epoch": 1.0480349344978166, "grad_norm": 0.025635188925498188, "learning_rate": 4.820490189292415e-06, "logits/chosen": -1.019892930984497, "logits/rejected": -0.7206239700317383, "logps/chosen": -249.9215087890625, "logps/rejected": -1368.528564453125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 0.11964831501245499, "rewards/margins": 11.160282135009766, "rewards/rejected": -11.040634155273438, "step": 240 }, { "epoch": 1.091703056768559, "grad_norm": 0.0019862259316801, "learning_rate": 4.791042480696179e-06, "logits/chosen": -1.1091744899749756, "logits/rejected": -0.8607629537582397, "logps/chosen": -280.6003112792969, "logps/rejected": -1413.723388671875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 0.14324969053268433, "rewards/margins": 11.427762985229492, "rewards/rejected": -11.284514427185059, "step": 250 }, { "epoch": 1.1353711790393013, "grad_norm": 0.003461068195787667, "learning_rate": 4.759463569345205e-06, "logits/chosen": -1.0466573238372803, "logits/rejected": -0.8044939041137695, "logps/chosen": -261.8208312988281, "logps/rejected": -1364.37255859375, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 0.16325196623802185, "rewards/margins": 11.088302612304688, "rewards/rejected": -10.925050735473633, "step": 260 }, { "epoch": 1.1790393013100438, "grad_norm": 0.005672203724021556, "learning_rate": 4.725782830977145e-06, "logits/chosen": -1.0772409439086914, "logits/rejected": -0.7674180865287781, "logps/chosen": -244.04861450195312, "logps/rejected": -1468.4453125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 0.1920471489429474, "rewards/margins": 11.930693626403809, "rewards/rejected": -11.738645553588867, "step": 270 }, { "epoch": 1.222707423580786, "grad_norm": 0.00517545244251001, "learning_rate": 4.690031596517984e-06, "logits/chosen": -1.0256779193878174, "logits/rejected": -0.7417846918106079, "logps/chosen": -246.1106414794922, "logps/rejected": -1458.6060791015625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 0.17811021208763123, "rewards/margins": 11.871198654174805, "rewards/rejected": -11.69308853149414, "step": 280 }, { "epoch": 1.2663755458515285, "grad_norm": 0.0052311080547555655, "learning_rate": 4.652243122936987e-06, "logits/chosen": -1.0737969875335693, "logits/rejected": -0.8021048307418823, "logps/chosen": -262.6719665527344, "logps/rejected": -1506.664306640625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 0.19292786717414856, "rewards/margins": 12.288152694702148, "rewards/rejected": -12.09522533416748, "step": 290 }, { "epoch": 1.3100436681222707, "grad_norm": 0.007845893197308343, "learning_rate": 4.612452562309975e-06, "logits/chosen": -1.0275319814682007, "logits/rejected": -0.6741601228713989, "logps/chosen": -233.83407592773438, "logps/rejected": -1481.06640625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 0.20325830578804016, "rewards/margins": 12.083272933959961, "rewards/rejected": -11.88001537322998, "step": 300 }, { "epoch": 1.3537117903930131, "grad_norm": 0.001639011685908484, "learning_rate": 4.570696929119717e-06, "logits/chosen": -1.0229878425598145, "logits/rejected": -0.7648278474807739, "logps/chosen": -259.880615234375, "logps/rejected": -1439.419189453125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 0.13743802905082703, "rewards/margins": 11.878623962402344, "rewards/rejected": -11.741185188293457, "step": 310 }, { "epoch": 1.3973799126637554, "grad_norm": 0.002036377670452987, "learning_rate": 4.527015065823841e-06, "logits/chosen": -1.013264536857605, "logits/rejected": -0.7761918306350708, "logps/chosen": -272.4781799316406, "logps/rejected": -1432.2052001953125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 0.1656205952167511, "rewards/margins": 11.849295616149902, "rewards/rejected": -11.683676719665527, "step": 320 }, { "epoch": 1.4410480349344978, "grad_norm": 0.00825121579599894, "learning_rate": 4.481447606722309e-06, "logits/chosen": -1.0379178524017334, "logits/rejected": -0.7940360903739929, "logps/chosen": -274.08929443359375, "logps/rejected": -1472.6800537109375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 0.19115516543388367, "rewards/margins": 12.22935962677002, "rewards/rejected": -12.03820514678955, "step": 330 }, { "epoch": 1.48471615720524, "grad_norm": 0.006624246041316681, "learning_rate": 4.434036940158062e-06, "logits/chosen": -1.0243127346038818, "logits/rejected": -0.6224126219749451, "logps/chosen": -222.64013671875, "logps/rejected": -1671.427001953125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 0.19593994319438934, "rewards/margins": 13.684160232543945, "rewards/rejected": -13.48822021484375, "step": 340 }, { "epoch": 1.5283842794759825, "grad_norm": 0.00969023506429811, "learning_rate": 4.384827169085993e-06, "logits/chosen": -1.0768173933029175, "logits/rejected": -0.9027138948440552, "logps/chosen": -304.2896728515625, "logps/rejected": -1465.677001953125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 0.06724857538938522, "rewards/margins": 12.110160827636719, "rewards/rejected": -12.042912483215332, "step": 350 }, { "epoch": 1.572052401746725, "grad_norm": 0.008449136082982325, "learning_rate": 4.333864070046938e-06, "logits/chosen": -0.9996398687362671, "logits/rejected": -0.6732652187347412, "logps/chosen": -264.0496520996094, "logps/rejected": -1532.588134765625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 0.09769190102815628, "rewards/margins": 12.633995056152344, "rewards/rejected": -12.53630256652832, "step": 360 }, { "epoch": 1.6157205240174672, "grad_norm": 0.0026627154109082445, "learning_rate": 4.28119505058483e-06, "logits/chosen": -1.0139929056167603, "logits/rejected": -0.7123194336891174, "logps/chosen": -275.62481689453125, "logps/rejected": -1458.732421875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 0.08639346808195114, "rewards/margins": 12.07685661315918, "rewards/rejected": -11.990463256835938, "step": 370 }, { "epoch": 1.6593886462882095, "grad_norm": 0.004704630681169572, "learning_rate": 4.226869105146658e-06, "logits/chosen": -1.0525470972061157, "logits/rejected": -0.7499625086784363, "logps/chosen": -256.189453125, "logps/rejected": -1665.451904296875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 0.15123505890369415, "rewards/margins": 13.766995429992676, "rewards/rejected": -13.615760803222656, "step": 380 }, { "epoch": 1.703056768558952, "grad_norm": 0.008451726735303476, "learning_rate": 4.170936769506222e-06, "logits/chosen": -1.0494039058685303, "logits/rejected": -0.7635387182235718, "logps/chosen": -272.2103576660156, "logps/rejected": -1517.40625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 0.11954255402088165, "rewards/margins": 12.597132682800293, "rewards/rejected": -12.477590560913086, "step": 390 }, { "epoch": 1.7467248908296944, "grad_norm": 0.004007124213956819, "learning_rate": 4.1134500737541026e-06, "logits/chosen": -1.0403097867965698, "logits/rejected": -0.7625882029533386, "logps/chosen": -270.08642578125, "logps/rejected": -1559.534423828125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 0.14756450057029724, "rewards/margins": 12.893702507019043, "rewards/rejected": -12.746137619018555, "step": 400 }, { "epoch": 1.7903930131004366, "grad_norm": 0.008071006947580056, "learning_rate": 4.054462493897569e-06, "logits/chosen": -1.0538218021392822, "logits/rejected": -0.6631166934967041, "logps/chosen": -230.2433319091797, "logps/rejected": -1751.548095703125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 0.1488143503665924, "rewards/margins": 14.362360000610352, "rewards/rejected": -14.213546752929688, "step": 410 }, { "epoch": 1.8340611353711789, "grad_norm": 0.0018837924584018762, "learning_rate": 3.994028902115439e-06, "logits/chosen": -1.0209208726882935, "logits/rejected": -0.6195182204246521, "logps/chosen": -235.20816040039062, "logps/rejected": -1623.1568603515625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 0.12359651178121567, "rewards/margins": 13.363885879516602, "rewards/rejected": -13.240289688110352, "step": 420 }, { "epoch": 1.8777292576419216, "grad_norm": 0.0024505034148971385, "learning_rate": 3.932205515714189e-06, "logits/chosen": -1.0572965145111084, "logits/rejected": -0.764691948890686, "logps/chosen": -299.56732177734375, "logps/rejected": -1628.056884765625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.033301644027233124, "rewards/margins": 13.467303276062012, "rewards/rejected": -13.500605583190918, "step": 430 }, { "epoch": 1.9213973799126638, "grad_norm": 0.006019680096008564, "learning_rate": 3.86904984483277e-06, "logits/chosen": -1.0619957447052002, "logits/rejected": -0.7923606634140015, "logps/chosen": -298.5255432128906, "logps/rejected": -1547.7294921875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 0.007502234075218439, "rewards/margins": 12.943402290344238, "rewards/rejected": -12.93589973449707, "step": 440 }, { "epoch": 1.965065502183406, "grad_norm": 0.0039079828979289005, "learning_rate": 3.8046206389447916e-06, "logits/chosen": -1.0510538816452026, "logits/rejected": -0.7345406413078308, "logps/chosen": -275.7720947265625, "logps/rejected": -1645.208251953125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 0.0038163154385983944, "rewards/margins": 13.638483047485352, "rewards/rejected": -13.634668350219727, "step": 450 }, { "epoch": 2.0087336244541483, "grad_norm": 0.0017708249899463602, "learning_rate": 3.738977832207839e-06, "logits/chosen": -1.0177559852600098, "logits/rejected": -0.7104192972183228, "logps/chosen": -263.3643798828125, "logps/rejected": -1611.4786376953125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 0.02570909820497036, "rewards/margins": 13.336071968078613, "rewards/rejected": -13.3103609085083, "step": 460 }, { "epoch": 2.052401746724891, "grad_norm": 0.000997282058931975, "learning_rate": 3.6721824877107588e-06, "logits/chosen": -1.0775004625320435, "logits/rejected": -0.8090157508850098, "logps/chosen": -305.11627197265625, "logps/rejected": -1616.4625244140625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 0.022619858384132385, "rewards/margins": 13.472871780395508, "rewards/rejected": -13.450250625610352, "step": 470 }, { "epoch": 2.096069868995633, "grad_norm": 0.0066682395247295335, "learning_rate": 3.604296740670768e-06, "logits/chosen": -1.0786068439483643, "logits/rejected": -0.6820305585861206, "logps/chosen": -257.0702819824219, "logps/rejected": -1786.6517333984375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 0.09799555689096451, "rewards/margins": 14.890355110168457, "rewards/rejected": -14.7923583984375, "step": 480 }, { "epoch": 2.1397379912663754, "grad_norm": 0.004948689561737302, "learning_rate": 3.5353837406332464e-06, "logits/chosen": -1.0740149021148682, "logits/rejected": -0.7413342595100403, "logps/chosen": -247.671630859375, "logps/rejected": -1785.2659912109375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 0.1334933489561081, "rewards/margins": 14.81743335723877, "rewards/rejected": -14.683941841125488, "step": 490 }, { "epoch": 2.183406113537118, "grad_norm": 0.0032297799668046237, "learning_rate": 3.4655075927279576e-06, "logits/chosen": -1.0715500116348267, "logits/rejected": -0.7507299184799194, "logps/chosen": -257.148681640625, "logps/rejected": -1643.170654296875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 0.04553316906094551, "rewards/margins": 13.757342338562012, "rewards/rejected": -13.711812019348145, "step": 500 }, { "epoch": 2.2270742358078603, "grad_norm": 0.0018344305263622298, "learning_rate": 3.3947332980363552e-06, "logits/chosen": -0.9531366229057312, "logits/rejected": -0.5800803303718567, "logps/chosen": -267.22320556640625, "logps/rejected": -1683.7496337890625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 0.01091139018535614, "rewards/margins": 14.13904094696045, "rewards/rejected": -14.128130912780762, "step": 510 }, { "epoch": 2.2707423580786026, "grad_norm": 0.0008743192695472521, "learning_rate": 3.3231266931254546e-06, "logits/chosen": -1.0591729879379272, "logits/rejected": -0.7667123079299927, "logps/chosen": -288.68048095703125, "logps/rejected": -1706.4267578125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 0.048968568444252014, "rewards/margins": 14.315594673156738, "rewards/rejected": -14.266626358032227, "step": 520 }, { "epoch": 2.314410480349345, "grad_norm": 0.0006214259809055745, "learning_rate": 3.250754388804495e-06, "logits/chosen": -1.0545308589935303, "logits/rejected": -0.6721210479736328, "logps/chosen": -258.18707275390625, "logps/rejected": -1773.6435546875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 0.016913337633013725, "rewards/margins": 14.813578605651855, "rewards/rejected": -14.796663284301758, "step": 530 }, { "epoch": 2.3580786026200875, "grad_norm": 0.0009986936165298707, "learning_rate": 3.1776837081613893e-06, "logits/chosen": -0.9897233247756958, "logits/rejected": -0.6502217650413513, "logps/chosen": -265.50994873046875, "logps/rejected": -1707.3509521484375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 0.05818755552172661, "rewards/margins": 14.338269233703613, "rewards/rejected": -14.280080795288086, "step": 540 }, { "epoch": 2.4017467248908297, "grad_norm": 0.00984202902471183, "learning_rate": 3.1039826239365754e-06, "logits/chosen": -1.026111364364624, "logits/rejected": -0.7162208557128906, "logps/chosen": -260.0814514160156, "logps/rejected": -1701.0667724609375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 0.060255538672208786, "rewards/margins": 14.235310554504395, "rewards/rejected": -14.175054550170898, "step": 550 }, { "epoch": 2.445414847161572, "grad_norm": 0.0006428554350249238, "learning_rate": 3.0297196952925533e-06, "logits/chosen": -1.0254216194152832, "logits/rejected": -0.7658672332763672, "logps/chosen": -304.80194091796875, "logps/rejected": -1645.176025390625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.019381849095225334, "rewards/margins": 13.853315353393555, "rewards/rejected": -13.872695922851562, "step": 560 }, { "epoch": 2.489082969432314, "grad_norm": 0.0006532892925360383, "learning_rate": 2.9549640040379043e-06, "logits/chosen": -1.0421395301818848, "logits/rejected": -0.7338224649429321, "logps/chosen": -289.6170349121094, "logps/rejected": -1806.13671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.016583764925599098, "rewards/margins": 15.094789505004883, "rewards/rejected": -15.078204154968262, "step": 570 }, { "epoch": 2.532751091703057, "grad_norm": 0.0006677213433183142, "learning_rate": 2.8797850903651274e-06, "logits/chosen": -1.0362173318862915, "logits/rejected": -0.716796875, "logps/chosen": -265.3944091796875, "logps/rejected": -1734.1741943359375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 0.07199420034885406, "rewards/margins": 14.672683715820312, "rewards/rejected": -14.600687980651855, "step": 580 }, { "epoch": 2.576419213973799, "grad_norm": 0.0017150618685269877, "learning_rate": 2.804252888162079e-06, "logits/chosen": -1.1411523818969727, "logits/rejected": -0.7371311187744141, "logps/chosen": -265.2336120605469, "logps/rejected": -1891.736572265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.006541900336742401, "rewards/margins": 15.778755187988281, "rewards/rejected": -15.772212028503418, "step": 590 }, { "epoch": 2.6200873362445414, "grad_norm": 0.0014112958209940734, "learning_rate": 2.7284376599571776e-06, "logits/chosen": -1.054970622062683, "logits/rejected": -0.7131912112236023, "logps/chosen": -264.7176818847656, "logps/rejected": -1720.33984375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 0.05957148224115372, "rewards/margins": 14.534815788269043, "rewards/rejected": -14.475244522094727, "step": 600 }, { "epoch": 2.6637554585152836, "grad_norm": 0.0006698680618471773, "learning_rate": 2.652409931558898e-06, "logits/chosen": -1.053966760635376, "logits/rejected": -0.7491308450698853, "logps/chosen": -280.0963134765625, "logps/rejected": -1777.6761474609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.008257086388766766, "rewards/margins": 14.989705085754395, "rewards/rejected": -14.98144817352295, "step": 610 }, { "epoch": 2.7074235807860263, "grad_norm": 0.00042568362060489706, "learning_rate": 2.5762404264503538e-06, "logits/chosen": -0.9900799989700317, "logits/rejected": -0.6423364281654358, "logps/chosen": -269.9592590332031, "logps/rejected": -1832.5003662109375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 0.042371202260255814, "rewards/margins": 15.435081481933594, "rewards/rejected": -15.392709732055664, "step": 620 }, { "epoch": 2.7510917030567685, "grad_norm": 0.006748202311280723, "learning_rate": 2.5e-06, "logits/chosen": -1.0268778800964355, "logits/rejected": -0.7622432708740234, "logps/chosen": -285.1914978027344, "logps/rejected": -1769.4827880859375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.037803106009960175, "rewards/margins": 14.83598804473877, "rewards/rejected": -14.873788833618164, "step": 630 }, { "epoch": 2.7947598253275108, "grad_norm": 0.0002702166687339878, "learning_rate": 2.423759573549647e-06, "logits/chosen": -1.0029267072677612, "logits/rejected": -0.6316828727722168, "logps/chosen": -260.8511962890625, "logps/rejected": -1813.537841796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.039741456508636475, "rewards/margins": 15.310914993286133, "rewards/rejected": -15.271173477172852, "step": 640 }, { "epoch": 2.8384279475982535, "grad_norm": 0.0006646355698341602, "learning_rate": 2.3475900684411027e-06, "logits/chosen": -1.006449818611145, "logits/rejected": -0.6019884943962097, "logps/chosen": -250.7771453857422, "logps/rejected": -1806.491455078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.015752162784337997, "rewards/margins": 15.226987838745117, "rewards/rejected": -15.2427396774292, "step": 650 }, { "epoch": 2.8820960698689957, "grad_norm": 0.00047399361927177104, "learning_rate": 2.2715623400428228e-06, "logits/chosen": -1.0256198644638062, "logits/rejected": -0.7082680463790894, "logps/chosen": -287.2723083496094, "logps/rejected": -1791.8154296875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.028542857617139816, "rewards/margins": 15.111543655395508, "rewards/rejected": -15.14008617401123, "step": 660 }, { "epoch": 2.925764192139738, "grad_norm": 0.054595514895715194, "learning_rate": 2.1957471118379213e-06, "logits/chosen": -1.1037609577178955, "logits/rejected": -0.7577636241912842, "logps/chosen": -303.9775390625, "logps/rejected": -1865.5306396484375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.036632128059864044, "rewards/margins": 15.698193550109863, "rewards/rejected": -15.734827041625977, "step": 670 }, { "epoch": 2.96943231441048, "grad_norm": 0.0032068293343592976, "learning_rate": 2.120214909634873e-06, "logits/chosen": -1.0517921447753906, "logits/rejected": -0.770795464515686, "logps/chosen": -298.4246520996094, "logps/rejected": -1795.1331787109375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.014761227183043957, "rewards/margins": 15.1719970703125, "rewards/rejected": -15.186758041381836, "step": 680 }, { "epoch": 3.013100436681223, "grad_norm": 0.0011657499181104522, "learning_rate": 2.045035995962097e-06, "logits/chosen": -1.1193125247955322, "logits/rejected": -0.7743921279907227, "logps/chosen": -279.6126403808594, "logps/rejected": -1916.1976318359375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.046258725225925446, "rewards/margins": 16.126333236694336, "rewards/rejected": -16.172592163085938, "step": 690 }, { "epoch": 3.056768558951965, "grad_norm": 0.005031019960938767, "learning_rate": 1.970280304707447e-06, "logits/chosen": -1.0536220073699951, "logits/rejected": -0.6994236707687378, "logps/chosen": -274.2372131347656, "logps/rejected": -1818.864013671875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.030473018065094948, "rewards/margins": 15.32104206085205, "rewards/rejected": -15.351513862609863, "step": 700 }, { "epoch": 3.1004366812227073, "grad_norm": 0.0003812225537618957, "learning_rate": 1.8960173760634257e-06, "logits/chosen": -1.1193227767944336, "logits/rejected": -0.8343600034713745, "logps/chosen": -317.20367431640625, "logps/rejected": -1797.347900390625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.015081268735229969, "rewards/margins": 15.233189582824707, "rewards/rejected": -15.248270034790039, "step": 710 }, { "epoch": 3.14410480349345, "grad_norm": 0.000800716771561942, "learning_rate": 1.8223162918386122e-06, "logits/chosen": -1.0911037921905518, "logits/rejected": -0.7263038158416748, "logps/chosen": -281.10723876953125, "logps/rejected": -1893.2275390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.07366519421339035, "rewards/margins": 15.97996711730957, "rewards/rejected": -16.053630828857422, "step": 720 }, { "epoch": 3.1877729257641922, "grad_norm": 0.0033435888991239673, "learning_rate": 1.7492456111955052e-06, "logits/chosen": -1.0024917125701904, "logits/rejected": -0.6330265998840332, "logps/chosen": -266.6875915527344, "logps/rejected": -1799.3509521484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.05265193060040474, "rewards/margins": 15.307687759399414, "rewards/rejected": -15.255037307739258, "step": 730 }, { "epoch": 3.2314410480349345, "grad_norm": 0.003033628662190874, "learning_rate": 1.6768733068745468e-06, "logits/chosen": -1.0493130683898926, "logits/rejected": -0.7165160775184631, "logps/chosen": -277.95294189453125, "logps/rejected": -1927.5621337890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.008759565651416779, "rewards/margins": 16.3278865814209, "rewards/rejected": -16.336645126342773, "step": 740 }, { "epoch": 3.2751091703056767, "grad_norm": 0.00030739766784929366, "learning_rate": 1.6052667019636462e-06, "logits/chosen": -1.0858592987060547, "logits/rejected": -0.7486026883125305, "logps/chosen": -285.6336669921875, "logps/rejected": -1904.0074462890625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.039479680359363556, "rewards/margins": 15.980633735656738, "rewards/rejected": -16.02011489868164, "step": 750 }, { "epoch": 3.3187772925764194, "grad_norm": 0.0099847141974217, "learning_rate": 1.5344924072720434e-06, "logits/chosen": -1.0522048473358154, "logits/rejected": -0.7434892058372498, "logps/chosen": -294.0987548828125, "logps/rejected": -1748.886962890625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.06646230816841125, "rewards/margins": 14.771476745605469, "rewards/rejected": -14.837939262390137, "step": 760 }, { "epoch": 3.3624454148471616, "grad_norm": 0.00046452138658813734, "learning_rate": 1.4646162593667535e-06, "logits/chosen": -1.0780597925186157, "logits/rejected": -0.8004827499389648, "logps/chosen": -309.17120361328125, "logps/rejected": -1823.7366943359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.043299149721860886, "rewards/margins": 15.575587272644043, "rewards/rejected": -15.532289505004883, "step": 770 }, { "epoch": 3.406113537117904, "grad_norm": 0.0018748318848776301, "learning_rate": 1.3957032593292319e-06, "logits/chosen": -1.042887568473816, "logits/rejected": -0.6965152621269226, "logps/chosen": -282.84088134765625, "logps/rejected": -1889.387451171875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 0.013044556602835655, "rewards/margins": 15.971423149108887, "rewards/rejected": -15.958378791809082, "step": 780 }, { "epoch": 3.449781659388646, "grad_norm": 0.0009191121751634733, "learning_rate": 1.3278175122892416e-06, "logits/chosen": -1.029362678527832, "logits/rejected": -0.6999324560165405, "logps/chosen": -269.5941467285156, "logps/rejected": -1812.1949462890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.10188305377960205, "rewards/margins": 15.274584770202637, "rewards/rejected": -15.376466751098633, "step": 790 }, { "epoch": 3.493449781659389, "grad_norm": 0.003126132491902847, "learning_rate": 1.261022167792161e-06, "logits/chosen": -1.0470654964447021, "logits/rejected": -0.6794711351394653, "logps/chosen": -278.49285888671875, "logps/rejected": -1961.002197265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.009503474459052086, "rewards/margins": 16.650577545166016, "rewards/rejected": -16.641077041625977, "step": 800 }, { "epoch": 3.537117903930131, "grad_norm": 0.0059086305628955095, "learning_rate": 1.195379361055209e-06, "logits/chosen": -0.9826908111572266, "logits/rejected": -0.6099307537078857, "logps/chosen": -278.61492919921875, "logps/rejected": -1975.528076171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.03355104476213455, "rewards/margins": 16.752765655517578, "rewards/rejected": -16.78631591796875, "step": 810 }, { "epoch": 3.5807860262008733, "grad_norm": 0.003101839804676522, "learning_rate": 1.1309501551672303e-06, "logits/chosen": -0.957851231098175, "logits/rejected": -0.5697217583656311, "logps/chosen": -272.66534423828125, "logps/rejected": -1837.3662109375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.07275199890136719, "rewards/margins": 15.55827522277832, "rewards/rejected": -15.631025314331055, "step": 820 }, { "epoch": 3.6244541484716155, "grad_norm": 0.006553990432039902, "learning_rate": 1.0677944842858112e-06, "logits/chosen": -1.0141226053237915, "logits/rejected": -0.6257001757621765, "logps/chosen": -280.70672607421875, "logps/rejected": -1861.9078369140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.05027519538998604, "rewards/margins": 15.878219604492188, "rewards/rejected": -15.928494453430176, "step": 830 }, { "epoch": 3.668122270742358, "grad_norm": 0.0003416983892558372, "learning_rate": 1.005971097884561e-06, "logits/chosen": -1.0156476497650146, "logits/rejected": -0.668128252029419, "logps/chosen": -302.67767333984375, "logps/rejected": -1935.9398193359375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.08607998490333557, "rewards/margins": 16.394611358642578, "rewards/rejected": -16.480690002441406, "step": 840 }, { "epoch": 3.7117903930131004, "grad_norm": 0.0015327333742183899, "learning_rate": 9.455375061024319e-07, "logits/chosen": -1.0618703365325928, "logits/rejected": -0.7237562537193298, "logps/chosen": -283.15557861328125, "logps/rejected": -1865.9681396484375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.054165177047252655, "rewards/margins": 15.781875610351562, "rewards/rejected": -15.836041450500488, "step": 850 }, { "epoch": 3.7554585152838427, "grad_norm": 0.004269773041594158, "learning_rate": 8.86549926245898e-07, "logits/chosen": -1.060253381729126, "logits/rejected": -0.6907247304916382, "logps/chosen": -279.11090087890625, "logps/rejected": -1889.4056396484375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.0650867372751236, "rewards/margins": 15.91883659362793, "rewards/rejected": -15.983922958374023, "step": 860 }, { "epoch": 3.7991266375545854, "grad_norm": 0.0010427555714299119, "learning_rate": 8.29063230493779e-07, "logits/chosen": -1.0654280185699463, "logits/rejected": -0.6975089907646179, "logps/chosen": -275.31011962890625, "logps/rejected": -1959.466796875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.10659889876842499, "rewards/margins": 16.469921112060547, "rewards/rejected": -16.576520919799805, "step": 870 }, { "epoch": 3.8427947598253276, "grad_norm": 0.0051211999521691845, "learning_rate": 7.731308948533431e-07, "logits/chosen": -1.0260651111602783, "logits/rejected": -0.7374225854873657, "logps/chosen": -309.5072937011719, "logps/rejected": -1826.8656005859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.08689963072538376, "rewards/margins": 15.4946928024292, "rewards/rejected": -15.58159351348877, "step": 880 }, { "epoch": 3.88646288209607, "grad_norm": 0.002864032933131896, "learning_rate": 7.188049494151703e-07, "logits/chosen": -0.9869282841682434, "logits/rejected": -0.6170369386672974, "logps/chosen": -270.72308349609375, "logps/rejected": -1809.623046875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.03863278776407242, "rewards/margins": 15.385714530944824, "rewards/rejected": -15.424346923828125, "step": 890 }, { "epoch": 3.930131004366812, "grad_norm": 0.0005105250492242013, "learning_rate": 6.661359299530626e-07, "logits/chosen": -1.0458041429519653, "logits/rejected": -0.8353503346443176, "logps/chosen": -325.7237548828125, "logps/rejected": -1614.521484375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.15461190044879913, "rewards/margins": 13.781686782836914, "rewards/rejected": -13.936299324035645, "step": 900 }, { "epoch": 3.9737991266375547, "grad_norm": 0.0019150602563612286, "learning_rate": 6.151728309140071e-07, "logits/chosen": -1.0450153350830078, "logits/rejected": -0.7124982476234436, "logps/chosen": -285.0978698730469, "logps/rejected": -1913.976318359375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.026266271248459816, "rewards/margins": 16.297687530517578, "rewards/rejected": -16.32395362854004, "step": 910 }, { "epoch": 4.0174672489082965, "grad_norm": 0.003658964581230378, "learning_rate": 5.659630598419391e-07, "logits/chosen": -1.0644596815109253, "logits/rejected": -0.6967732906341553, "logps/chosen": -284.8916320800781, "logps/rejected": -1840.668212890625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.04237327724695206, "rewards/margins": 15.650690078735352, "rewards/rejected": -15.69306468963623, "step": 920 }, { "epoch": 4.06113537117904, "grad_norm": 0.015359415097265612, "learning_rate": 5.185523932776923e-07, "logits/chosen": -1.0555784702301025, "logits/rejected": -0.7345298528671265, "logps/chosen": -284.2557067871094, "logps/rejected": -1905.572509765625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.04300800338387489, "rewards/margins": 16.220272064208984, "rewards/rejected": -16.26327896118164, "step": 930 }, { "epoch": 4.104803493449782, "grad_norm": 0.0043225257316477665, "learning_rate": 4.7298493417616024e-07, "logits/chosen": -1.024458646774292, "logits/rejected": -0.7369360327720642, "logps/chosen": -297.3896484375, "logps/rejected": -1880.3125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.06699924170970917, "rewards/margins": 15.987092971801758, "rewards/rejected": -16.05409049987793, "step": 940 }, { "epoch": 4.148471615720524, "grad_norm": 0.0007120346767669291, "learning_rate": 4.293030708802834e-07, "logits/chosen": -1.0639699697494507, "logits/rejected": -0.7229939699172974, "logps/chosen": -270.7474060058594, "logps/rejected": -1974.702392578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.030464956536889076, "rewards/margins": 16.875165939331055, "rewards/rejected": -16.84469985961914, "step": 950 }, { "epoch": 4.192139737991266, "grad_norm": 0.0004859166750622356, "learning_rate": 3.875474376900254e-07, "logits/chosen": -0.9850033521652222, "logits/rejected": -0.6558709144592285, "logps/chosen": -287.7508544921875, "logps/rejected": -1824.685791015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.10610561072826385, "rewards/margins": 15.522783279418945, "rewards/rejected": -15.628889083862305, "step": 960 }, { "epoch": 4.235807860262009, "grad_norm": 0.002799456551978829, "learning_rate": 3.4775687706301437e-07, "logits/chosen": -1.1057827472686768, "logits/rejected": -0.7324444055557251, "logps/chosen": -268.9285583496094, "logps/rejected": -2027.9287109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.012243424542248249, "rewards/margins": 17.31172752380371, "rewards/rejected": -17.299488067626953, "step": 970 }, { "epoch": 4.279475982532751, "grad_norm": 0.0022373731136834133, "learning_rate": 3.0996840348201717e-07, "logits/chosen": -1.1121981143951416, "logits/rejected": -0.8360017538070679, "logps/chosen": -309.40155029296875, "logps/rejected": -1771.3427734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.12068891525268555, "rewards/margins": 14.9802827835083, "rewards/rejected": -15.100972175598145, "step": 980 }, { "epoch": 4.323144104803493, "grad_norm": 0.00044941977383980663, "learning_rate": 2.742171690228562e-07, "logits/chosen": -1.0458042621612549, "logits/rejected": -0.7098134756088257, "logps/chosen": -300.3705749511719, "logps/rejected": -1846.2822265625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.13983380794525146, "rewards/margins": 15.580610275268555, "rewards/rejected": -15.720443725585938, "step": 990 }, { "epoch": 4.366812227074236, "grad_norm": 0.002386856484501028, "learning_rate": 2.405364306547955e-07, "logits/chosen": -1.082747459411621, "logits/rejected": -0.8248895406723022, "logps/chosen": -306.8914794921875, "logps/rejected": -1777.709228515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.0018977939616888762, "rewards/margins": 15.146817207336426, "rewards/rejected": -15.148715019226074, "step": 1000 }, { "epoch": 4.4104803493449785, "grad_norm": 0.0029792452832107918, "learning_rate": 2.0895751930382125e-07, "logits/chosen": -1.0091092586517334, "logits/rejected": -0.6796575784683228, "logps/chosen": -295.8236389160156, "logps/rejected": -1882.607421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.007821875624358654, "rewards/margins": 16.001935958862305, "rewards/rejected": -16.00975799560547, "step": 1010 }, { "epoch": 4.454148471615721, "grad_norm": 0.0002768349770005794, "learning_rate": 1.7950981070758488e-07, "logits/chosen": -1.0478919744491577, "logits/rejected": -0.7594279050827026, "logps/chosen": -287.52288818359375, "logps/rejected": -1790.5562744140625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.0713457316160202, "rewards/margins": 15.24449634552002, "rewards/rejected": -15.315841674804688, "step": 1020 }, { "epoch": 4.497816593886463, "grad_norm": 0.005069342731202753, "learning_rate": 1.5222069808913303e-07, "logits/chosen": -1.0174686908721924, "logits/rejected": -0.683228611946106, "logps/chosen": -301.92144775390625, "logps/rejected": -1872.2666015625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.04044654220342636, "rewards/margins": 15.973681449890137, "rewards/rejected": -16.01412582397461, "step": 1030 }, { "epoch": 4.541484716157205, "grad_norm": 0.0004959573429988225, "learning_rate": 1.271155666748311e-07, "logits/chosen": -1.0908715724945068, "logits/rejected": -0.7660378217697144, "logps/chosen": -306.63043212890625, "logps/rejected": -1871.072509765625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.09966543316841125, "rewards/margins": 15.866052627563477, "rewards/rejected": -15.965716361999512, "step": 1040 }, { "epoch": 4.585152838427947, "grad_norm": 0.00019462022413360382, "learning_rate": 1.0421777008019663e-07, "logits/chosen": -1.0952246189117432, "logits/rejected": -0.709916353225708, "logps/chosen": -295.17877197265625, "logps/rejected": -2033.247314453125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.07402383536100388, "rewards/margins": 17.205215454101562, "rewards/rejected": -17.279239654541016, "step": 1050 }, { "epoch": 4.62882096069869, "grad_norm": 0.0036632249464871505, "learning_rate": 8.354860858560021e-08, "logits/chosen": -1.041784405708313, "logits/rejected": -0.6548532247543335, "logps/chosen": -293.025146484375, "logps/rejected": -2029.672607421875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.0596146360039711, "rewards/margins": 17.3281307220459, "rewards/rejected": -17.387744903564453, "step": 1060 }, { "epoch": 4.672489082969433, "grad_norm": 0.00032895380717983924, "learning_rate": 6.512730932204698e-08, "logits/chosen": -1.0059609413146973, "logits/rejected": -0.7132547497749329, "logps/chosen": -317.78680419921875, "logps/rejected": -1786.2691650390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.03703427314758301, "rewards/margins": 15.275805473327637, "rewards/rejected": -15.312841415405273, "step": 1070 }, { "epoch": 4.716157205240175, "grad_norm": 0.0031606785955010904, "learning_rate": 4.897100838547081e-08, "logits/chosen": -1.0450844764709473, "logits/rejected": -0.7194000482559204, "logps/chosen": -301.86541748046875, "logps/rejected": -1955.65625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.05521082133054733, "rewards/margins": 16.574329376220703, "rewards/rejected": -16.62953758239746, "step": 1080 }, { "epoch": 4.759825327510917, "grad_norm": 0.000347629698955204, "learning_rate": 3.5094734896174985e-08, "logits/chosen": -1.080841302871704, "logits/rejected": -0.6831713914871216, "logps/chosen": -268.0194091796875, "logps/rejected": -1998.539306640625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.04163534566760063, "rewards/margins": 16.94226837158203, "rewards/rejected": -16.98390007019043, "step": 1090 }, { "epoch": 4.8034934497816595, "grad_norm": 0.004295417718120041, "learning_rate": 2.351139701825267e-08, "logits/chosen": -0.9642395973205566, "logits/rejected": -0.6139574646949768, "logps/chosen": -276.0238037109375, "logps/rejected": -1780.542236328125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.0755363255739212, "rewards/margins": 15.09814167022705, "rewards/rejected": -15.173677444458008, "step": 1100 }, { "epoch": 4.847161572052402, "grad_norm": 0.00037246559243259353, "learning_rate": 1.4231769951990326e-08, "logits/chosen": -1.033734679222107, "logits/rejected": -0.6827409267425537, "logps/chosen": -287.67584228515625, "logps/rejected": -1885.5576171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.08407995104789734, "rewards/margins": 16.050437927246094, "rewards/rejected": -16.134517669677734, "step": 1110 }, { "epoch": 4.890829694323144, "grad_norm": 0.0003852961387970997, "learning_rate": 7.264485910423447e-09, "logits/chosen": -0.9720686078071594, "logits/rejected": -0.5935501456260681, "logps/chosen": -265.44622802734375, "logps/rejected": -1915.1038818359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.024737151339650154, "rewards/margins": 16.284820556640625, "rewards/rejected": -16.309558868408203, "step": 1120 }, { "epoch": 4.934497816593886, "grad_norm": 0.015272373299215733, "learning_rate": 2.6160260893692833e-09, "logits/chosen": -1.0101922750473022, "logits/rejected": -0.6879107356071472, "logps/chosen": -289.36138916015625, "logps/rejected": -1796.4605712890625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 0.028253447264432907, "rewards/margins": 15.371627807617188, "rewards/rejected": -15.34337329864502, "step": 1130 }, { "epoch": 4.978165938864628, "grad_norm": 0.0002424398537573296, "learning_rate": 2.9071463840540936e-10, "logits/chosen": -1.0956311225891113, "logits/rejected": -0.8314415216445923, "logps/chosen": -341.66766357421875, "logps/rejected": -1909.0863037109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.11481740325689316, "rewards/margins": 16.255878448486328, "rewards/rejected": -16.370695114135742, "step": 1140 }, { "epoch": 5.0, "step": 1145, "total_flos": 0.0, "train_loss": 0.046525747915877304, "train_runtime": 10909.6849, "train_samples_per_second": 6.711, "train_steps_per_second": 0.105 } ], "logging_steps": 10, "max_steps": 1145, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }