{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.998691442030882, "eval_steps": 100, "global_step": 477, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.0416666666666667e-07, "logits/chosen": -2.856400966644287, "logits/rejected": -2.6539194583892822, "logps/chosen": -302.289794921875, "logps/rejected": -253.04373168945312, "loss": 2500.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02, "learning_rate": 1.0416666666666667e-06, "logits/chosen": -2.5851330757141113, "logits/rejected": -2.6188478469848633, "logps/chosen": -265.6952209472656, "logps/rejected": -261.4213562011719, "loss": 2495.385, "rewards/accuracies": 0.4375, "rewards/chosen": 0.005977082531899214, "rewards/margins": 0.0005994850071147084, "rewards/rejected": 0.005377596709877253, "step": 10 }, { "epoch": 0.04, "learning_rate": 2.0833333333333334e-06, "logits/chosen": -2.6101512908935547, "logits/rejected": -2.5939109325408936, "logps/chosen": -255.68185424804688, "logps/rejected": -248.1254119873047, "loss": 2457.86, "rewards/accuracies": 0.628125011920929, "rewards/chosen": 0.013690793886780739, "rewards/margins": 0.00916606467217207, "rewards/rejected": 0.004524729214608669, "step": 20 }, { "epoch": 0.06, "learning_rate": 3.125e-06, "logits/chosen": -2.604323148727417, "logits/rejected": -2.598053455352783, "logps/chosen": -254.423095703125, "logps/rejected": -226.73153686523438, "loss": 2402.3988, "rewards/accuracies": 0.703125, "rewards/chosen": 0.01266755722463131, "rewards/margins": 0.024019470438361168, "rewards/rejected": -0.01135191135108471, "step": 30 }, { "epoch": 0.08, "learning_rate": 4.166666666666667e-06, "logits/chosen": -2.6043972969055176, "logits/rejected": -2.582412004470825, "logps/chosen": -279.12042236328125, "logps/rejected": -241.2065887451172, "loss": 2290.4264, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": 0.024520257487893105, "rewards/margins": 0.0557018406689167, "rewards/rejected": -0.031181585043668747, "step": 40 }, { "epoch": 0.1, "learning_rate": 4.999731868769027e-06, "logits/chosen": -2.531161308288574, "logits/rejected": -2.5264387130737305, "logps/chosen": -252.51846313476562, "logps/rejected": -247.7227325439453, "loss": 2291.9322, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.029673133045434952, "rewards/margins": 0.08245684206485748, "rewards/rejected": -0.05278371647000313, "step": 50 }, { "epoch": 0.13, "learning_rate": 4.9903533134293035e-06, "logits/chosen": -2.545037031173706, "logits/rejected": -2.5416412353515625, "logps/chosen": -260.83905029296875, "logps/rejected": -239.8417205810547, "loss": 2269.9371, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.03231300041079521, "rewards/margins": 0.09112317860126495, "rewards/rejected": -0.05881017446517944, "step": 60 }, { "epoch": 0.15, "learning_rate": 4.967625656594782e-06, "logits/chosen": -2.5832419395446777, "logits/rejected": -2.564356565475464, "logps/chosen": -275.95452880859375, "logps/rejected": -264.7611083984375, "loss": 2236.1113, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": 0.036882974207401276, "rewards/margins": 0.08578891307115555, "rewards/rejected": -0.048905935138463974, "step": 70 }, { "epoch": 0.17, "learning_rate": 4.93167072587771e-06, "logits/chosen": -2.552919864654541, "logits/rejected": -2.524970293045044, "logps/chosen": -257.78448486328125, "logps/rejected": -262.3812561035156, "loss": 2220.0893, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.037375591695308685, "rewards/margins": 0.11339374631643295, "rewards/rejected": -0.07601816952228546, "step": 80 }, { "epoch": 0.19, "learning_rate": 4.882681251368549e-06, "logits/chosen": -2.56257963180542, "logits/rejected": -2.5289363861083984, "logps/chosen": -239.4860382080078, "logps/rejected": -252.36196899414062, "loss": 2167.3848, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.04182355850934982, "rewards/margins": 0.10886694490909576, "rewards/rejected": -0.06704337894916534, "step": 90 }, { "epoch": 0.21, "learning_rate": 4.8209198325401815e-06, "logits/chosen": -2.5551962852478027, "logits/rejected": -2.562063455581665, "logps/chosen": -266.8739013671875, "logps/rejected": -269.649169921875, "loss": 2149.4746, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.04759662598371506, "rewards/margins": 0.1307816356420517, "rewards/rejected": -0.08318501710891724, "step": 100 }, { "epoch": 0.21, "eval_logits/chosen": -2.222931385040283, "eval_logits/rejected": -2.1770126819610596, "eval_logps/chosen": -260.57818603515625, "eval_logps/rejected": -253.25228881835938, "eval_loss": 2190.7666015625, "eval_rewards/accuracies": 0.7460317611694336, "eval_rewards/chosen": 0.044464047998189926, "eval_rewards/margins": 0.12927772104740143, "eval_rewards/rejected": -0.0848136618733406, "eval_runtime": 549.355, "eval_samples_per_second": 3.641, "eval_steps_per_second": 0.115, "step": 100 }, { "epoch": 0.23, "learning_rate": 4.746717530629565e-06, "logits/chosen": -2.5229454040527344, "logits/rejected": -2.5105621814727783, "logps/chosen": -261.46649169921875, "logps/rejected": -256.37835693359375, "loss": 2174.1184, "rewards/accuracies": 0.746874988079071, "rewards/chosen": 0.03517655283212662, "rewards/margins": 0.11897265911102295, "rewards/rejected": -0.08379611372947693, "step": 110 }, { "epoch": 0.25, "learning_rate": 4.660472094042121e-06, "logits/chosen": -2.5114097595214844, "logits/rejected": -2.481840133666992, "logps/chosen": -246.70370483398438, "logps/rejected": -238.27621459960938, "loss": 2181.3053, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.044524095952510834, "rewards/margins": 0.10293309390544891, "rewards/rejected": -0.05840899422764778, "step": 120 }, { "epoch": 0.27, "learning_rate": 4.5626458262912745e-06, "logits/chosen": -2.4726600646972656, "logits/rejected": -2.46514630317688, "logps/chosen": -271.7862548828125, "logps/rejected": -260.61676025390625, "loss": 2175.3252, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.06200919300317764, "rewards/margins": 0.12613125145435333, "rewards/rejected": -0.06412206590175629, "step": 130 }, { "epoch": 0.29, "learning_rate": 4.453763107901676e-06, "logits/chosen": -2.506436586380005, "logits/rejected": -2.5005128383636475, "logps/chosen": -237.8655242919922, "logps/rejected": -249.9298553466797, "loss": 2167.2516, "rewards/accuracies": 0.734375, "rewards/chosen": 0.024008702486753464, "rewards/margins": 0.1495535969734192, "rewards/rejected": -0.12554487586021423, "step": 140 }, { "epoch": 0.31, "learning_rate": 4.33440758555951e-06, "logits/chosen": -2.5227842330932617, "logits/rejected": -2.536785364151001, "logps/chosen": -260.7518005371094, "logps/rejected": -235.9630889892578, "loss": 2119.4062, "rewards/accuracies": 0.715624988079071, "rewards/chosen": 0.04733316972851753, "rewards/margins": 0.12345732748508453, "rewards/rejected": -0.0761241465806961, "step": 150 }, { "epoch": 0.33, "learning_rate": 4.205219043576955e-06, "logits/chosen": -2.5534234046936035, "logits/rejected": -2.4914207458496094, "logps/chosen": -254.14065551757812, "logps/rejected": -250.95700073242188, "loss": 2114.7645, "rewards/accuracies": 0.778124988079071, "rewards/chosen": 0.06031092256307602, "rewards/margins": 0.15202957391738892, "rewards/rejected": -0.09171866625547409, "step": 160 }, { "epoch": 0.36, "learning_rate": 4.066889974440757e-06, "logits/chosen": -2.5092320442199707, "logits/rejected": -2.4965577125549316, "logps/chosen": -254.91439819335938, "logps/rejected": -242.8040008544922, "loss": 2229.8135, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.034448813647031784, "rewards/margins": 0.12951095402240753, "rewards/rejected": -0.09506212174892426, "step": 170 }, { "epoch": 0.38, "learning_rate": 3.92016186682789e-06, "logits/chosen": -2.521221399307251, "logits/rejected": -2.533686399459839, "logps/chosen": -251.4235382080078, "logps/rejected": -259.76220703125, "loss": 2175.5213, "rewards/accuracies": 0.721875011920929, "rewards/chosen": 0.04062749817967415, "rewards/margins": 0.120635487139225, "rewards/rejected": -0.08000798523426056, "step": 180 }, { "epoch": 0.4, "learning_rate": 3.7658212309857576e-06, "logits/chosen": -2.5192363262176514, "logits/rejected": -2.4917151927948, "logps/chosen": -255.2060089111328, "logps/rejected": -250.82022094726562, "loss": 2099.443, "rewards/accuracies": 0.715624988079071, "rewards/chosen": 0.0492943711578846, "rewards/margins": 0.14053165912628174, "rewards/rejected": -0.09123729914426804, "step": 190 }, { "epoch": 0.42, "learning_rate": 3.604695382782159e-06, "logits/chosen": -2.5251801013946533, "logits/rejected": -2.5034642219543457, "logps/chosen": -269.3675537109375, "logps/rejected": -262.86376953125, "loss": 2105.1256, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.0575677752494812, "rewards/margins": 0.14340198040008545, "rewards/rejected": -0.08583419024944305, "step": 200 }, { "epoch": 0.42, "eval_logits/chosen": -2.260270833969116, "eval_logits/rejected": -2.2073864936828613, "eval_logps/chosen": -259.5941467285156, "eval_logps/rejected": -254.3839874267578, "eval_loss": 2151.155517578125, "eval_rewards/accuracies": 0.7599206566810608, "eval_rewards/chosen": 0.05430443957448006, "eval_rewards/margins": 0.15043501555919647, "eval_rewards/rejected": -0.09613056480884552, "eval_runtime": 548.195, "eval_samples_per_second": 3.648, "eval_steps_per_second": 0.115, "step": 200 }, { "epoch": 0.44, "learning_rate": 3.437648009023905e-06, "logits/chosen": -2.533383369445801, "logits/rejected": -2.4935860633850098, "logps/chosen": -243.6236114501953, "logps/rejected": -238.85140991210938, "loss": 2145.5416, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.06410142779350281, "rewards/margins": 0.14374245703220367, "rewards/rejected": -0.07964102178812027, "step": 210 }, { "epoch": 0.46, "learning_rate": 3.265574537815398e-06, "logits/chosen": -2.554565906524658, "logits/rejected": -2.56289005279541, "logps/chosen": -277.4061584472656, "logps/rejected": -253.40048217773438, "loss": 2196.8484, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.052330613136291504, "rewards/margins": 0.11339585483074188, "rewards/rejected": -0.06106524541974068, "step": 220 }, { "epoch": 0.48, "learning_rate": 3.089397338773569e-06, "logits/chosen": -2.4857611656188965, "logits/rejected": -2.473193407058716, "logps/chosen": -247.3427276611328, "logps/rejected": -241.8627471923828, "loss": 2160.1729, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.03845102712512016, "rewards/margins": 0.11976752430200577, "rewards/rejected": -0.0813164934515953, "step": 230 }, { "epoch": 0.5, "learning_rate": 2.9100607788275547e-06, "logits/chosen": -2.5121560096740723, "logits/rejected": -2.516338586807251, "logps/chosen": -257.1769714355469, "logps/rejected": -247.3695068359375, "loss": 2185.7641, "rewards/accuracies": 0.684374988079071, "rewards/chosen": 0.0379050187766552, "rewards/margins": 0.11140499264001846, "rewards/rejected": -0.07349997013807297, "step": 240 }, { "epoch": 0.52, "learning_rate": 2.72852616010567e-06, "logits/chosen": -2.5092978477478027, "logits/rejected": -2.487090826034546, "logps/chosen": -264.5955505371094, "logps/rejected": -246.3382110595703, "loss": 2136.6197, "rewards/accuracies": 0.7406250238418579, "rewards/chosen": 0.039962492883205414, "rewards/margins": 0.1403963267803192, "rewards/rejected": -0.1004338413476944, "step": 250 }, { "epoch": 0.54, "learning_rate": 2.5457665670441937e-06, "logits/chosen": -2.5069711208343506, "logits/rejected": -2.5030505657196045, "logps/chosen": -257.4859619140625, "logps/rejected": -231.91958618164062, "loss": 2085.2795, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.05723271518945694, "rewards/margins": 0.15024301409721375, "rewards/rejected": -0.0930103212594986, "step": 260 }, { "epoch": 0.57, "learning_rate": 2.3627616503391813e-06, "logits/chosen": -2.525665760040283, "logits/rejected": -2.5043163299560547, "logps/chosen": -280.7471618652344, "logps/rejected": -267.36712646484375, "loss": 2089.859, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.05569761246442795, "rewards/margins": 0.179846853017807, "rewards/rejected": -0.12414924055337906, "step": 270 }, { "epoch": 0.59, "learning_rate": 2.1804923757009885e-06, "logits/chosen": -2.500837564468384, "logits/rejected": -2.501950740814209, "logps/chosen": -270.04193115234375, "logps/rejected": -248.61978149414062, "loss": 2111.6906, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.05320361256599426, "rewards/margins": 0.1410333216190338, "rewards/rejected": -0.08782971650362015, "step": 280 }, { "epoch": 0.61, "learning_rate": 1.9999357655598894e-06, "logits/chosen": -2.5122292041778564, "logits/rejected": -2.50368070602417, "logps/chosen": -258.72686767578125, "logps/rejected": -256.91387939453125, "loss": 2137.0592, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.053160279989242554, "rewards/margins": 0.15454119443893433, "rewards/rejected": -0.10138092190027237, "step": 290 }, { "epoch": 0.63, "learning_rate": 1.8220596619089576e-06, "logits/chosen": -2.471623659133911, "logits/rejected": -2.4690403938293457, "logps/chosen": -246.51766967773438, "logps/rejected": -251.79257202148438, "loss": 2135.4973, "rewards/accuracies": 0.71875, "rewards/chosen": 0.0453377440571785, "rewards/margins": 0.12641170620918274, "rewards/rejected": -0.08107397705316544, "step": 300 }, { "epoch": 0.63, "eval_logits/chosen": -2.2764506340026855, "eval_logits/rejected": -2.2231767177581787, "eval_logps/chosen": -258.7624206542969, "eval_logps/rejected": -252.75852966308594, "eval_loss": 2129.089599609375, "eval_rewards/accuracies": 0.7559523582458496, "eval_rewards/chosen": 0.06262180209159851, "eval_rewards/margins": 0.14249789714813232, "eval_rewards/rejected": -0.07987607270479202, "eval_runtime": 547.9938, "eval_samples_per_second": 3.65, "eval_steps_per_second": 0.115, "step": 300 }, { "epoch": 0.65, "learning_rate": 1.647817538357072e-06, "logits/chosen": -2.5041086673736572, "logits/rejected": -2.495436191558838, "logps/chosen": -264.5109558105469, "logps/rejected": -248.3275604248047, "loss": 2107.123, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.05480458214879036, "rewards/margins": 0.13964474201202393, "rewards/rejected": -0.08484016358852386, "step": 310 }, { "epoch": 0.67, "learning_rate": 1.4781433892011132e-06, "logits/chosen": -2.53191876411438, "logits/rejected": -2.4989166259765625, "logps/chosen": -242.36599731445312, "logps/rejected": -243.78067016601562, "loss": 2076.0621, "rewards/accuracies": 0.7718750238418579, "rewards/chosen": 0.05456935614347458, "rewards/margins": 0.14978976547718048, "rewards/rejected": -0.0952204093337059, "step": 320 }, { "epoch": 0.69, "learning_rate": 1.3139467229135999e-06, "logits/chosen": -2.4768006801605225, "logits/rejected": -2.4569873809814453, "logps/chosen": -263.0523681640625, "logps/rejected": -250.5469207763672, "loss": 2112.1141, "rewards/accuracies": 0.734375, "rewards/chosen": 0.044828929007053375, "rewards/margins": 0.13050048053264618, "rewards/rejected": -0.0856715738773346, "step": 330 }, { "epoch": 0.71, "learning_rate": 1.1561076868822756e-06, "logits/chosen": -2.5158028602600098, "logits/rejected": -2.5096983909606934, "logps/chosen": -275.6848449707031, "logps/rejected": -246.7259979248047, "loss": 2151.2445, "rewards/accuracies": 0.746874988079071, "rewards/chosen": 0.052164845168590546, "rewards/margins": 0.15314052999019623, "rewards/rejected": -0.10097566992044449, "step": 340 }, { "epoch": 0.73, "learning_rate": 1.0054723495346484e-06, "logits/chosen": -2.518799304962158, "logits/rejected": -2.4620516300201416, "logps/chosen": -249.27401733398438, "logps/rejected": -218.7183074951172, "loss": 2093.9803, "rewards/accuracies": 0.7593749761581421, "rewards/chosen": 0.0662151575088501, "rewards/margins": 0.14556364715099335, "rewards/rejected": -0.07934850454330444, "step": 350 }, { "epoch": 0.75, "learning_rate": 8.628481651367876e-07, "logits/chosen": -2.5340943336486816, "logits/rejected": -2.5006654262542725, "logps/chosen": -260.32464599609375, "logps/rejected": -237.3218536376953, "loss": 2094.1246, "rewards/accuracies": 0.765625, "rewards/chosen": 0.05396001785993576, "rewards/margins": 0.15317106246948242, "rewards/rejected": -0.09921105206012726, "step": 360 }, { "epoch": 0.77, "learning_rate": 7.289996455765749e-07, "logits/chosen": -2.529265880584717, "logits/rejected": -2.515712261199951, "logps/chosen": -266.943115234375, "logps/rejected": -246.0579376220703, "loss": 2115.357, "rewards/accuracies": 0.71875, "rewards/chosen": 0.052078358829021454, "rewards/margins": 0.1462351232767105, "rewards/rejected": -0.09415675699710846, "step": 370 }, { "epoch": 0.8, "learning_rate": 6.046442623320145e-07, "logits/chosen": -2.4891440868377686, "logits/rejected": -2.499753952026367, "logps/chosen": -253.51632690429688, "logps/rejected": -245.4505615234375, "loss": 2082.182, "rewards/accuracies": 0.7406250238418579, "rewards/chosen": 0.051686953753232956, "rewards/margins": 0.1390691101551056, "rewards/rejected": -0.08738215267658234, "step": 380 }, { "epoch": 0.82, "learning_rate": 4.904486005914027e-07, "logits/chosen": -2.532160997390747, "logits/rejected": -2.5001654624938965, "logps/chosen": -280.9754333496094, "logps/rejected": -279.0588684082031, "loss": 2114.3043, "rewards/accuracies": 0.7593749761581421, "rewards/chosen": 0.0547635443508625, "rewards/margins": 0.14076778292655945, "rewards/rejected": -0.08600424975156784, "step": 390 }, { "epoch": 0.84, "learning_rate": 3.8702478614051353e-07, "logits/chosen": -2.4791765213012695, "logits/rejected": -2.4799935817718506, "logps/chosen": -246.14102172851562, "logps/rejected": -251.533447265625, "loss": 2099.8018, "rewards/accuracies": 0.703125, "rewards/chosen": 0.0392024889588356, "rewards/margins": 0.13221651315689087, "rewards/rejected": -0.09301402419805527, "step": 400 }, { "epoch": 0.84, "eval_logits/chosen": -2.254145860671997, "eval_logits/rejected": -2.2016360759735107, "eval_logps/chosen": -259.64398193359375, "eval_logps/rejected": -254.3590850830078, "eval_loss": 2121.667236328125, "eval_rewards/accuracies": 0.7539682388305664, "eval_rewards/chosen": 0.05380600318312645, "eval_rewards/margins": 0.14968746900558472, "eval_rewards/rejected": -0.09588146954774857, "eval_runtime": 547.9727, "eval_samples_per_second": 3.65, "eval_steps_per_second": 0.115, "step": 400 }, { "epoch": 0.86, "learning_rate": 2.9492720416985004e-07, "logits/chosen": -2.4832329750061035, "logits/rejected": -2.463463306427002, "logps/chosen": -284.7741394042969, "logps/rejected": -252.4269561767578, "loss": 2145.448, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.05263269692659378, "rewards/margins": 0.15021036565303802, "rewards/rejected": -0.09757767617702484, "step": 410 }, { "epoch": 0.88, "learning_rate": 2.1464952759020857e-07, "logits/chosen": -2.4804348945617676, "logits/rejected": -2.457764148712158, "logps/chosen": -254.78604125976562, "logps/rejected": -278.61346435546875, "loss": 2123.6629, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": 0.033899884670972824, "rewards/margins": 0.11116783320903778, "rewards/rejected": -0.07726795971393585, "step": 420 }, { "epoch": 0.9, "learning_rate": 1.4662207078575685e-07, "logits/chosen": -2.4848549365997314, "logits/rejected": -2.485640048980713, "logps/chosen": -268.3457336425781, "logps/rejected": -268.5885925292969, "loss": 2144.4309, "rewards/accuracies": 0.721875011920929, "rewards/chosen": 0.03841588646173477, "rewards/margins": 0.13024446368217468, "rewards/rejected": -0.09182857722043991, "step": 430 }, { "epoch": 0.92, "learning_rate": 9.120948298936422e-08, "logits/chosen": -2.457054615020752, "logits/rejected": -2.4329726696014404, "logps/chosen": -231.9584197998047, "logps/rejected": -234.6277313232422, "loss": 2118.3984, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.038600482046604156, "rewards/margins": 0.13669805228710175, "rewards/rejected": -0.09809757024049759, "step": 440 }, { "epoch": 0.94, "learning_rate": 4.870879364444109e-08, "logits/chosen": -2.5156655311584473, "logits/rejected": -2.563300848007202, "logps/chosen": -263.9936218261719, "logps/rejected": -265.6227722167969, "loss": 2123.5402, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.04902677983045578, "rewards/margins": 0.1260160207748413, "rewards/rejected": -0.07698923349380493, "step": 450 }, { "epoch": 0.96, "learning_rate": 1.93478202307823e-08, "logits/chosen": -2.470996379852295, "logits/rejected": -2.4720451831817627, "logps/chosen": -258.21734619140625, "logps/rejected": -262.04925537109375, "loss": 2078.5094, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": 0.04391016811132431, "rewards/margins": 0.14817874133586884, "rewards/rejected": -0.10426857322454453, "step": 460 }, { "epoch": 0.98, "learning_rate": 3.283947088983663e-09, "logits/chosen": -2.513140916824341, "logits/rejected": -2.535651206970215, "logps/chosen": -249.6727752685547, "logps/rejected": -248.2782745361328, "loss": 2093.2779, "rewards/accuracies": 0.75, "rewards/chosen": 0.04741714522242546, "rewards/margins": 0.143958181142807, "rewards/rejected": -0.09654103964567184, "step": 470 }, { "epoch": 1.0, "step": 477, "total_flos": 0.0, "train_loss": 2164.5614415454666, "train_runtime": 32346.8016, "train_samples_per_second": 1.89, "train_steps_per_second": 0.015 } ], "logging_steps": 10, "max_steps": 477, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }