{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.99562408835174, "eval_steps": 200, "global_step": 1797, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 22.077117919921875, "learning_rate": 9.999251052313705e-06, "logits/chosen": 0.9130447506904602, "logits/rejected": 0.9451152682304382, "logps/chosen": -119.4351806640625, "logps/rejected": -146.38978576660156, "loss": 2.3628, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 23.45577621459961, "rewards/margins": 1.552248477935791, "rewards/rejected": 21.903528213500977, "step": 10 }, { "epoch": 0.03, "grad_norm": 22.769922256469727, "learning_rate": 9.996974102027961e-06, "logits/chosen": 0.5303283929824829, "logits/rejected": 0.5954318642616272, "logps/chosen": -131.7035369873047, "logps/rejected": -162.93435668945312, "loss": 2.0403, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 23.408517837524414, "rewards/margins": 3.0596365928649902, "rewards/rejected": 20.3488826751709, "step": 20 }, { "epoch": 0.05, "grad_norm": 34.49003601074219, "learning_rate": 9.993618904787861e-06, "logits/chosen": 0.23024284839630127, "logits/rejected": 0.2893335521221161, "logps/chosen": -135.048828125, "logps/rejected": -175.6779022216797, "loss": 1.5462, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 22.8612003326416, "rewards/margins": 3.707317352294922, "rewards/rejected": 19.153881072998047, "step": 30 }, { "epoch": 0.07, "grad_norm": 26.99772071838379, "learning_rate": 9.98844090765316e-06, "logits/chosen": -0.12471504509449005, "logits/rejected": -0.05195974186062813, "logps/chosen": -137.30029296875, "logps/rejected": -179.8270721435547, "loss": 1.6569, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 21.461271286010742, "rewards/margins": 3.4389548301696777, "rewards/rejected": 18.022315979003906, "step": 40 }, { "epoch": 0.08, "grad_norm": 39.197853088378906, "learning_rate": 9.98173813574765e-06, "logits/chosen": -0.5969198346138, "logits/rejected": -0.5180742144584656, "logps/chosen": -145.24537658691406, "logps/rejected": -207.34591674804688, "loss": 1.5324, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 21.160579681396484, "rewards/margins": 5.194663047790527, "rewards/rejected": 15.965914726257324, "step": 50 }, { "epoch": 0.1, "grad_norm": 28.039043426513672, "learning_rate": 9.973512637851239e-06, "logits/chosen": -0.972032904624939, "logits/rejected": -0.9215275049209595, "logps/chosen": -159.76718139648438, "logps/rejected": -216.4589080810547, "loss": 1.515, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 19.871841430664062, "rewards/margins": 5.058460235595703, "rewards/rejected": 14.813380241394043, "step": 60 }, { "epoch": 0.12, "grad_norm": 11.507994651794434, "learning_rate": 9.963766928182676e-06, "logits/chosen": -1.2355167865753174, "logits/rejected": -1.1757726669311523, "logps/chosen": -153.20492553710938, "logps/rejected": -238.68008422851562, "loss": 1.1533, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 18.97603988647461, "rewards/margins": 6.053138732910156, "rewards/rejected": 12.922900199890137, "step": 70 }, { "epoch": 0.13, "grad_norm": 8.204204559326172, "learning_rate": 9.952503985631063e-06, "logits/chosen": -1.262416958808899, "logits/rejected": -1.1802202463150024, "logps/chosen": -134.965576171875, "logps/rejected": -221.21835327148438, "loss": 1.0608, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 20.67170524597168, "rewards/margins": 6.517201900482178, "rewards/rejected": 14.154504776000977, "step": 80 }, { "epoch": 0.15, "grad_norm": 27.41866111755371, "learning_rate": 9.939727252845304e-06, "logits/chosen": -0.9036836624145508, "logits/rejected": -0.8652833104133606, "logps/chosen": -141.32794189453125, "logps/rejected": -192.92916870117188, "loss": 1.2558, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 21.43319320678711, "rewards/margins": 5.654860496520996, "rewards/rejected": 15.77833080291748, "step": 90 }, { "epoch": 0.17, "grad_norm": 31.713287353515625, "learning_rate": 9.925440635181834e-06, "logits/chosen": -0.7909407615661621, "logits/rejected": -0.7640722393989563, "logps/chosen": -128.05874633789062, "logps/rejected": -197.05027770996094, "loss": 1.3031, "rewards/accuracies": 0.8125, "rewards/chosen": 21.722820281982422, "rewards/margins": 5.5480637550354, "rewards/rejected": 16.174755096435547, "step": 100 }, { "epoch": 0.18, "grad_norm": 5.262263298034668, "learning_rate": 9.909648499510903e-06, "logits/chosen": -0.7350383400917053, "logits/rejected": -0.7220867872238159, "logps/chosen": -141.88296508789062, "logps/rejected": -192.28868103027344, "loss": 1.2729, "rewards/accuracies": 0.875, "rewards/chosen": 20.970355987548828, "rewards/margins": 6.049419403076172, "rewards/rejected": 14.920933723449707, "step": 110 }, { "epoch": 0.2, "grad_norm": 20.070709228515625, "learning_rate": 9.892355672881781e-06, "logits/chosen": -0.8407080769538879, "logits/rejected": -0.8053030967712402, "logps/chosen": -129.39187622070312, "logps/rejected": -197.65792846679688, "loss": 1.4401, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 22.391189575195312, "rewards/margins": 6.017077445983887, "rewards/rejected": 16.374113082885742, "step": 120 }, { "epoch": 0.22, "grad_norm": 15.006941795349121, "learning_rate": 9.873567441047321e-06, "logits/chosen": -0.965476393699646, "logits/rejected": -0.8994636535644531, "logps/chosen": -115.93741607666016, "logps/rejected": -211.8863983154297, "loss": 0.9361, "rewards/accuracies": 0.875, "rewards/chosen": 23.738582611083984, "rewards/margins": 7.023127555847168, "rewards/rejected": 16.715452194213867, "step": 130 }, { "epoch": 0.23, "grad_norm": 21.54697608947754, "learning_rate": 9.853289546848304e-06, "logits/chosen": -1.1242530345916748, "logits/rejected": -1.0690996646881104, "logps/chosen": -136.72805786132812, "logps/rejected": -215.2025146484375, "loss": 0.994, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 21.464555740356445, "rewards/margins": 6.428616523742676, "rewards/rejected": 15.035940170288086, "step": 140 }, { "epoch": 0.25, "grad_norm": 36.22259521484375, "learning_rate": 9.83152818845808e-06, "logits/chosen": -1.238879680633545, "logits/rejected": -1.212425947189331, "logps/chosen": -135.1555938720703, "logps/rejected": -223.7063751220703, "loss": 1.5207, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 21.200214385986328, "rewards/margins": 6.609138488769531, "rewards/rejected": 14.59107780456543, "step": 150 }, { "epoch": 0.27, "grad_norm": 5.652963161468506, "learning_rate": 9.808290017488018e-06, "logits/chosen": -1.2760050296783447, "logits/rejected": -1.229891061782837, "logps/chosen": -124.10953521728516, "logps/rejected": -205.0720977783203, "loss": 1.1957, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 21.773719787597656, "rewards/margins": 6.910721778869629, "rewards/rejected": 14.862997055053711, "step": 160 }, { "epoch": 0.28, "grad_norm": 7.20241117477417, "learning_rate": 9.783582136954363e-06, "logits/chosen": -1.022456169128418, "logits/rejected": -1.0098379850387573, "logps/chosen": -138.91571044921875, "logps/rejected": -193.36270141601562, "loss": 1.1055, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 21.772497177124023, "rewards/margins": 7.529877662658691, "rewards/rejected": 14.242624282836914, "step": 170 }, { "epoch": 0.3, "grad_norm": 23.037525177001953, "learning_rate": 9.757412099107122e-06, "logits/chosen": -1.0896828174591064, "logits/rejected": -1.049005150794983, "logps/chosen": -126.07826232910156, "logps/rejected": -211.6298370361328, "loss": 1.1861, "rewards/accuracies": 0.875, "rewards/chosen": 22.460693359375, "rewards/margins": 7.169064998626709, "rewards/rejected": 15.29162883758545, "step": 180 }, { "epoch": 0.32, "grad_norm": 29.62506866455078, "learning_rate": 9.72978790312163e-06, "logits/chosen": -0.8976786732673645, "logits/rejected": -0.8687394857406616, "logps/chosen": -119.01469421386719, "logps/rejected": -198.20928955078125, "loss": 0.9781, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 23.478139877319336, "rewards/margins": 6.4206695556640625, "rewards/rejected": 17.057470321655273, "step": 190 }, { "epoch": 0.33, "grad_norm": 36.63336181640625, "learning_rate": 9.700717992653505e-06, "logits/chosen": -1.0064117908477783, "logits/rejected": -0.9567538499832153, "logps/chosen": -119.5957260131836, "logps/rejected": -198.21707153320312, "loss": 1.1726, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 22.104581832885742, "rewards/margins": 6.333141803741455, "rewards/rejected": 15.771438598632812, "step": 200 }, { "epoch": 0.33, "eval_logits/chosen": -0.7826857566833496, "eval_logits/rejected": -0.7607142925262451, "eval_logps/chosen": -162.48268127441406, "eval_logps/rejected": -199.3195037841797, "eval_loss": 2.607863664627075, "eval_rewards/accuracies": 0.5929077863693237, "eval_rewards/chosen": 14.876009941101074, "eval_rewards/margins": 1.181178092956543, "eval_rewards/rejected": 13.694831848144531, "eval_runtime": 941.4569, "eval_samples_per_second": 0.749, "eval_steps_per_second": 0.749, "step": 200 }, { "epoch": 0.35, "grad_norm": 5.637684345245361, "learning_rate": 9.670211253257753e-06, "logits/chosen": -1.359417200088501, "logits/rejected": -1.3465789556503296, "logps/chosen": -123.56754302978516, "logps/rejected": -221.25131225585938, "loss": 0.8259, "rewards/accuracies": 0.9375, "rewards/chosen": 22.563716888427734, "rewards/margins": 8.354880332946777, "rewards/rejected": 14.208834648132324, "step": 210 }, { "epoch": 0.37, "grad_norm": 18.743879318237305, "learning_rate": 9.638277009672787e-06, "logits/chosen": -1.294345736503601, "logits/rejected": -1.2438859939575195, "logps/chosen": -120.88092041015625, "logps/rejected": -219.56845092773438, "loss": 0.9568, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 22.052473068237305, "rewards/margins": 7.690165042877197, "rewards/rejected": 14.36230754852295, "step": 220 }, { "epoch": 0.38, "grad_norm": 4.425251483917236, "learning_rate": 9.604925022970226e-06, "logits/chosen": -1.4860092401504517, "logits/rejected": -1.4260871410369873, "logps/chosen": -116.0375747680664, "logps/rejected": -233.6222381591797, "loss": 0.8266, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 22.423358917236328, "rewards/margins": 7.869799613952637, "rewards/rejected": 14.553558349609375, "step": 230 }, { "epoch": 0.4, "grad_norm": 15.595026969909668, "learning_rate": 9.570165487571295e-06, "logits/chosen": -1.29686439037323, "logits/rejected": -1.2336632013320923, "logps/chosen": -114.66429138183594, "logps/rejected": -219.73941040039062, "loss": 0.9711, "rewards/accuracies": 0.875, "rewards/chosen": 23.410022735595703, "rewards/margins": 8.979305267333984, "rewards/rejected": 14.430717468261719, "step": 240 }, { "epoch": 0.42, "grad_norm": 8.037131309509277, "learning_rate": 9.534009028130791e-06, "logits/chosen": -1.3593500852584839, "logits/rejected": -1.3431365489959717, "logps/chosen": -116.7984619140625, "logps/rejected": -203.38714599609375, "loss": 1.0554, "rewards/accuracies": 0.875, "rewards/chosen": 22.437034606933594, "rewards/margins": 7.695396423339844, "rewards/rejected": 14.74163818359375, "step": 250 }, { "epoch": 0.43, "grad_norm": 24.600027084350586, "learning_rate": 9.496466696289533e-06, "logits/chosen": -1.296525478363037, "logits/rejected": -1.2403943538665771, "logps/chosen": -125.29974365234375, "logps/rejected": -220.273193359375, "loss": 1.0706, "rewards/accuracies": 0.875, "rewards/chosen": 22.99033546447754, "rewards/margins": 7.625124454498291, "rewards/rejected": 15.365211486816406, "step": 260 }, { "epoch": 0.45, "grad_norm": 12.19877815246582, "learning_rate": 9.4575499672963e-06, "logits/chosen": -1.330751657485962, "logits/rejected": -1.2963988780975342, "logps/chosen": -121.9324951171875, "logps/rejected": -204.8725128173828, "loss": 1.0879, "rewards/accuracies": 0.875, "rewards/chosen": 22.846813201904297, "rewards/margins": 6.9288506507873535, "rewards/rejected": 15.917961120605469, "step": 270 }, { "epoch": 0.47, "grad_norm": 61.58235168457031, "learning_rate": 9.417270736500284e-06, "logits/chosen": -1.2155901193618774, "logits/rejected": -1.1836094856262207, "logps/chosen": -120.52559661865234, "logps/rejected": -213.6286163330078, "loss": 1.044, "rewards/accuracies": 0.875, "rewards/chosen": 22.38599967956543, "rewards/margins": 7.965991973876953, "rewards/rejected": 14.420007705688477, "step": 280 }, { "epoch": 0.48, "grad_norm": 27.1291446685791, "learning_rate": 9.375641315715147e-06, "logits/chosen": -1.4014637470245361, "logits/rejected": -1.3697010278701782, "logps/chosen": -120.2726821899414, "logps/rejected": -248.0132293701172, "loss": 1.1305, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 23.45905113220215, "rewards/margins": 9.868653297424316, "rewards/rejected": 13.5903959274292, "step": 290 }, { "epoch": 0.5, "grad_norm": 4.111963748931885, "learning_rate": 9.332674429455762e-06, "logits/chosen": -1.3501853942871094, "logits/rejected": -1.3238388299942017, "logps/chosen": -120.29520416259766, "logps/rejected": -219.8441619873047, "loss": 0.9532, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 23.571392059326172, "rewards/margins": 10.177915573120117, "rewards/rejected": 13.393475532531738, "step": 300 }, { "epoch": 0.52, "grad_norm": 19.904033660888672, "learning_rate": 9.288383211048827e-06, "logits/chosen": -1.3211156129837036, "logits/rejected": -1.2947032451629639, "logps/chosen": -108.23663330078125, "logps/rejected": -220.5325164794922, "loss": 0.8943, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 23.694904327392578, "rewards/margins": 8.900300025939941, "rewards/rejected": 14.794604301452637, "step": 310 }, { "epoch": 0.53, "grad_norm": 5.557188987731934, "learning_rate": 9.242781198618508e-06, "logits/chosen": -1.4145915508270264, "logits/rejected": -1.4117708206176758, "logps/chosen": -116.71977233886719, "logps/rejected": -205.55245971679688, "loss": 0.8282, "rewards/accuracies": 0.9375, "rewards/chosen": 24.738483428955078, "rewards/margins": 9.999399185180664, "rewards/rejected": 14.739084243774414, "step": 320 }, { "epoch": 0.55, "grad_norm": 38.25541687011719, "learning_rate": 9.195882330948351e-06, "logits/chosen": -1.4355194568634033, "logits/rejected": -1.430084228515625, "logps/chosen": -116.34031677246094, "logps/rejected": -221.4161376953125, "loss": 0.7625, "rewards/accuracies": 0.9375, "rewards/chosen": 24.25949478149414, "rewards/margins": 10.02813720703125, "rewards/rejected": 14.231356620788574, "step": 330 }, { "epoch": 0.57, "grad_norm": 40.876747131347656, "learning_rate": 9.147700943220737e-06, "logits/chosen": -1.4224138259887695, "logits/rejected": -1.3659865856170654, "logps/chosen": -110.4036865234375, "logps/rejected": -227.19296264648438, "loss": 0.792, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 24.380083084106445, "rewards/margins": 9.536338806152344, "rewards/rejected": 14.843744277954102, "step": 340 }, { "epoch": 0.58, "grad_norm": 31.962615966796875, "learning_rate": 9.098251762635162e-06, "logits/chosen": -1.1947047710418701, "logits/rejected": -1.1642345190048218, "logps/chosen": -112.7500991821289, "logps/rejected": -222.45657348632812, "loss": 1.1409, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 23.021337509155273, "rewards/margins": 8.003363609313965, "rewards/rejected": 15.017971992492676, "step": 350 }, { "epoch": 0.6, "grad_norm": 22.922439575195312, "learning_rate": 9.047549903906704e-06, "logits/chosen": -1.4528108835220337, "logits/rejected": -1.4263083934783936, "logps/chosen": -112.0550537109375, "logps/rejected": -224.366943359375, "loss": 0.7604, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 24.369909286499023, "rewards/margins": 9.932126998901367, "rewards/rejected": 14.437784194946289, "step": 360 }, { "epoch": 0.62, "grad_norm": 15.393943786621094, "learning_rate": 8.99561086464603e-06, "logits/chosen": -1.4283573627471924, "logits/rejected": -1.4220378398895264, "logps/chosen": -102.64058685302734, "logps/rejected": -220.1312713623047, "loss": 0.6298, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 24.416704177856445, "rewards/margins": 9.940857887268066, "rewards/rejected": 14.475845336914062, "step": 370 }, { "epoch": 0.63, "grad_norm": 31.526750564575195, "learning_rate": 8.942450520622371e-06, "logits/chosen": -1.4221882820129395, "logits/rejected": -1.4053022861480713, "logps/chosen": -96.04936218261719, "logps/rejected": -219.9590301513672, "loss": 0.8734, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 25.633686065673828, "rewards/margins": 10.437232971191406, "rewards/rejected": 15.196454048156738, "step": 380 }, { "epoch": 0.65, "grad_norm": 21.012407302856445, "learning_rate": 8.888085120910917e-06, "logits/chosen": -1.5149024724960327, "logits/rejected": -1.5197780132293701, "logps/chosen": -121.2115249633789, "logps/rejected": -217.0883026123047, "loss": 0.9286, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 23.104969024658203, "rewards/margins": 8.534513473510742, "rewards/rejected": 14.570454597473145, "step": 390 }, { "epoch": 0.67, "grad_norm": 31.061025619506836, "learning_rate": 8.83253128292609e-06, "logits/chosen": -1.3974319696426392, "logits/rejected": -1.365886926651001, "logps/chosen": -106.92268371582031, "logps/rejected": -213.5463409423828, "loss": 1.0028, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 23.568645477294922, "rewards/margins": 9.186344146728516, "rewards/rejected": 14.382299423217773, "step": 400 }, { "epoch": 0.67, "eval_logits/chosen": -0.8742575645446777, "eval_logits/rejected": -0.8668540120124817, "eval_logps/chosen": -161.5126495361328, "eval_logps/rejected": -197.9254608154297, "eval_loss": 2.674311876296997, "eval_rewards/accuracies": 0.5843971371650696, "eval_rewards/chosen": 14.97301197052002, "eval_rewards/margins": 1.1387754678726196, "eval_rewards/rejected": 13.834238052368164, "eval_runtime": 926.938, "eval_samples_per_second": 0.761, "eval_steps_per_second": 0.761, "step": 400 }, { "epoch": 0.68, "grad_norm": 2.5695064067840576, "learning_rate": 8.77580598734224e-06, "logits/chosen": -1.3901078701019287, "logits/rejected": -1.361106514930725, "logps/chosen": -100.05810546875, "logps/rejected": -217.40853881835938, "loss": 0.8265, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 24.2711181640625, "rewards/margins": 9.111645698547363, "rewards/rejected": 15.159472465515137, "step": 410 }, { "epoch": 0.7, "grad_norm": 37.29205322265625, "learning_rate": 8.717926572903315e-06, "logits/chosen": -1.4033088684082031, "logits/rejected": -1.3668584823608398, "logps/chosen": -95.80254364013672, "logps/rejected": -216.8607635498047, "loss": 0.8961, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 25.137277603149414, "rewards/margins": 9.997251510620117, "rewards/rejected": 15.140027046203613, "step": 420 }, { "epoch": 0.72, "grad_norm": 13.271622657775879, "learning_rate": 8.658910731123056e-06, "logits/chosen": -1.2801754474639893, "logits/rejected": -1.2473911046981812, "logps/chosen": -97.76548767089844, "logps/rejected": -216.43551635742188, "loss": 0.8534, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 24.32241439819336, "rewards/margins": 9.534911155700684, "rewards/rejected": 14.787504196166992, "step": 430 }, { "epoch": 0.73, "grad_norm": 38.644073486328125, "learning_rate": 8.598776500877398e-06, "logits/chosen": -1.557314157485962, "logits/rejected": -1.5338244438171387, "logps/chosen": -101.72956085205078, "logps/rejected": -228.32504272460938, "loss": 0.7487, "rewards/accuracies": 0.9375, "rewards/chosen": 24.7454891204834, "rewards/margins": 11.237323760986328, "rewards/rejected": 13.50816535949707, "step": 440 }, { "epoch": 0.75, "grad_norm": 3.0210349559783936, "learning_rate": 8.537542262890664e-06, "logits/chosen": -1.6236152648925781, "logits/rejected": -1.5971102714538574, "logps/chosen": -95.90837097167969, "logps/rejected": -224.8739471435547, "loss": 0.6576, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 26.812732696533203, "rewards/margins": 12.709887504577637, "rewards/rejected": 14.102846145629883, "step": 450 }, { "epoch": 0.77, "grad_norm": 31.46013641357422, "learning_rate": 8.475226734117293e-06, "logits/chosen": -1.275268793106079, "logits/rejected": -1.2527453899383545, "logps/chosen": -90.21646881103516, "logps/rejected": -206.49935913085938, "loss": 0.6837, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 25.937389373779297, "rewards/margins": 10.999462127685547, "rewards/rejected": 14.93792724609375, "step": 460 }, { "epoch": 0.78, "grad_norm": 8.342381477355957, "learning_rate": 8.411848962020786e-06, "logits/chosen": -1.283499002456665, "logits/rejected": -1.2215421199798584, "logps/chosen": -92.12187194824219, "logps/rejected": -205.30447387695312, "loss": 0.7592, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 25.408042907714844, "rewards/margins": 9.172635078430176, "rewards/rejected": 16.235408782958984, "step": 470 }, { "epoch": 0.8, "grad_norm": 21.772287368774414, "learning_rate": 8.347428318751623e-06, "logits/chosen": -1.44618821144104, "logits/rejected": -1.3733718395233154, "logps/chosen": -84.4385986328125, "logps/rejected": -211.10861206054688, "loss": 0.6625, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 26.974166870117188, "rewards/margins": 11.247968673706055, "rewards/rejected": 15.726198196411133, "step": 480 }, { "epoch": 0.82, "grad_norm": 2.757871389389038, "learning_rate": 8.281984495225938e-06, "logits/chosen": -1.810275673866272, "logits/rejected": -1.757764220237732, "logps/chosen": -85.5127182006836, "logps/rejected": -223.26803588867188, "loss": 0.8267, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 26.434656143188477, "rewards/margins": 11.342155456542969, "rewards/rejected": 15.092500686645508, "step": 490 }, { "epoch": 0.83, "grad_norm": 23.952486038208008, "learning_rate": 8.215537495106781e-06, "logits/chosen": -2.0263965129852295, "logits/rejected": -1.9776378870010376, "logps/chosen": -78.3379898071289, "logps/rejected": -242.8697967529297, "loss": 0.655, "rewards/accuracies": 0.9375, "rewards/chosen": 28.698970794677734, "rewards/margins": 14.983367919921875, "rewards/rejected": 13.715606689453125, "step": 500 }, { "epoch": 0.85, "grad_norm": 27.391632080078125, "learning_rate": 8.148107628689736e-06, "logits/chosen": -1.8597463369369507, "logits/rejected": -1.8375170230865479, "logps/chosen": -97.95671844482422, "logps/rejected": -241.65005493164062, "loss": 0.8206, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 25.743389129638672, "rewards/margins": 13.008593559265137, "rewards/rejected": 12.734795570373535, "step": 510 }, { "epoch": 0.87, "grad_norm": 32.186092376708984, "learning_rate": 8.07971550669487e-06, "logits/chosen": -1.7367337942123413, "logits/rejected": -1.6837384700775146, "logps/chosen": -97.98736572265625, "logps/rejected": -236.2861785888672, "loss": 0.7388, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 25.235164642333984, "rewards/margins": 12.560583114624023, "rewards/rejected": 12.674580574035645, "step": 520 }, { "epoch": 0.88, "grad_norm": 2.0279977321624756, "learning_rate": 8.01038203396682e-06, "logits/chosen": -1.7110230922698975, "logits/rejected": -1.6617202758789062, "logps/chosen": -86.60490417480469, "logps/rejected": -218.7451171875, "loss": 0.7764, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 27.18789291381836, "rewards/margins": 12.605551719665527, "rewards/rejected": 14.582344055175781, "step": 530 }, { "epoch": 0.9, "grad_norm": 4.693509101867676, "learning_rate": 7.940128403084979e-06, "logits/chosen": -1.803138017654419, "logits/rejected": -1.7637073993682861, "logps/chosen": -82.79511260986328, "logps/rejected": -221.371826171875, "loss": 0.569, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 28.16043472290039, "rewards/margins": 13.802328109741211, "rewards/rejected": 14.358105659484863, "step": 540 }, { "epoch": 0.92, "grad_norm": 27.00885772705078, "learning_rate": 7.868976087885741e-06, "logits/chosen": -1.769118309020996, "logits/rejected": -1.7129123210906982, "logps/chosen": -83.30874633789062, "logps/rejected": -247.39993286132812, "loss": 0.8273, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 25.751117706298828, "rewards/margins": 12.938650131225586, "rewards/rejected": 12.812464714050293, "step": 550 }, { "epoch": 0.93, "grad_norm": 36.580535888671875, "learning_rate": 7.796946836898781e-06, "logits/chosen": -1.8945682048797607, "logits/rejected": -1.8643999099731445, "logps/chosen": -83.80934143066406, "logps/rejected": -228.9951629638672, "loss": 0.9133, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 25.17868995666504, "rewards/margins": 11.825202941894531, "rewards/rejected": 13.353485107421875, "step": 560 }, { "epoch": 0.95, "grad_norm": 53.96921920776367, "learning_rate": 7.724062666699359e-06, "logits/chosen": -1.8547554016113281, "logits/rejected": -1.8166589736938477, "logps/chosen": -96.59284973144531, "logps/rejected": -250.556640625, "loss": 0.6751, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 25.961719512939453, "rewards/margins": 14.7571382522583, "rewards/rejected": 11.204582214355469, "step": 570 }, { "epoch": 0.97, "grad_norm": 21.306289672851562, "learning_rate": 7.650345855178695e-06, "logits/chosen": -1.9091202020645142, "logits/rejected": -1.8824758529663086, "logps/chosen": -85.81441497802734, "logps/rejected": -241.8865509033203, "loss": 0.6111, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 25.982864379882812, "rewards/margins": 13.588445663452148, "rewards/rejected": 12.394417762756348, "step": 580 }, { "epoch": 0.98, "grad_norm": 8.490675926208496, "learning_rate": 7.575818934734482e-06, "logits/chosen": -1.7473344802856445, "logits/rejected": -1.7193949222564697, "logps/chosen": -83.74988555908203, "logps/rejected": -207.8963165283203, "loss": 0.5498, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 26.92534828186035, "rewards/margins": 12.920440673828125, "rewards/rejected": 14.004905700683594, "step": 590 }, { "epoch": 1.0, "grad_norm": 3.250255584716797, "learning_rate": 7.500504685383589e-06, "logits/chosen": -1.8129802942276, "logits/rejected": -1.748167634010315, "logps/chosen": -73.27245330810547, "logps/rejected": -218.0284423828125, "loss": 0.5127, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 27.97617530822754, "rewards/margins": 13.350275993347168, "rewards/rejected": 14.625898361206055, "step": 600 }, { "epoch": 1.0, "eval_logits/chosen": -0.8561357855796814, "eval_logits/rejected": -0.8501406908035278, "eval_logps/chosen": -157.18006896972656, "eval_logps/rejected": -196.33729553222656, "eval_loss": 2.5238630771636963, "eval_rewards/accuracies": 0.6000000238418579, "eval_rewards/chosen": 15.406268119812012, "eval_rewards/margins": 1.4132167100906372, "eval_rewards/rejected": 13.993051528930664, "eval_runtime": 959.2657, "eval_samples_per_second": 0.735, "eval_steps_per_second": 0.735, "step": 600 }, { "epoch": 1.02, "grad_norm": 2.0936248302459717, "learning_rate": 7.4244261277990935e-06, "logits/chosen": -1.645000696182251, "logits/rejected": -1.5917456150054932, "logps/chosen": -73.57734680175781, "logps/rejected": -235.9907684326172, "loss": 0.4526, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 28.39139747619629, "rewards/margins": 15.543986320495605, "rewards/rejected": 12.847410202026367, "step": 610 }, { "epoch": 1.03, "grad_norm": 3.0842278003692627, "learning_rate": 7.347606516273741e-06, "logits/chosen": -1.7528049945831299, "logits/rejected": -1.7014877796173096, "logps/chosen": -78.36769104003906, "logps/rejected": -245.0220947265625, "loss": 0.463, "rewards/accuracies": 1.0, "rewards/chosen": 28.718791961669922, "rewards/margins": 15.73725700378418, "rewards/rejected": 12.981534004211426, "step": 620 }, { "epoch": 1.05, "grad_norm": 3.525000810623169, "learning_rate": 7.270069331612e-06, "logits/chosen": -1.6429353952407837, "logits/rejected": -1.5838866233825684, "logps/chosen": -79.90181732177734, "logps/rejected": -221.4831085205078, "loss": 0.4782, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 26.645065307617188, "rewards/margins": 13.249574661254883, "rewards/rejected": 13.395492553710938, "step": 630 }, { "epoch": 1.07, "grad_norm": 24.071266174316406, "learning_rate": 7.1918382739528804e-06, "logits/chosen": -1.838573694229126, "logits/rejected": -1.77816903591156, "logps/chosen": -75.74369812011719, "logps/rejected": -250.9163055419922, "loss": 0.4283, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 27.663928985595703, "rewards/margins": 14.643702507019043, "rewards/rejected": 13.020225524902344, "step": 640 }, { "epoch": 1.08, "grad_norm": 2.601551055908203, "learning_rate": 7.112937255525722e-06, "logits/chosen": -1.651479721069336, "logits/rejected": -1.5933506488800049, "logps/chosen": -69.61723327636719, "logps/rejected": -234.6320037841797, "loss": 0.4144, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 28.57497215270996, "rewards/margins": 15.81470775604248, "rewards/rejected": 12.760263442993164, "step": 650 }, { "epoch": 1.1, "grad_norm": 2.3393607139587402, "learning_rate": 7.033390393341133e-06, "logits/chosen": -1.9843261241912842, "logits/rejected": -1.8986890316009521, "logps/chosen": -56.452239990234375, "logps/rejected": -254.9853057861328, "loss": 0.338, "rewards/accuracies": 1.0, "rewards/chosen": 28.793537139892578, "rewards/margins": 16.600866317749023, "rewards/rejected": 12.192671775817871, "step": 660 }, { "epoch": 1.12, "grad_norm": 23.764225006103516, "learning_rate": 6.953222001819347e-06, "logits/chosen": -1.808882474899292, "logits/rejected": -1.7674134969711304, "logps/chosen": -70.89783477783203, "logps/rejected": -221.87173461914062, "loss": 0.4405, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 29.50577163696289, "rewards/margins": 16.218791961669922, "rewards/rejected": 13.286974906921387, "step": 670 }, { "epoch": 1.13, "grad_norm": 5.011995315551758, "learning_rate": 6.87245658535825e-06, "logits/chosen": -1.7610832452774048, "logits/rejected": -1.7023169994354248, "logps/chosen": -64.40223693847656, "logps/rejected": -239.07498168945312, "loss": 0.3924, "rewards/accuracies": 1.0, "rewards/chosen": 28.532089233398438, "rewards/margins": 15.520894050598145, "rewards/rejected": 13.011195182800293, "step": 680 }, { "epoch": 1.15, "grad_norm": 3.537167549133301, "learning_rate": 6.791118830843311e-06, "logits/chosen": -1.6986154317855835, "logits/rejected": -1.6274213790893555, "logps/chosen": -73.15770721435547, "logps/rejected": -220.58633422851562, "loss": 0.5515, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 27.489849090576172, "rewards/margins": 13.804224967956543, "rewards/rejected": 13.68562126159668, "step": 690 }, { "epoch": 1.17, "grad_norm": 7.608737945556641, "learning_rate": 6.709233600101761e-06, "logits/chosen": -1.7815144062042236, "logits/rejected": -1.7079941034317017, "logps/chosen": -64.2397689819336, "logps/rejected": -222.42898559570312, "loss": 0.374, "rewards/accuracies": 1.0, "rewards/chosen": 29.80181884765625, "rewards/margins": 15.93518352508545, "rewards/rejected": 13.8666353225708, "step": 700 }, { "epoch": 1.18, "grad_norm": 7.344120025634766, "learning_rate": 6.626825922303287e-06, "logits/chosen": -1.482596755027771, "logits/rejected": -1.398471713066101, "logps/chosen": -69.20109558105469, "logps/rejected": -227.3030548095703, "loss": 0.4768, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 26.281591415405273, "rewards/margins": 13.225834846496582, "rewards/rejected": 13.055760383605957, "step": 710 }, { "epoch": 1.2, "grad_norm": 2.493931293487549, "learning_rate": 6.5439209863095675e-06, "logits/chosen": -1.86245596408844, "logits/rejected": -1.805001974105835, "logps/chosen": -58.1787223815918, "logps/rejected": -235.18923950195312, "loss": 0.3453, "rewards/accuracies": 1.0, "rewards/chosen": 28.87502098083496, "rewards/margins": 16.159029006958008, "rewards/rejected": 12.715993881225586, "step": 720 }, { "epoch": 1.22, "grad_norm": 5.012514114379883, "learning_rate": 6.460544132975014e-06, "logits/chosen": -1.6647535562515259, "logits/rejected": -1.6346346139907837, "logps/chosen": -67.99676513671875, "logps/rejected": -234.99331665039062, "loss": 0.4353, "rewards/accuracies": 1.0, "rewards/chosen": 27.812246322631836, "rewards/margins": 16.04288673400879, "rewards/rejected": 11.76936149597168, "step": 730 }, { "epoch": 1.23, "grad_norm": 25.745906829833984, "learning_rate": 6.376720847401042e-06, "logits/chosen": -1.816080093383789, "logits/rejected": -1.754172921180725, "logps/chosen": -61.254425048828125, "logps/rejected": -244.97998046875, "loss": 0.3916, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 28.01883888244629, "rewards/margins": 16.0445499420166, "rewards/rejected": 11.974291801452637, "step": 740 }, { "epoch": 1.25, "grad_norm": 3.14039945602417, "learning_rate": 6.292476751146255e-06, "logits/chosen": -1.8254092931747437, "logits/rejected": -1.7369928359985352, "logps/chosen": -59.7559814453125, "logps/rejected": -227.97384643554688, "loss": 0.41, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 29.0753173828125, "rewards/margins": 16.199630737304688, "rewards/rejected": 12.875683784484863, "step": 750 }, { "epoch": 1.27, "grad_norm": 4.181332588195801, "learning_rate": 6.207837594394913e-06, "logits/chosen": -1.819637656211853, "logits/rejected": -1.7874256372451782, "logps/chosen": -60.003814697265625, "logps/rejected": -217.7976531982422, "loss": 0.3721, "rewards/accuracies": 1.0, "rewards/chosen": 28.516149520874023, "rewards/margins": 15.223426818847656, "rewards/rejected": 13.292726516723633, "step": 760 }, { "epoch": 1.28, "grad_norm": 11.886590003967285, "learning_rate": 6.1228292480861e-06, "logits/chosen": -1.7983691692352295, "logits/rejected": -1.707035779953003, "logps/chosen": -70.19294738769531, "logps/rejected": -250.7736358642578, "loss": 0.5082, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 28.486637115478516, "rewards/margins": 16.149328231811523, "rewards/rejected": 12.337307929992676, "step": 770 }, { "epoch": 1.3, "grad_norm": 2.3090531826019287, "learning_rate": 6.037477696005966e-06, "logits/chosen": -1.7597004175186157, "logits/rejected": -1.6807353496551514, "logps/chosen": -61.509925842285156, "logps/rejected": -227.34652709960938, "loss": 0.4032, "rewards/accuracies": 1.0, "rewards/chosen": 29.05527687072754, "rewards/margins": 16.505048751831055, "rewards/rejected": 12.550226211547852, "step": 780 }, { "epoch": 1.32, "grad_norm": 2.6496028900146484, "learning_rate": 5.95180902684548e-06, "logits/chosen": -1.6354789733886719, "logits/rejected": -1.545203447341919, "logps/chosen": -61.32403564453125, "logps/rejected": -239.3839874267578, "loss": 0.396, "rewards/accuracies": 1.0, "rewards/chosen": 28.665252685546875, "rewards/margins": 16.576025009155273, "rewards/rejected": 12.08923053741455, "step": 790 }, { "epoch": 1.33, "grad_norm": 2.990304946899414, "learning_rate": 5.8658494262261215e-06, "logits/chosen": -1.8462188243865967, "logits/rejected": -1.7873852252960205, "logps/chosen": -64.35298919677734, "logps/rejected": -239.20401000976562, "loss": 0.3787, "rewards/accuracies": 1.0, "rewards/chosen": 28.686588287353516, "rewards/margins": 15.4246826171875, "rewards/rejected": 13.2619047164917, "step": 800 }, { "epoch": 1.33, "eval_logits/chosen": -0.8357726335525513, "eval_logits/rejected": -0.8384636044502258, "eval_logps/chosen": -158.54798889160156, "eval_logps/rejected": -197.1554718017578, "eval_loss": 2.5951449871063232, "eval_rewards/accuracies": 0.6141843795776367, "eval_rewards/chosen": 15.269478797912598, "eval_rewards/margins": 1.3582426309585571, "eval_rewards/rejected": 13.911234855651855, "eval_runtime": 933.4217, "eval_samples_per_second": 0.755, "eval_steps_per_second": 0.755, "step": 800 }, { "epoch": 1.35, "grad_norm": 29.393373489379883, "learning_rate": 5.779625168695943e-06, "logits/chosen": -1.8400392532348633, "logits/rejected": -1.7939599752426147, "logps/chosen": -53.9841423034668, "logps/rejected": -231.19735717773438, "loss": 0.3973, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 29.610637664794922, "rewards/margins": 16.809568405151367, "rewards/rejected": 12.801069259643555, "step": 810 }, { "epoch": 1.37, "grad_norm": 3.947079658508301, "learning_rate": 5.6931626096984475e-06, "logits/chosen": -1.7477422952651978, "logits/rejected": -1.700312614440918, "logps/chosen": -60.37701416015625, "logps/rejected": -254.6928253173828, "loss": 0.37, "rewards/accuracies": 1.0, "rewards/chosen": 27.82967185974121, "rewards/margins": 15.517053604125977, "rewards/rejected": 12.31261920928955, "step": 820 }, { "epoch": 1.38, "grad_norm": 19.618087768554688, "learning_rate": 5.6064881775167445e-06, "logits/chosen": -1.7533241510391235, "logits/rejected": -1.6893295049667358, "logps/chosen": -61.474517822265625, "logps/rejected": -223.3421173095703, "loss": 0.4478, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 27.814483642578125, "rewards/margins": 14.598594665527344, "rewards/rejected": 13.215890884399414, "step": 830 }, { "epoch": 1.4, "grad_norm": 3.5263216495513916, "learning_rate": 5.5196283651954375e-06, "logits/chosen": -1.5104930400848389, "logits/rejected": -1.413916826248169, "logps/chosen": -70.25590515136719, "logps/rejected": -236.75833129882812, "loss": 0.4924, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 28.405406951904297, "rewards/margins": 15.316106796264648, "rewards/rejected": 13.089299201965332, "step": 840 }, { "epoch": 1.42, "grad_norm": 3.5469884872436523, "learning_rate": 5.432609722442715e-06, "logits/chosen": -1.7516591548919678, "logits/rejected": -1.6808538436889648, "logps/chosen": -69.64612579345703, "logps/rejected": -237.8245086669922, "loss": 0.4107, "rewards/accuracies": 1.0, "rewards/chosen": 28.481128692626953, "rewards/margins": 16.653011322021484, "rewards/rejected": 11.828117370605469, "step": 850 }, { "epoch": 1.43, "grad_norm": 4.012500286102295, "learning_rate": 5.345458847515133e-06, "logits/chosen": -1.3920238018035889, "logits/rejected": -1.284397840499878, "logps/chosen": -67.01615905761719, "logps/rejected": -207.93798828125, "loss": 0.4615, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 27.689502716064453, "rewards/margins": 13.398959159851074, "rewards/rejected": 14.290542602539062, "step": 860 }, { "epoch": 1.45, "grad_norm": 2.9533350467681885, "learning_rate": 5.258202379087537e-06, "logits/chosen": -1.7657880783081055, "logits/rejected": -1.693655014038086, "logps/chosen": -64.3235092163086, "logps/rejected": -251.1502227783203, "loss": 0.4456, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 28.66314125061035, "rewards/margins": 16.5991268157959, "rewards/rejected": 12.06401252746582, "step": 870 }, { "epoch": 1.47, "grad_norm": 3.6378014087677, "learning_rate": 5.170866988110656e-06, "logits/chosen": -1.7154031991958618, "logits/rejected": -1.657488226890564, "logps/chosen": -66.95178985595703, "logps/rejected": -236.646240234375, "loss": 0.4076, "rewards/accuracies": 1.0, "rewards/chosen": 28.9368839263916, "rewards/margins": 16.125226974487305, "rewards/rejected": 12.811657905578613, "step": 880 }, { "epoch": 1.48, "grad_norm": 2.8945438861846924, "learning_rate": 5.083479369658807e-06, "logits/chosen": -1.696937918663025, "logits/rejected": -1.6348483562469482, "logps/chosen": -58.26952362060547, "logps/rejected": -230.33377075195312, "loss": 0.3785, "rewards/accuracies": 1.0, "rewards/chosen": 28.738727569580078, "rewards/margins": 15.88463306427002, "rewards/rejected": 12.854090690612793, "step": 890 }, { "epoch": 1.5, "grad_norm": 2.305314302444458, "learning_rate": 4.9960662347702405e-06, "logits/chosen": -1.5210245847702026, "logits/rejected": -1.4456034898757935, "logps/chosen": -54.75811004638672, "logps/rejected": -232.2668914794922, "loss": 0.3659, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 29.211429595947266, "rewards/margins": 16.932321548461914, "rewards/rejected": 12.279109954833984, "step": 900 }, { "epoch": 1.52, "grad_norm": 30.96356964111328, "learning_rate": 4.908654302282602e-06, "logits/chosen": -1.9021522998809814, "logits/rejected": -1.8719444274902344, "logps/chosen": -55.43334197998047, "logps/rejected": -241.4973602294922, "loss": 0.3925, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 29.54325294494629, "rewards/margins": 18.098033905029297, "rewards/rejected": 11.44521713256836, "step": 910 }, { "epoch": 1.53, "grad_norm": 2.172999858856201, "learning_rate": 4.821270290666007e-06, "logits/chosen": -1.9638334512710571, "logits/rejected": -1.933005928993225, "logps/chosen": -50.753273010253906, "logps/rejected": -264.568603515625, "loss": 0.3296, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 29.600269317626953, "rewards/margins": 18.301733016967773, "rewards/rejected": 11.29853630065918, "step": 920 }, { "epoch": 1.55, "grad_norm": 3.1367807388305664, "learning_rate": 4.733940909856239e-06, "logits/chosen": -1.8073303699493408, "logits/rejected": -1.769351601600647, "logps/chosen": -56.24708938598633, "logps/rejected": -268.60906982421875, "loss": 0.349, "rewards/accuracies": 1.0, "rewards/chosen": 29.238876342773438, "rewards/margins": 17.679536819458008, "rewards/rejected": 11.559335708618164, "step": 930 }, { "epoch": 1.57, "grad_norm": 2.4563372135162354, "learning_rate": 4.646692853090539e-06, "logits/chosen": -1.8005622625350952, "logits/rejected": -1.783666968345642, "logps/chosen": -58.6412467956543, "logps/rejected": -242.1209716796875, "loss": 0.3657, "rewards/accuracies": 1.0, "rewards/chosen": 29.608911514282227, "rewards/margins": 18.011024475097656, "rewards/rejected": 11.597890853881836, "step": 940 }, { "epoch": 1.58, "grad_norm": 2.5796990394592285, "learning_rate": 4.559552788748507e-06, "logits/chosen": -2.053856134414673, "logits/rejected": -2.0160202980041504, "logps/chosen": -49.09900665283203, "logps/rejected": -269.3837890625, "loss": 0.2841, "rewards/accuracies": 1.0, "rewards/chosen": 30.68400001525879, "rewards/margins": 19.868282318115234, "rewards/rejected": 10.815717697143555, "step": 950 }, { "epoch": 1.6, "grad_norm": 2.9965362548828125, "learning_rate": 4.472547352200615e-06, "logits/chosen": -1.8396308422088623, "logits/rejected": -1.8210567235946655, "logps/chosen": -52.045127868652344, "logps/rejected": -257.56011962890625, "loss": 0.3359, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 29.730632781982422, "rewards/margins": 18.14250373840332, "rewards/rejected": 11.58813190460205, "step": 960 }, { "epoch": 1.62, "grad_norm": 3.3091185092926025, "learning_rate": 4.385703137666784e-06, "logits/chosen": -1.7598381042480469, "logits/rejected": -1.7196592092514038, "logps/chosen": -55.36179733276367, "logps/rejected": -260.2227478027344, "loss": 0.3407, "rewards/accuracies": 1.0, "rewards/chosen": 30.468708038330078, "rewards/margins": 19.425325393676758, "rewards/rejected": 11.043380737304688, "step": 970 }, { "epoch": 1.63, "grad_norm": 1.9838672876358032, "learning_rate": 4.2990466900875625e-06, "logits/chosen": -1.8335222005844116, "logits/rejected": -1.776532769203186, "logps/chosen": -51.03464889526367, "logps/rejected": -237.20083618164062, "loss": 0.3608, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 29.984943389892578, "rewards/margins": 17.165090560913086, "rewards/rejected": 12.819851875305176, "step": 980 }, { "epoch": 1.65, "grad_norm": 3.0028090476989746, "learning_rate": 4.212604497010346e-06, "logits/chosen": -1.7095162868499756, "logits/rejected": -1.6648433208465576, "logps/chosen": -48.533897399902344, "logps/rejected": -234.913330078125, "loss": 0.3397, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 30.055179595947266, "rewards/margins": 17.712875366210938, "rewards/rejected": 12.342303276062012, "step": 990 }, { "epoch": 1.67, "grad_norm": 11.127403259277344, "learning_rate": 4.126402980493171e-06, "logits/chosen": -1.7350540161132812, "logits/rejected": -1.6723353862762451, "logps/chosen": -60.74663543701172, "logps/rejected": -234.6821746826172, "loss": 0.381, "rewards/accuracies": 1.0, "rewards/chosen": 29.135547637939453, "rewards/margins": 16.059070587158203, "rewards/rejected": 13.07647705078125, "step": 1000 }, { "epoch": 1.67, "eval_logits/chosen": -0.7807645797729492, "eval_logits/rejected": -0.7846214771270752, "eval_logps/chosen": -161.0571746826172, "eval_logps/rejected": -201.4547576904297, "eval_loss": 2.581395387649536, "eval_rewards/accuracies": 0.6212766170501709, "eval_rewards/chosen": 15.018560409545898, "eval_rewards/margins": 1.537255883216858, "eval_rewards/rejected": 13.481305122375488, "eval_runtime": 933.6733, "eval_samples_per_second": 0.755, "eval_steps_per_second": 0.755, "step": 1000 }, { "epoch": 1.68, "grad_norm": 2.986391305923462, "learning_rate": 4.0404684890284815e-06, "logits/chosen": -1.6968259811401367, "logits/rejected": -1.62332022190094, "logps/chosen": -54.682281494140625, "logps/rejected": -259.25335693359375, "loss": 0.3598, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 30.244094848632812, "rewards/margins": 18.490741729736328, "rewards/rejected": 11.753351211547852, "step": 1010 }, { "epoch": 1.7, "grad_norm": 3.1635079383850098, "learning_rate": 3.954827289489429e-06, "logits/chosen": -1.8833897113800049, "logits/rejected": -1.7493565082550049, "logps/chosen": -53.182655334472656, "logps/rejected": -240.314697265625, "loss": 0.3551, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 30.2075252532959, "rewards/margins": 17.21380615234375, "rewards/rejected": 12.993721008300781, "step": 1020 }, { "epoch": 1.72, "grad_norm": 2.4462807178497314, "learning_rate": 3.86950555910108e-06, "logits/chosen": -1.6473881006240845, "logits/rejected": -1.5638983249664307, "logps/chosen": -54.882896423339844, "logps/rejected": -230.82162475585938, "loss": 0.3772, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 29.62346839904785, "rewards/margins": 15.928651809692383, "rewards/rejected": 13.694814682006836, "step": 1030 }, { "epoch": 1.73, "grad_norm": 4.168172836303711, "learning_rate": 3.784529377439067e-06, "logits/chosen": -1.6475791931152344, "logits/rejected": -1.5899393558502197, "logps/chosen": -59.67030715942383, "logps/rejected": -221.40066528320312, "loss": 0.3826, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 29.023029327392578, "rewards/margins": 16.1686954498291, "rewards/rejected": 12.854331970214844, "step": 1040 }, { "epoch": 1.75, "grad_norm": 17.74908447265625, "learning_rate": 3.699924718458036e-06, "logits/chosen": -1.7890431880950928, "logits/rejected": -1.7369229793548584, "logps/chosen": -48.85768127441406, "logps/rejected": -251.3222198486328, "loss": 0.3095, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 30.195240020751953, "rewards/margins": 17.660083770751953, "rewards/rejected": 12.53515338897705, "step": 1050 }, { "epoch": 1.77, "grad_norm": 6.809030532836914, "learning_rate": 3.615717442552429e-06, "logits/chosen": -1.7820018529891968, "logits/rejected": -1.7489522695541382, "logps/chosen": -54.51831817626953, "logps/rejected": -246.6309356689453, "loss": 0.337, "rewards/accuracies": 1.0, "rewards/chosen": 29.5898380279541, "rewards/margins": 18.221317291259766, "rewards/rejected": 11.368522644042969, "step": 1060 }, { "epoch": 1.78, "grad_norm": 7.795753002166748, "learning_rate": 3.5319332886519393e-06, "logits/chosen": -1.6910899877548218, "logits/rejected": -1.6332054138183594, "logps/chosen": -46.748443603515625, "logps/rejected": -252.2515106201172, "loss": 0.3168, "rewards/accuracies": 1.0, "rewards/chosen": 29.91961669921875, "rewards/margins": 18.060075759887695, "rewards/rejected": 11.85954475402832, "step": 1070 }, { "epoch": 1.8, "grad_norm": 4.77955436706543, "learning_rate": 3.4485978663541233e-06, "logits/chosen": -1.4919878244400024, "logits/rejected": -1.4713367223739624, "logps/chosen": -66.22251892089844, "logps/rejected": -239.391357421875, "loss": 0.4289, "rewards/accuracies": 1.0, "rewards/chosen": 28.419330596923828, "rewards/margins": 17.399511337280273, "rewards/rejected": 11.019818305969238, "step": 1080 }, { "epoch": 1.82, "grad_norm": 2.38405179977417, "learning_rate": 3.3657366480965158e-06, "logits/chosen": -1.8986093997955322, "logits/rejected": -1.8109142780303955, "logps/chosen": -47.243350982666016, "logps/rejected": -263.37347412109375, "loss": 0.2852, "rewards/accuracies": 1.0, "rewards/chosen": 30.1839656829834, "rewards/margins": 18.492839813232422, "rewards/rejected": 11.691123962402344, "step": 1090 }, { "epoch": 1.83, "grad_norm": 5.4925994873046875, "learning_rate": 3.2833749613706988e-06, "logits/chosen": -1.583234429359436, "logits/rejected": -1.497194766998291, "logps/chosen": -58.58795166015625, "logps/rejected": -240.7587432861328, "loss": 0.4343, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 27.720067977905273, "rewards/margins": 14.912628173828125, "rewards/rejected": 12.807439804077148, "step": 1100 }, { "epoch": 1.85, "grad_norm": 3.2018399238586426, "learning_rate": 3.201537980980646e-06, "logits/chosen": -1.3224772214889526, "logits/rejected": -1.2666256427764893, "logps/chosen": -54.696876525878906, "logps/rejected": -243.18539428710938, "loss": 0.3813, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 28.097915649414062, "rewards/margins": 16.58795738220215, "rewards/rejected": 11.50995922088623, "step": 1110 }, { "epoch": 1.87, "grad_norm": 4.00109338760376, "learning_rate": 3.1202507213477658e-06, "logits/chosen": -1.6143379211425781, "logits/rejected": -1.5576748847961426, "logps/chosen": -56.855003356933594, "logps/rejected": -233.1626739501953, "loss": 0.3651, "rewards/accuracies": 1.0, "rewards/chosen": 30.580801010131836, "rewards/margins": 18.555524826049805, "rewards/rejected": 12.025275230407715, "step": 1120 }, { "epoch": 1.88, "grad_norm": 5.0345635414123535, "learning_rate": 3.039538028864939e-06, "logits/chosen": -1.6249618530273438, "logits/rejected": -1.6053358316421509, "logps/chosen": -57.446815490722656, "logps/rejected": -246.0026092529297, "loss": 0.3495, "rewards/accuracies": 1.0, "rewards/chosen": 29.101943969726562, "rewards/margins": 17.34486198425293, "rewards/rejected": 11.757081985473633, "step": 1130 }, { "epoch": 1.9, "grad_norm": 2.065876007080078, "learning_rate": 2.9594245743019477e-06, "logits/chosen": -1.8201977014541626, "logits/rejected": -1.8249000310897827, "logps/chosen": -52.830360412597656, "logps/rejected": -243.8861083984375, "loss": 0.3389, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 31.09743881225586, "rewards/margins": 18.6008358001709, "rewards/rejected": 12.496602058410645, "step": 1140 }, { "epoch": 1.92, "grad_norm": 4.4573588371276855, "learning_rate": 2.8799348452645515e-06, "logits/chosen": -1.7230621576309204, "logits/rejected": -1.6711606979370117, "logps/chosen": -52.5847282409668, "logps/rejected": -234.39968872070312, "loss": 0.3286, "rewards/accuracies": 1.0, "rewards/chosen": 30.327728271484375, "rewards/margins": 18.147693634033203, "rewards/rejected": 12.180032730102539, "step": 1150 }, { "epoch": 1.93, "grad_norm": 14.367749214172363, "learning_rate": 2.801093138709582e-06, "logits/chosen": -1.7738536596298218, "logits/rejected": -1.687819480895996, "logps/chosen": -49.588436126708984, "logps/rejected": -238.64501953125, "loss": 0.3485, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 29.91376304626465, "rewards/margins": 17.238935470581055, "rewards/rejected": 12.674827575683594, "step": 1160 }, { "epoch": 1.95, "grad_norm": 9.441333770751953, "learning_rate": 2.722923553518285e-06, "logits/chosen": -1.5205357074737549, "logits/rejected": -1.480017900466919, "logps/chosen": -57.287017822265625, "logps/rejected": -243.42919921875, "loss": 0.3522, "rewards/accuracies": 1.0, "rewards/chosen": 29.02022933959961, "rewards/margins": 17.072509765625, "rewards/rejected": 11.947721481323242, "step": 1170 }, { "epoch": 1.97, "grad_norm": 12.407715797424316, "learning_rate": 2.6454499831302223e-06, "logits/chosen": -1.6342418193817139, "logits/rejected": -1.5738509893417358, "logps/chosen": -60.34296798706055, "logps/rejected": -226.05990600585938, "loss": 0.5105, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 30.577117919921875, "rewards/margins": 17.6352481842041, "rewards/rejected": 12.941868782043457, "step": 1180 }, { "epoch": 1.98, "grad_norm": 3.367372512817383, "learning_rate": 2.5686961082399716e-06, "logits/chosen": -1.8151267766952515, "logits/rejected": -1.7858302593231201, "logps/chosen": -57.720130920410156, "logps/rejected": -260.25494384765625, "loss": 0.3546, "rewards/accuracies": 1.0, "rewards/chosen": 29.963693618774414, "rewards/margins": 18.514841079711914, "rewards/rejected": 11.44885540008545, "step": 1190 }, { "epoch": 2.0, "grad_norm": 3.254979133605957, "learning_rate": 2.4926853895588343e-06, "logits/chosen": -1.7511732578277588, "logits/rejected": -1.6836206912994385, "logps/chosen": -41.321022033691406, "logps/rejected": -248.452880859375, "loss": 0.2993, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 31.503093719482422, "rewards/margins": 19.927473068237305, "rewards/rejected": 11.575621604919434, "step": 1200 }, { "epoch": 2.0, "eval_logits/chosen": -0.7554205060005188, "eval_logits/rejected": -0.758985161781311, "eval_logps/chosen": -160.93551635742188, "eval_logps/rejected": -202.3504638671875, "eval_loss": 2.5816383361816406, "eval_rewards/accuracies": 0.6382978558540344, "eval_rewards/chosen": 15.030726432800293, "eval_rewards/margins": 1.6389882564544678, "eval_rewards/rejected": 13.39173698425293, "eval_runtime": 930.5524, "eval_samples_per_second": 0.758, "eval_steps_per_second": 0.758, "step": 1200 }, { "epoch": 2.02, "grad_norm": 2.5797388553619385, "learning_rate": 2.417441060643809e-06, "logits/chosen": -1.783384084701538, "logits/rejected": -1.7092987298965454, "logps/chosen": -39.536407470703125, "logps/rejected": -250.8395538330078, "loss": 0.2602, "rewards/accuracies": 1.0, "rewards/chosen": 31.661413192749023, "rewards/margins": 19.599422454833984, "rewards/rejected": 12.061990737915039, "step": 1210 }, { "epoch": 2.03, "grad_norm": 3.2678422927856445, "learning_rate": 2.342986120795978e-06, "logits/chosen": -1.6378501653671265, "logits/rejected": -1.6011245250701904, "logps/chosen": -50.96418762207031, "logps/rejected": -246.3638153076172, "loss": 0.3088, "rewards/accuracies": 1.0, "rewards/chosen": 30.954126358032227, "rewards/margins": 19.514755249023438, "rewards/rejected": 11.439367294311523, "step": 1220 }, { "epoch": 2.05, "grad_norm": 2.6984269618988037, "learning_rate": 2.2693433280305127e-06, "logits/chosen": -1.7482858896255493, "logits/rejected": -1.7367308139801025, "logps/chosen": -43.29402160644531, "logps/rejected": -253.78939819335938, "loss": 0.2572, "rewards/accuracies": 1.0, "rewards/chosen": 31.882701873779297, "rewards/margins": 21.47805404663086, "rewards/rejected": 10.40464973449707, "step": 1230 }, { "epoch": 2.07, "grad_norm": 3.53029727935791, "learning_rate": 2.19653519212041e-06, "logits/chosen": -1.7963911294937134, "logits/rejected": -1.7613227367401123, "logps/chosen": -44.7977294921875, "logps/rejected": -243.6311798095703, "loss": 0.2862, "rewards/accuracies": 1.0, "rewards/chosen": 30.34331703186035, "rewards/margins": 19.52144432067871, "rewards/rejected": 10.821873664855957, "step": 1240 }, { "epoch": 2.08, "grad_norm": 1.513169288635254, "learning_rate": 2.124583967716136e-06, "logits/chosen": -2.0151898860931396, "logits/rejected": -1.9886600971221924, "logps/chosen": -38.372108459472656, "logps/rejected": -267.5242919921875, "loss": 0.2252, "rewards/accuracies": 1.0, "rewards/chosen": 31.756805419921875, "rewards/margins": 20.560382843017578, "rewards/rejected": 11.196426391601562, "step": 1250 }, { "epoch": 2.1, "grad_norm": 2.537264347076416, "learning_rate": 2.053511647543218e-06, "logits/chosen": -1.4479506015777588, "logits/rejected": -1.408527135848999, "logps/chosen": -49.19340896606445, "logps/rejected": -229.5871124267578, "loss": 0.3208, "rewards/accuracies": 1.0, "rewards/chosen": 29.4880428314209, "rewards/margins": 17.939146041870117, "rewards/rejected": 11.548896789550781, "step": 1260 }, { "epoch": 2.12, "grad_norm": 2.1548752784729004, "learning_rate": 1.98333995567992e-06, "logits/chosen": -1.5170295238494873, "logits/rejected": -1.4900450706481934, "logps/chosen": -50.18815231323242, "logps/rejected": -242.35986328125, "loss": 0.3042, "rewards/accuracies": 1.0, "rewards/chosen": 31.04024887084961, "rewards/margins": 19.42563247680664, "rewards/rejected": 11.614619255065918, "step": 1270 }, { "epoch": 2.13, "grad_norm": 2.6000428199768066, "learning_rate": 1.9140903409170276e-06, "logits/chosen": -1.59740149974823, "logits/rejected": -1.5533560514450073, "logps/chosen": -49.30552291870117, "logps/rejected": -255.7830810546875, "loss": 0.3064, "rewards/accuracies": 1.0, "rewards/chosen": 30.910289764404297, "rewards/margins": 19.63786506652832, "rewards/rejected": 11.272429466247559, "step": 1280 }, { "epoch": 2.15, "grad_norm": 2.6881704330444336, "learning_rate": 1.8457839702017783e-06, "logits/chosen": -1.788030982017517, "logits/rejected": -1.7674751281738281, "logps/chosen": -38.47369384765625, "logps/rejected": -241.8249053955078, "loss": 0.2368, "rewards/accuracies": 1.0, "rewards/chosen": 30.96920394897461, "rewards/margins": 19.61397933959961, "rewards/rejected": 11.355224609375, "step": 1290 }, { "epoch": 2.17, "grad_norm": 1.9455982446670532, "learning_rate": 1.7784417221679346e-06, "logits/chosen": -1.7658576965332031, "logits/rejected": -1.7436749935150146, "logps/chosen": -43.32396697998047, "logps/rejected": -260.0502624511719, "loss": 0.2658, "rewards/accuracies": 1.0, "rewards/chosen": 30.932819366455078, "rewards/margins": 20.97707176208496, "rewards/rejected": 9.955748558044434, "step": 1300 }, { "epoch": 2.18, "grad_norm": 2.7157328128814697, "learning_rate": 1.7120841807539867e-06, "logits/chosen": -1.5605119466781616, "logits/rejected": -1.533529281616211, "logps/chosen": -45.2177848815918, "logps/rejected": -252.9283905029297, "loss": 0.2779, "rewards/accuracies": 1.0, "rewards/chosen": 31.456167221069336, "rewards/margins": 20.617300033569336, "rewards/rejected": 10.8388671875, "step": 1310 }, { "epoch": 2.2, "grad_norm": 4.16888952255249, "learning_rate": 1.6467316289114365e-06, "logits/chosen": -1.7607736587524414, "logits/rejected": -1.72702157497406, "logps/chosen": -52.18560028076172, "logps/rejected": -237.4579620361328, "loss": 0.3116, "rewards/accuracies": 1.0, "rewards/chosen": 30.68082618713379, "rewards/margins": 19.22063446044922, "rewards/rejected": 11.460187911987305, "step": 1320 }, { "epoch": 2.22, "grad_norm": 3.37455153465271, "learning_rate": 1.5824040424050763e-06, "logits/chosen": -1.8833210468292236, "logits/rejected": -1.844347357749939, "logps/chosen": -40.77017593383789, "logps/rejected": -248.3719940185547, "loss": 0.2508, "rewards/accuracies": 1.0, "rewards/chosen": 32.0168342590332, "rewards/margins": 21.67666244506836, "rewards/rejected": 10.34017562866211, "step": 1330 }, { "epoch": 2.23, "grad_norm": 1.9589617252349854, "learning_rate": 1.5191210837071695e-06, "logits/chosen": -1.8483455181121826, "logits/rejected": -1.8569103479385376, "logps/chosen": -37.26815414428711, "logps/rejected": -267.3866882324219, "loss": 0.231, "rewards/accuracies": 1.0, "rewards/chosen": 32.11620330810547, "rewards/margins": 21.5185604095459, "rewards/rejected": 10.597644805908203, "step": 1340 }, { "epoch": 2.25, "grad_norm": 3.257993221282959, "learning_rate": 1.4569020959873809e-06, "logits/chosen": -1.786929726600647, "logits/rejected": -1.7171766757965088, "logps/chosen": -38.82440948486328, "logps/rejected": -267.8984375, "loss": 0.2323, "rewards/accuracies": 1.0, "rewards/chosen": 31.542959213256836, "rewards/margins": 20.44384002685547, "rewards/rejected": 11.099119186401367, "step": 1350 }, { "epoch": 2.27, "grad_norm": 2.4979026317596436, "learning_rate": 1.3957660972003167e-06, "logits/chosen": -1.6343905925750732, "logits/rejected": -1.6573156118392944, "logps/chosen": -45.56403732299805, "logps/rejected": -253.07199096679688, "loss": 0.2767, "rewards/accuracies": 1.0, "rewards/chosen": 30.549020767211914, "rewards/margins": 20.293630599975586, "rewards/rejected": 10.255391120910645, "step": 1360 }, { "epoch": 2.28, "grad_norm": 1.8096556663513184, "learning_rate": 1.3357317742724658e-06, "logits/chosen": -1.5966780185699463, "logits/rejected": -1.570854902267456, "logps/chosen": -48.08317184448242, "logps/rejected": -251.5933380126953, "loss": 0.2885, "rewards/accuracies": 1.0, "rewards/chosen": 30.559154510498047, "rewards/margins": 19.232830047607422, "rewards/rejected": 11.326326370239258, "step": 1370 }, { "epoch": 2.3, "grad_norm": 2.8283846378326416, "learning_rate": 1.2768174773903263e-06, "logits/chosen": -1.86501145362854, "logits/rejected": -1.822446584701538, "logps/chosen": -47.01121139526367, "logps/rejected": -263.478515625, "loss": 0.2764, "rewards/accuracies": 1.0, "rewards/chosen": 32.028045654296875, "rewards/margins": 20.432361602783203, "rewards/rejected": 11.595685958862305, "step": 1380 }, { "epoch": 2.32, "grad_norm": 2.776090621948242, "learning_rate": 1.2190412143914536e-06, "logits/chosen": -1.513856291770935, "logits/rejected": -1.46199631690979, "logps/chosen": -42.10026931762695, "logps/rejected": -247.57339477539062, "loss": 0.2768, "rewards/accuracies": 1.0, "rewards/chosen": 29.788198471069336, "rewards/margins": 19.206623077392578, "rewards/rejected": 10.58157730102539, "step": 1390 }, { "epoch": 2.33, "grad_norm": 2.4359893798828125, "learning_rate": 1.1624206452601623e-06, "logits/chosen": -1.7050933837890625, "logits/rejected": -1.6352756023406982, "logps/chosen": -48.3925666809082, "logps/rejected": -255.81640625, "loss": 0.2917, "rewards/accuracies": 1.0, "rewards/chosen": 30.2396240234375, "rewards/margins": 19.301578521728516, "rewards/rejected": 10.938047409057617, "step": 1400 }, { "epoch": 2.33, "eval_logits/chosen": -0.8292198181152344, "eval_logits/rejected": -0.8336656093597412, "eval_logps/chosen": -165.87318420410156, "eval_logps/rejected": -207.3828887939453, "eval_loss": 2.626953363418579, "eval_rewards/accuracies": 0.6425532102584839, "eval_rewards/chosen": 14.536958694458008, "eval_rewards/margins": 1.6484651565551758, "eval_rewards/rejected": 12.888493537902832, "eval_runtime": 931.7841, "eval_samples_per_second": 0.757, "eval_steps_per_second": 0.757, "step": 1400 }, { "epoch": 2.35, "grad_norm": 2.7294886112213135, "learning_rate": 1.1069730767295394e-06, "logits/chosen": -1.7735283374786377, "logits/rejected": -1.7543232440948486, "logps/chosen": -40.81293869018555, "logps/rejected": -259.8978576660156, "loss": 0.2463, "rewards/accuracies": 1.0, "rewards/chosen": 31.607662200927734, "rewards/margins": 20.695466995239258, "rewards/rejected": 10.912198066711426, "step": 1410 }, { "epoch": 2.37, "grad_norm": 2.734895944595337, "learning_rate": 1.0527154569914472e-06, "logits/chosen": -1.5312741994857788, "logits/rejected": -1.4923324584960938, "logps/chosen": -42.63439178466797, "logps/rejected": -244.82046508789062, "loss": 0.2697, "rewards/accuracies": 1.0, "rewards/chosen": 29.897241592407227, "rewards/margins": 18.947063446044922, "rewards/rejected": 10.950177192687988, "step": 1420 }, { "epoch": 2.38, "grad_norm": 2.473545789718628, "learning_rate": 9.996643705161125e-07, "logits/chosen": -1.6525815725326538, "logits/rejected": -1.6200284957885742, "logps/chosen": -46.02938461303711, "logps/rejected": -261.2445068359375, "loss": 0.2916, "rewards/accuracies": 1.0, "rewards/chosen": 29.908071517944336, "rewards/margins": 19.2839298248291, "rewards/rejected": 10.624141693115234, "step": 1430 }, { "epoch": 2.4, "grad_norm": 2.676964044570923, "learning_rate": 9.47836032982884e-07, "logits/chosen": -1.5494722127914429, "logits/rejected": -1.4936171770095825, "logps/chosen": -47.933658599853516, "logps/rejected": -252.27401733398438, "loss": 0.3175, "rewards/accuracies": 1.0, "rewards/chosen": 29.43538475036621, "rewards/margins": 18.781505584716797, "rewards/rejected": 10.653879165649414, "step": 1440 }, { "epoch": 2.42, "grad_norm": 2.895836591720581, "learning_rate": 8.972462863237341e-07, "logits/chosen": -1.6493561267852783, "logits/rejected": -1.5964713096618652, "logps/chosen": -44.10541915893555, "logps/rejected": -271.24859619140625, "loss": 0.2734, "rewards/accuracies": 1.0, "rewards/chosen": 29.411285400390625, "rewards/margins": 19.59650993347168, "rewards/rejected": 9.814775466918945, "step": 1450 }, { "epoch": 2.43, "grad_norm": 1.9935178756713867, "learning_rate": 8.479105938809701e-07, "logits/chosen": -1.485527515411377, "logits/rejected": -1.435868263244629, "logps/chosen": -50.575355529785156, "logps/rejected": -253.8799591064453, "loss": 0.3184, "rewards/accuracies": 1.0, "rewards/chosen": 29.476688385009766, "rewards/margins": 18.307661056518555, "rewards/rejected": 11.169027328491211, "step": 1460 }, { "epoch": 2.45, "grad_norm": 2.0916085243225098, "learning_rate": 7.998440356807075e-07, "logits/chosen": -1.8527294397354126, "logits/rejected": -1.8244997262954712, "logps/chosen": -34.15217208862305, "logps/rejected": -262.5540466308594, "loss": 0.2042, "rewards/accuracies": 1.0, "rewards/chosen": 31.451465606689453, "rewards/margins": 20.44220542907715, "rewards/rejected": 11.009258270263672, "step": 1470 }, { "epoch": 2.47, "grad_norm": 3.5937747955322266, "learning_rate": 7.530613038234646e-07, "logits/chosen": -1.5336072444915771, "logits/rejected": -1.4814928770065308, "logps/chosen": -42.957305908203125, "logps/rejected": -248.6365966796875, "loss": 0.2776, "rewards/accuracies": 1.0, "rewards/chosen": 31.6094970703125, "rewards/margins": 20.037199020385742, "rewards/rejected": 11.572298049926758, "step": 1480 }, { "epoch": 2.48, "grad_norm": 2.563070774078369, "learning_rate": 7.075766979933674e-07, "logits/chosen": -1.6885433197021484, "logits/rejected": -1.663726806640625, "logps/chosen": -35.838836669921875, "logps/rejected": -253.3392333984375, "loss": 0.2286, "rewards/accuracies": 1.0, "rewards/chosen": 32.139644622802734, "rewards/margins": 20.651351928710938, "rewards/rejected": 11.48829174041748, "step": 1490 }, { "epoch": 2.5, "grad_norm": 1.7424818277359009, "learning_rate": 6.634041210872743e-07, "logits/chosen": -1.8395694494247437, "logits/rejected": -1.8219817876815796, "logps/chosen": -43.62278366088867, "logps/rejected": -261.8132629394531, "loss": 0.259, "rewards/accuracies": 1.0, "rewards/chosen": 30.96283531188965, "rewards/margins": 20.133371353149414, "rewards/rejected": 10.829463005065918, "step": 1500 }, { "epoch": 2.52, "grad_norm": 3.5391411781311035, "learning_rate": 6.205570749652002e-07, "logits/chosen": -1.5959376096725464, "logits/rejected": -1.5644586086273193, "logps/chosen": -47.74095916748047, "logps/rejected": -254.2753448486328, "loss": 0.3061, "rewards/accuracies": 1.0, "rewards/chosen": 30.81204605102539, "rewards/margins": 19.811317443847656, "rewards/rejected": 11.00072956085205, "step": 1510 }, { "epoch": 2.53, "grad_norm": 2.486269235610962, "learning_rate": 5.790486563233145e-07, "logits/chosen": -1.8033177852630615, "logits/rejected": -1.7998552322387695, "logps/chosen": -35.27519226074219, "logps/rejected": -256.3056640625, "loss": 0.2171, "rewards/accuracies": 1.0, "rewards/chosen": 31.810821533203125, "rewards/margins": 20.776798248291016, "rewards/rejected": 11.034022331237793, "step": 1520 }, { "epoch": 2.55, "grad_norm": 2.615795612335205, "learning_rate": 5.388915526907862e-07, "logits/chosen": -1.7600593566894531, "logits/rejected": -1.7274726629257202, "logps/chosen": -38.91143035888672, "logps/rejected": -256.5948791503906, "loss": 0.2455, "rewards/accuracies": 1.0, "rewards/chosen": 30.60187339782715, "rewards/margins": 19.52530288696289, "rewards/rejected": 11.076571464538574, "step": 1530 }, { "epoch": 2.57, "grad_norm": 3.510915994644165, "learning_rate": 5.000980385516935e-07, "logits/chosen": -1.6004310846328735, "logits/rejected": -1.5690761804580688, "logps/chosen": -41.87531280517578, "logps/rejected": -251.2759246826172, "loss": 0.2711, "rewards/accuracies": 1.0, "rewards/chosen": 30.125137329101562, "rewards/margins": 18.800806045532227, "rewards/rejected": 11.32433032989502, "step": 1540 }, { "epoch": 2.58, "grad_norm": 3.1003832817077637, "learning_rate": 4.626799715931812e-07, "logits/chosen": -1.6454312801361084, "logits/rejected": -1.6356630325317383, "logps/chosen": -47.9179801940918, "logps/rejected": -259.88311767578125, "loss": 0.293, "rewards/accuracies": 1.0, "rewards/chosen": 30.139379501342773, "rewards/margins": 18.82354736328125, "rewards/rejected": 11.315831184387207, "step": 1550 }, { "epoch": 2.6, "grad_norm": 2.93013334274292, "learning_rate": 4.2664878908102556e-07, "logits/chosen": -1.88095223903656, "logits/rejected": -1.8393617868423462, "logps/chosen": -32.803287506103516, "logps/rejected": -261.6094055175781, "loss": 0.2059, "rewards/accuracies": 1.0, "rewards/chosen": 31.75473976135254, "rewards/margins": 20.777175903320312, "rewards/rejected": 10.977563858032227, "step": 1560 }, { "epoch": 2.62, "grad_norm": 2.920027017593384, "learning_rate": 3.9201550436370026e-07, "logits/chosen": -1.7541553974151611, "logits/rejected": -1.7304025888442993, "logps/chosen": -40.56893539428711, "logps/rejected": -264.2108459472656, "loss": 0.2521, "rewards/accuracies": 1.0, "rewards/chosen": 31.008968353271484, "rewards/margins": 20.572416305541992, "rewards/rejected": 10.436552047729492, "step": 1570 }, { "epoch": 2.63, "grad_norm": 2.7568492889404297, "learning_rate": 3.587907035060195e-07, "logits/chosen": -1.5946696996688843, "logits/rejected": -1.577462911605835, "logps/chosen": -43.67282485961914, "logps/rejected": -259.1739807128906, "loss": 0.2793, "rewards/accuracies": 1.0, "rewards/chosen": 29.5866641998291, "rewards/margins": 18.64238166809082, "rewards/rejected": 10.944284439086914, "step": 1580 }, { "epoch": 2.65, "grad_norm": 2.2731993198394775, "learning_rate": 3.269845420533824e-07, "logits/chosen": -1.7446489334106445, "logits/rejected": -1.701433777809143, "logps/chosen": -37.443397521972656, "logps/rejected": -254.092529296875, "loss": 0.279, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 31.358057022094727, "rewards/margins": 19.601125717163086, "rewards/rejected": 11.756929397583008, "step": 1590 }, { "epoch": 2.67, "grad_norm": 3.0889647006988525, "learning_rate": 2.9660674192761753e-07, "logits/chosen": -1.5739749670028687, "logits/rejected": -1.5466270446777344, "logps/chosen": -46.05111312866211, "logps/rejected": -245.4317626953125, "loss": 0.2881, "rewards/accuracies": 1.0, "rewards/chosen": 29.035449981689453, "rewards/margins": 18.32761573791504, "rewards/rejected": 10.707832336425781, "step": 1600 }, { "epoch": 2.67, "eval_logits/chosen": -0.8468231558799744, "eval_logits/rejected": -0.8503096103668213, "eval_logps/chosen": -167.3941192626953, "eval_logps/rejected": -209.29457092285156, "eval_loss": 2.6357526779174805, "eval_rewards/accuracies": 0.6468085050582886, "eval_rewards/chosen": 14.38486385345459, "eval_rewards/margins": 1.6875391006469727, "eval_rewards/rejected": 12.697324752807617, "eval_runtime": 934.2741, "eval_samples_per_second": 0.755, "eval_steps_per_second": 0.755, "step": 1600 }, { "epoch": 2.68, "grad_norm": 3.4763405323028564, "learning_rate": 2.676665884553559e-07, "logits/chosen": -1.6974531412124634, "logits/rejected": -1.6716630458831787, "logps/chosen": -45.91952133178711, "logps/rejected": -240.70156860351562, "loss": 0.288, "rewards/accuracies": 1.0, "rewards/chosen": 30.833568572998047, "rewards/margins": 19.545482635498047, "rewards/rejected": 11.288084030151367, "step": 1610 }, { "epoch": 2.7, "grad_norm": 2.333751916885376, "learning_rate": 2.401729275298753e-07, "logits/chosen": -1.663021445274353, "logits/rejected": -1.6606937646865845, "logps/chosen": -41.728370666503906, "logps/rejected": -257.09600830078125, "loss": 0.2824, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 31.06747817993164, "rewards/margins": 21.039501190185547, "rewards/rejected": 10.02797794342041, "step": 1620 }, { "epoch": 2.72, "grad_norm": 3.9069831371307373, "learning_rate": 2.1413416290723966e-07, "logits/chosen": -1.7814449071884155, "logits/rejected": -1.7398021221160889, "logps/chosen": -39.289886474609375, "logps/rejected": -252.5143585205078, "loss": 0.2494, "rewards/accuracies": 1.0, "rewards/chosen": 31.424646377563477, "rewards/margins": 20.74696922302246, "rewards/rejected": 10.677675247192383, "step": 1630 }, { "epoch": 2.73, "grad_norm": 2.8962225914001465, "learning_rate": 1.8955825363760172e-07, "logits/chosen": -1.6453288793563843, "logits/rejected": -1.6407811641693115, "logps/chosen": -48.07708740234375, "logps/rejected": -257.2157287597656, "loss": 0.2834, "rewards/accuracies": 1.0, "rewards/chosen": 30.098657608032227, "rewards/margins": 19.977779388427734, "rewards/rejected": 10.120881080627441, "step": 1640 }, { "epoch": 2.75, "grad_norm": 3.3172779083251953, "learning_rate": 1.6645271163242106e-07, "logits/chosen": -1.6090644598007202, "logits/rejected": -1.5861512422561646, "logps/chosen": -46.873756408691406, "logps/rejected": -253.38778686523438, "loss": 0.2934, "rewards/accuracies": 1.0, "rewards/chosen": 29.051366806030273, "rewards/margins": 18.360876083374023, "rewards/rejected": 10.6904935836792, "step": 1650 }, { "epoch": 2.77, "grad_norm": 3.810804605484009, "learning_rate": 1.448245993683639e-07, "logits/chosen": -1.6912416219711304, "logits/rejected": -1.6398060321807861, "logps/chosen": -42.75129318237305, "logps/rejected": -255.9859619140625, "loss": 0.2546, "rewards/accuracies": 1.0, "rewards/chosen": 31.15047836303711, "rewards/margins": 19.870410919189453, "rewards/rejected": 11.280068397521973, "step": 1660 }, { "epoch": 2.78, "grad_norm": 2.5805416107177734, "learning_rate": 1.2468052772857786e-07, "logits/chosen": -1.6538400650024414, "logits/rejected": -1.6332565546035767, "logps/chosen": -39.50425338745117, "logps/rejected": -243.5617218017578, "loss": 0.2568, "rewards/accuracies": 1.0, "rewards/chosen": 29.246089935302734, "rewards/margins": 18.635494232177734, "rewards/rejected": 10.610593795776367, "step": 1670 }, { "epoch": 2.8, "grad_norm": 3.1274917125701904, "learning_rate": 1.060266539819932e-07, "logits/chosen": -1.642760992050171, "logits/rejected": -1.6645612716674805, "logps/chosen": -52.89581298828125, "logps/rejected": -240.6681671142578, "loss": 0.3194, "rewards/accuracies": 1.0, "rewards/chosen": 29.198993682861328, "rewards/margins": 18.33091926574707, "rewards/rejected": 10.868074417114258, "step": 1680 }, { "epoch": 2.82, "grad_norm": 4.011273384094238, "learning_rate": 8.886867990128722e-08, "logits/chosen": -1.5743844509124756, "logits/rejected": -1.5511356592178345, "logps/chosen": -42.92226028442383, "logps/rejected": -239.4049072265625, "loss": 0.2702, "rewards/accuracies": 1.0, "rewards/chosen": 31.1669921875, "rewards/margins": 20.256168365478516, "rewards/rejected": 10.91082763671875, "step": 1690 }, { "epoch": 2.83, "grad_norm": 3.313141345977783, "learning_rate": 7.321185002006848e-08, "logits/chosen": -1.7491827011108398, "logits/rejected": -1.710752248764038, "logps/chosen": -36.811859130859375, "logps/rejected": -276.8099365234375, "loss": 0.2412, "rewards/accuracies": 1.0, "rewards/chosen": 30.183746337890625, "rewards/margins": 20.148881912231445, "rewards/rejected": 10.034868240356445, "step": 1700 }, { "epoch": 2.85, "grad_norm": 2.1705501079559326, "learning_rate": 5.906095002982615e-08, "logits/chosen": -1.6982301473617554, "logits/rejected": -1.6695678234100342, "logps/chosen": -39.64902877807617, "logps/rejected": -265.86370849609375, "loss": 0.258, "rewards/accuracies": 1.0, "rewards/chosen": 31.876617431640625, "rewards/margins": 21.336835861206055, "rewards/rejected": 10.539777755737305, "step": 1710 }, { "epoch": 2.87, "grad_norm": 3.416759729385376, "learning_rate": 4.642030531712582e-08, "logits/chosen": -1.6781730651855469, "logits/rejected": -1.6320860385894775, "logps/chosen": -44.278289794921875, "logps/rejected": -270.3473205566406, "loss": 0.2723, "rewards/accuracies": 1.0, "rewards/chosen": 31.628204345703125, "rewards/margins": 20.757869720458984, "rewards/rejected": 10.87033748626709, "step": 1720 }, { "epoch": 2.88, "grad_norm": 2.6415388584136963, "learning_rate": 3.5293779641508156e-08, "logits/chosen": -1.7743009328842163, "logits/rejected": -1.7475459575653076, "logps/chosen": -41.21794891357422, "logps/rejected": -267.9061584472656, "loss": 0.2469, "rewards/accuracies": 1.0, "rewards/chosen": 31.46852684020996, "rewards/margins": 19.723846435546875, "rewards/rejected": 11.74467945098877, "step": 1730 }, { "epoch": 2.9, "grad_norm": 3.2879960536956787, "learning_rate": 2.5684773954482433e-08, "logits/chosen": -1.7669696807861328, "logits/rejected": -1.705436110496521, "logps/chosen": -38.20746612548828, "logps/rejected": -279.90460205078125, "loss": 0.2434, "rewards/accuracies": 1.0, "rewards/chosen": 31.641986846923828, "rewards/margins": 21.152555465698242, "rewards/rejected": 10.489428520202637, "step": 1740 }, { "epoch": 2.92, "grad_norm": 2.513577461242676, "learning_rate": 1.7596225359988728e-08, "logits/chosen": -1.8220970630645752, "logits/rejected": -1.7889318466186523, "logps/chosen": -40.94774627685547, "logps/rejected": -259.34320068359375, "loss": 0.2511, "rewards/accuracies": 1.0, "rewards/chosen": 30.275482177734375, "rewards/margins": 19.547029495239258, "rewards/rejected": 10.728452682495117, "step": 1750 }, { "epoch": 2.93, "grad_norm": 3.861309766769409, "learning_rate": 1.1030606216637097e-08, "logits/chosen": -1.4674508571624756, "logits/rejected": -1.425626277923584, "logps/chosen": -48.60122299194336, "logps/rejected": -243.1677703857422, "loss": 0.3138, "rewards/accuracies": 1.0, "rewards/chosen": 28.8472957611084, "rewards/margins": 19.466501235961914, "rewards/rejected": 9.380796432495117, "step": 1760 }, { "epoch": 2.95, "grad_norm": 3.0707430839538574, "learning_rate": 5.989923382003216e-09, "logits/chosen": -1.5614324808120728, "logits/rejected": -1.5531564950942993, "logps/chosen": -50.88349151611328, "logps/rejected": -241.66943359375, "loss": 0.3286, "rewards/accuracies": 1.0, "rewards/chosen": 29.8480167388916, "rewards/margins": 17.89462661743164, "rewards/rejected": 11.953387260437012, "step": 1770 }, { "epoch": 2.97, "grad_norm": 1.1481070518493652, "learning_rate": 2.4757175992079496e-09, "logits/chosen": -1.654233694076538, "logits/rejected": -1.6192829608917236, "logps/chosen": -43.71112060546875, "logps/rejected": -246.0106201171875, "loss": 0.2606, "rewards/accuracies": 1.0, "rewards/chosen": 30.80698013305664, "rewards/margins": 19.817432403564453, "rewards/rejected": 10.989545822143555, "step": 1780 }, { "epoch": 2.98, "grad_norm": 1.825456976890564, "learning_rate": 4.890630259724027e-10, "logits/chosen": -1.703809380531311, "logits/rejected": -1.6901493072509766, "logps/chosen": -53.00714874267578, "logps/rejected": -266.7369384765625, "loss": 0.3145, "rewards/accuracies": 1.0, "rewards/chosen": 30.395549774169922, "rewards/margins": 19.31685447692871, "rewards/rejected": 11.078699111938477, "step": 1790 }, { "epoch": 3.0, "step": 1797, "total_flos": 5.920748380997222e+18, "train_loss": 0.5555986505650652, "train_runtime": 48641.0087, "train_samples_per_second": 0.296, "train_steps_per_second": 0.037 } ], "logging_steps": 10, "max_steps": 1797, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 200, "total_flos": 5.920748380997222e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }