{ "best_metric": 0.42603132128715515, "best_model_checkpoint": "./mistral/20-04-24-Weni-WeniGPT-Agents-Mistral-1.0.6-SFT-1.0.5-DPO_Experiment on DPO with other hyperparameters and best SFT model of WeniGPT-2_max_steps-366_batch_4_2024-04-20_ppid_9/checkpoint-360", "epoch": 5.853658536585366, "eval_steps": 30, "global_step": 360, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.16, "grad_norm": 8.378021240234375, "learning_rate": 4.0909090909090915e-06, "logits/chosen": -1.830958604812622, "logits/rejected": -1.8507845401763916, "logps/chosen": -28.701984405517578, "logps/rejected": -54.28569793701172, "loss": 0.6924, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 0.0008967495523393154, "rewards/margins": 0.0014666033675894141, "rewards/rejected": -0.0005698538152500987, "step": 10 }, { "epoch": 0.33, "grad_norm": 5.193418502807617, "learning_rate": 4.887323943661972e-06, "logits/chosen": -1.7550897598266602, "logits/rejected": -1.770708680152893, "logps/chosen": -47.344207763671875, "logps/rejected": -64.0368423461914, "loss": 0.6852, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.017231885343790054, "rewards/margins": 0.01606021076440811, "rewards/rejected": 0.0011716745793819427, "step": 20 }, { "epoch": 0.49, "grad_norm": 7.308932304382324, "learning_rate": 4.746478873239437e-06, "logits/chosen": -1.781267762184143, "logits/rejected": -1.8114898204803467, "logps/chosen": -54.274559020996094, "logps/rejected": -95.20500183105469, "loss": 0.6635, "rewards/accuracies": 0.5, "rewards/chosen": 0.0641159638762474, "rewards/margins": 0.061691801995038986, "rewards/rejected": 0.0024241588544100523, "step": 30 }, { "epoch": 0.49, "eval_logits/chosen": -1.7831767797470093, "eval_logits/rejected": -1.8043663501739502, "eval_logps/chosen": -55.16960906982422, "eval_logps/rejected": -97.32585144042969, "eval_loss": 0.6523757576942444, "eval_rewards/accuracies": 0.4642857015132904, "eval_rewards/chosen": 0.09036973863840103, "eval_rewards/margins": 0.08673857897520065, "eval_rewards/rejected": 0.0036311547737568617, "eval_runtime": 8.141, "eval_samples_per_second": 3.439, "eval_steps_per_second": 1.72, "step": 30 }, { "epoch": 0.65, "grad_norm": 0.0, "learning_rate": 4.6056338028169015e-06, "logits/chosen": -1.889905333518982, "logits/rejected": -1.9024461507797241, "logps/chosen": -27.918941497802734, "logps/rejected": -42.093284606933594, "loss": 0.668, "rewards/accuracies": 0.25, "rewards/chosen": 0.054457180202007294, "rewards/margins": 0.0539846234023571, "rewards/rejected": 0.0004725646285805851, "step": 40 }, { "epoch": 0.81, "grad_norm": 8.53225326538086, "learning_rate": 4.464788732394367e-06, "logits/chosen": -1.8278567790985107, "logits/rejected": -1.849957823753357, "logps/chosen": -43.8238639831543, "logps/rejected": -68.02179718017578, "loss": 0.6358, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.13941256701946259, "rewards/margins": 0.13133978843688965, "rewards/rejected": 0.008072790689766407, "step": 50 }, { "epoch": 0.98, "grad_norm": 9.436968803405762, "learning_rate": 4.3239436619718315e-06, "logits/chosen": -1.805991768836975, "logits/rejected": -1.8437427282333374, "logps/chosen": -43.8873291015625, "logps/rejected": -95.2943115234375, "loss": 0.6026, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.18793432414531708, "rewards/margins": 0.21308371424674988, "rewards/rejected": -0.025149401277303696, "step": 60 }, { "epoch": 0.98, "eval_logits/chosen": -1.7877694368362427, "eval_logits/rejected": -1.8098936080932617, "eval_logps/chosen": -53.567203521728516, "eval_logps/rejected": -97.33795928955078, "eval_loss": 0.5890871286392212, "eval_rewards/accuracies": 0.4642857015132904, "eval_rewards/chosen": 0.25061002373695374, "eval_rewards/margins": 0.2481890469789505, "eval_rewards/rejected": 0.002420984674245119, "eval_runtime": 8.1404, "eval_samples_per_second": 3.44, "eval_steps_per_second": 1.72, "step": 60 }, { "epoch": 1.14, "grad_norm": 0.0, "learning_rate": 4.183098591549296e-06, "logits/chosen": -1.8344879150390625, "logits/rejected": -1.8489716053009033, "logps/chosen": -40.38930892944336, "logps/rejected": -60.9084358215332, "loss": 0.6031, "rewards/accuracies": 0.375, "rewards/chosen": 0.19739331305027008, "rewards/margins": 0.22638121247291565, "rewards/rejected": -0.028987903147935867, "step": 70 }, { "epoch": 1.3, "grad_norm": 5.49536657333374, "learning_rate": 4.042253521126761e-06, "logits/chosen": -1.7903095483779907, "logits/rejected": -1.8362411260604858, "logps/chosen": -44.288116455078125, "logps/rejected": -90.21073913574219, "loss": 0.5357, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.34061312675476074, "rewards/margins": 0.40679749846458435, "rewards/rejected": -0.06618441641330719, "step": 80 }, { "epoch": 1.46, "grad_norm": 13.401692390441895, "learning_rate": 3.901408450704225e-06, "logits/chosen": -1.8004281520843506, "logits/rejected": -1.8247934579849243, "logps/chosen": -42.32465362548828, "logps/rejected": -70.9749984741211, "loss": 0.5387, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.3678433299064636, "rewards/margins": 0.4186524450778961, "rewards/rejected": -0.05080908536911011, "step": 90 }, { "epoch": 1.46, "eval_logits/chosen": -1.7943389415740967, "eval_logits/rejected": -1.8181126117706299, "eval_logps/chosen": -51.677486419677734, "eval_logps/rejected": -97.63689422607422, "eval_loss": 0.529485821723938, "eval_rewards/accuracies": 0.4642857015132904, "eval_rewards/chosen": 0.4395819306373596, "eval_rewards/margins": 0.4670555889606476, "eval_rewards/rejected": -0.027473628520965576, "eval_runtime": 8.1412, "eval_samples_per_second": 3.439, "eval_steps_per_second": 1.72, "step": 90 }, { "epoch": 1.63, "grad_norm": 5.040858745574951, "learning_rate": 3.7605633802816903e-06, "logits/chosen": -1.8601042032241821, "logits/rejected": -1.8790462017059326, "logps/chosen": -43.77570343017578, "logps/rejected": -70.64997863769531, "loss": 0.5466, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.36673134565353394, "rewards/margins": 0.42903366684913635, "rewards/rejected": -0.06230226159095764, "step": 100 }, { "epoch": 1.79, "grad_norm": 11.182683944702148, "learning_rate": 3.6197183098591553e-06, "logits/chosen": -1.8602203130722046, "logits/rejected": -1.8786903619766235, "logps/chosen": -29.601736068725586, "logps/rejected": -66.1338882446289, "loss": 0.6003, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.3122637867927551, "rewards/margins": 0.2756831645965576, "rewards/rejected": 0.03658062964677811, "step": 110 }, { "epoch": 1.95, "grad_norm": 3.9169583320617676, "learning_rate": 3.47887323943662e-06, "logits/chosen": -1.8304624557495117, "logits/rejected": -1.8451646566390991, "logps/chosen": -31.413599014282227, "logps/rejected": -56.841880798339844, "loss": 0.6033, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.21899382770061493, "rewards/margins": 0.2744571566581726, "rewards/rejected": -0.05546332150697708, "step": 120 }, { "epoch": 1.95, "eval_logits/chosen": -1.80086350440979, "eval_logits/rejected": -1.8260576725006104, "eval_logps/chosen": -50.32191848754883, "eval_logps/rejected": -98.02101135253906, "eval_loss": 0.49604225158691406, "eval_rewards/accuracies": 0.4642857015132904, "eval_rewards/chosen": 0.5751391053199768, "eval_rewards/margins": 0.6410244107246399, "eval_rewards/rejected": -0.0658852607011795, "eval_runtime": 8.1445, "eval_samples_per_second": 3.438, "eval_steps_per_second": 1.719, "step": 120 }, { "epoch": 2.11, "grad_norm": 1.4047716856002808, "learning_rate": 3.338028169014085e-06, "logits/chosen": -1.8776130676269531, "logits/rejected": -1.8995519876480103, "logps/chosen": -22.69371795654297, "logps/rejected": -53.5282096862793, "loss": 0.5611, "rewards/accuracies": 0.25, "rewards/chosen": 0.35938918590545654, "rewards/margins": 0.5045264959335327, "rewards/rejected": -0.14513733983039856, "step": 130 }, { "epoch": 2.28, "grad_norm": 0.7528722882270813, "learning_rate": 3.1971830985915496e-06, "logits/chosen": -1.8126357793807983, "logits/rejected": -1.832371711730957, "logps/chosen": -38.33379364013672, "logps/rejected": -67.96979522705078, "loss": 0.5142, "rewards/accuracies": 0.375, "rewards/chosen": 0.5568062663078308, "rewards/margins": 0.6818712949752808, "rewards/rejected": -0.12506499886512756, "step": 140 }, { "epoch": 2.44, "grad_norm": 3.405579090118408, "learning_rate": 3.056338028169014e-06, "logits/chosen": -1.8196109533309937, "logits/rejected": -1.8556429147720337, "logps/chosen": -36.78864669799805, "logps/rejected": -83.05890655517578, "loss": 0.5042, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.542107105255127, "rewards/margins": 0.6411095857620239, "rewards/rejected": -0.09900249540805817, "step": 150 }, { "epoch": 2.44, "eval_logits/chosen": -1.805869698524475, "eval_logits/rejected": -1.8330577611923218, "eval_logps/chosen": -49.10601043701172, "eval_logps/rejected": -98.84068298339844, "eval_loss": 0.4709201455116272, "eval_rewards/accuracies": 0.4642857015132904, "eval_rewards/chosen": 0.6967297196388245, "eval_rewards/margins": 0.8445812463760376, "eval_rewards/rejected": -0.1478516012430191, "eval_runtime": 8.1382, "eval_samples_per_second": 3.441, "eval_steps_per_second": 1.72, "step": 150 }, { "epoch": 2.6, "grad_norm": 7.778740882873535, "learning_rate": 2.915492957746479e-06, "logits/chosen": -1.848589301109314, "logits/rejected": -1.8790754079818726, "logps/chosen": -36.49171447753906, "logps/rejected": -72.55968475341797, "loss": 0.4927, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.49555450677871704, "rewards/margins": 0.6891830563545227, "rewards/rejected": -0.1936284601688385, "step": 160 }, { "epoch": 2.76, "grad_norm": 4.058627605438232, "learning_rate": 2.774647887323944e-06, "logits/chosen": -1.812421441078186, "logits/rejected": -1.8415311574935913, "logps/chosen": -45.62999725341797, "logps/rejected": -87.85527038574219, "loss": 0.4541, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.7084562182426453, "rewards/margins": 0.9553689956665039, "rewards/rejected": -0.24691279232501984, "step": 170 }, { "epoch": 2.93, "grad_norm": 0.0, "learning_rate": 2.6338028169014084e-06, "logits/chosen": -1.8475942611694336, "logits/rejected": -1.8678725957870483, "logps/chosen": -40.53328323364258, "logps/rejected": -64.86616516113281, "loss": 0.5087, "rewards/accuracies": 0.375, "rewards/chosen": 0.5022943019866943, "rewards/margins": 0.7252141833305359, "rewards/rejected": -0.22291991114616394, "step": 180 }, { "epoch": 2.93, "eval_logits/chosen": -1.8136398792266846, "eval_logits/rejected": -1.8424787521362305, "eval_logps/chosen": -48.19547653198242, "eval_logps/rejected": -99.7900161743164, "eval_loss": 0.4541548192501068, "eval_rewards/accuracies": 0.4642857015132904, "eval_rewards/chosen": 0.7877826690673828, "eval_rewards/margins": 1.0305674076080322, "eval_rewards/rejected": -0.24278469383716583, "eval_runtime": 8.1397, "eval_samples_per_second": 3.44, "eval_steps_per_second": 1.72, "step": 180 }, { "epoch": 3.09, "grad_norm": 2.1618106365203857, "learning_rate": 2.4929577464788734e-06, "logits/chosen": -1.876151442527771, "logits/rejected": -1.9132931232452393, "logps/chosen": -38.02617645263672, "logps/rejected": -84.4028549194336, "loss": 0.4372, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.723468005657196, "rewards/margins": 1.1590527296066284, "rewards/rejected": -0.43558478355407715, "step": 190 }, { "epoch": 3.25, "grad_norm": 0.0, "learning_rate": 2.352112676056338e-06, "logits/chosen": -1.8977773189544678, "logits/rejected": -1.9120800495147705, "logps/chosen": -38.097923278808594, "logps/rejected": -55.17757034301758, "loss": 0.4778, "rewards/accuracies": 0.375, "rewards/chosen": 0.5927585959434509, "rewards/margins": 0.8746024370193481, "rewards/rejected": -0.2818438410758972, "step": 200 }, { "epoch": 3.41, "grad_norm": 7.095726013183594, "learning_rate": 2.211267605633803e-06, "logits/chosen": -1.8508259057998657, "logits/rejected": -1.8876402378082275, "logps/chosen": -33.23273468017578, "logps/rejected": -79.0272445678711, "loss": 0.4874, "rewards/accuracies": 0.375, "rewards/chosen": 0.5798195600509644, "rewards/margins": 0.9200228452682495, "rewards/rejected": -0.34020328521728516, "step": 210 }, { "epoch": 3.41, "eval_logits/chosen": -1.821912407875061, "eval_logits/rejected": -1.8520457744598389, "eval_logps/chosen": -47.6314697265625, "eval_logps/rejected": -100.92195129394531, "eval_loss": 0.4427572786808014, "eval_rewards/accuracies": 0.4642857015132904, "eval_rewards/chosen": 0.8441829681396484, "eval_rewards/margins": 1.2001608610153198, "eval_rewards/rejected": -0.35597795248031616, "eval_runtime": 8.1451, "eval_samples_per_second": 3.438, "eval_steps_per_second": 1.719, "step": 210 }, { "epoch": 3.58, "grad_norm": 4.825575351715088, "learning_rate": 2.0704225352112676e-06, "logits/chosen": -1.889478325843811, "logits/rejected": -1.9106714725494385, "logps/chosen": -30.769512176513672, "logps/rejected": -68.92756652832031, "loss": 0.5277, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.379320353269577, "rewards/margins": 0.6602964401245117, "rewards/rejected": -0.28097596764564514, "step": 220 }, { "epoch": 3.74, "grad_norm": 5.236915588378906, "learning_rate": 1.9295774647887326e-06, "logits/chosen": -1.8926284313201904, "logits/rejected": -1.9087079763412476, "logps/chosen": -36.48774719238281, "logps/rejected": -59.29833221435547, "loss": 0.5176, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.6325365304946899, "rewards/margins": 0.867927074432373, "rewards/rejected": -0.2353905737400055, "step": 230 }, { "epoch": 3.9, "grad_norm": 1.3737443685531616, "learning_rate": 1.7887323943661974e-06, "logits/chosen": -1.7782018184661865, "logits/rejected": -1.8105701208114624, "logps/chosen": -41.42538833618164, "logps/rejected": -93.73129272460938, "loss": 0.4229, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.8450711369514465, "rewards/margins": 1.3813583850860596, "rewards/rejected": -0.5362871885299683, "step": 240 }, { "epoch": 3.9, "eval_logits/chosen": -1.826602816581726, "eval_logits/rejected": -1.8575078248977661, "eval_logps/chosen": -47.322914123535156, "eval_logps/rejected": -101.7520980834961, "eval_loss": 0.4358247220516205, "eval_rewards/accuracies": 0.4642857015132904, "eval_rewards/chosen": 0.8750395178794861, "eval_rewards/margins": 1.3140336275100708, "eval_rewards/rejected": -0.4389941692352295, "eval_runtime": 8.1403, "eval_samples_per_second": 3.44, "eval_steps_per_second": 1.72, "step": 240 }, { "epoch": 4.07, "grad_norm": 1.977386713027954, "learning_rate": 1.647887323943662e-06, "logits/chosen": -1.875792145729065, "logits/rejected": -1.8937476873397827, "logps/chosen": -25.06104278564453, "logps/rejected": -46.700584411621094, "loss": 0.5274, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.42553478479385376, "rewards/margins": 0.7891250252723694, "rewards/rejected": -0.3635903000831604, "step": 250 }, { "epoch": 4.23, "grad_norm": 3.320791244506836, "learning_rate": 1.5070422535211269e-06, "logits/chosen": -1.7908179759979248, "logits/rejected": -1.8309694528579712, "logps/chosen": -54.056663513183594, "logps/rejected": -108.03240966796875, "loss": 0.3569, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 1.07839035987854, "rewards/margins": 1.643531084060669, "rewards/rejected": -0.5651407837867737, "step": 260 }, { "epoch": 4.39, "grad_norm": 4.999856948852539, "learning_rate": 1.3661971830985919e-06, "logits/chosen": -1.9177863597869873, "logits/rejected": -1.9476194381713867, "logps/chosen": -22.58294105529785, "logps/rejected": -61.99756622314453, "loss": 0.5295, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.5058903694152832, "rewards/margins": 0.8186748623847961, "rewards/rejected": -0.31278449296951294, "step": 270 }, { "epoch": 4.39, "eval_logits/chosen": -1.8289211988449097, "eval_logits/rejected": -1.860676646232605, "eval_logps/chosen": -47.04714584350586, "eval_logps/rejected": -102.3218994140625, "eval_loss": 0.43130752444267273, "eval_rewards/accuracies": 0.4642857015132904, "eval_rewards/chosen": 0.9026166200637817, "eval_rewards/margins": 1.3985893726348877, "eval_rewards/rejected": -0.4959728717803955, "eval_runtime": 8.1397, "eval_samples_per_second": 3.44, "eval_steps_per_second": 1.72, "step": 270 }, { "epoch": 4.55, "grad_norm": 0.36910170316696167, "learning_rate": 1.2253521126760565e-06, "logits/chosen": -1.881696105003357, "logits/rejected": -1.9073266983032227, "logps/chosen": -33.28648376464844, "logps/rejected": -68.246337890625, "loss": 0.4962, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.5577932000160217, "rewards/margins": 1.0508588552474976, "rewards/rejected": -0.4930656850337982, "step": 280 }, { "epoch": 4.72, "grad_norm": 11.332355499267578, "learning_rate": 1.084507042253521e-06, "logits/chosen": -1.8866857290267944, "logits/rejected": -1.900857925415039, "logps/chosen": -40.804874420166016, "logps/rejected": -71.67508697509766, "loss": 0.4851, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.7005030512809753, "rewards/margins": 1.2071340084075928, "rewards/rejected": -0.5066308379173279, "step": 290 }, { "epoch": 4.88, "grad_norm": 3.657494306564331, "learning_rate": 9.43661971830986e-07, "logits/chosen": -1.9023106098175049, "logits/rejected": -1.9253908395767212, "logps/chosen": -18.57657814025879, "logps/rejected": -53.88740158081055, "loss": 0.5466, "rewards/accuracies": 0.25, "rewards/chosen": 0.3226935565471649, "rewards/margins": 0.6567031145095825, "rewards/rejected": -0.33400958776474, "step": 300 }, { "epoch": 4.88, "eval_logits/chosen": -1.8308794498443604, "eval_logits/rejected": -1.8629435300827026, "eval_logps/chosen": -46.95443344116211, "eval_logps/rejected": -102.74605560302734, "eval_loss": 0.4291366934776306, "eval_rewards/accuracies": 0.4642857015132904, "eval_rewards/chosen": 0.9118875861167908, "eval_rewards/margins": 1.4502772092819214, "eval_rewards/rejected": -0.5383896827697754, "eval_runtime": 8.1441, "eval_samples_per_second": 3.438, "eval_steps_per_second": 1.719, "step": 300 }, { "epoch": 5.04, "grad_norm": 4.444954872131348, "learning_rate": 8.028169014084508e-07, "logits/chosen": -1.835021734237671, "logits/rejected": -1.858599066734314, "logps/chosen": -42.14970016479492, "logps/rejected": -86.938720703125, "loss": 0.4128, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.8941621780395508, "rewards/margins": 1.572546362876892, "rewards/rejected": -0.6783844232559204, "step": 310 }, { "epoch": 5.2, "grad_norm": 0.4182775616645813, "learning_rate": 6.619718309859155e-07, "logits/chosen": -1.8859401941299438, "logits/rejected": -1.910548448562622, "logps/chosen": -34.28424835205078, "logps/rejected": -77.3191146850586, "loss": 0.4465, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.739007294178009, "rewards/margins": 1.3678598403930664, "rewards/rejected": -0.6288524866104126, "step": 320 }, { "epoch": 5.37, "grad_norm": 2.8709957859973656e-06, "learning_rate": 5.211267605633803e-07, "logits/chosen": -1.7752397060394287, "logits/rejected": -1.8195409774780273, "logps/chosen": -42.48664855957031, "logps/rejected": -97.59371185302734, "loss": 0.4339, "rewards/accuracies": 0.5, "rewards/chosen": 0.8835798501968384, "rewards/margins": 1.3967663049697876, "rewards/rejected": -0.513186514377594, "step": 330 }, { "epoch": 5.37, "eval_logits/chosen": -1.8319826126098633, "eval_logits/rejected": -1.864353895187378, "eval_logps/chosen": -46.921607971191406, "eval_logps/rejected": -103.26231384277344, "eval_loss": 0.42683711647987366, "eval_rewards/accuracies": 0.4642857015132904, "eval_rewards/chosen": 0.9151698350906372, "eval_rewards/margins": 1.505185842514038, "eval_rewards/rejected": -0.5900159478187561, "eval_runtime": 8.1406, "eval_samples_per_second": 3.44, "eval_steps_per_second": 1.72, "step": 330 }, { "epoch": 5.53, "grad_norm": 0.23427560925483704, "learning_rate": 3.8028169014084507e-07, "logits/chosen": -1.8918192386627197, "logits/rejected": -1.9169318675994873, "logps/chosen": -35.16785430908203, "logps/rejected": -71.60049438476562, "loss": 0.4617, "rewards/accuracies": 0.375, "rewards/chosen": 0.5324742197990417, "rewards/margins": 1.1221383810043335, "rewards/rejected": -0.5896641612052917, "step": 340 }, { "epoch": 5.69, "grad_norm": 0.0, "learning_rate": 2.394366197183099e-07, "logits/chosen": -1.857642412185669, "logits/rejected": -1.888279676437378, "logps/chosen": -37.31398010253906, "logps/rejected": -90.64387512207031, "loss": 0.4569, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.7042752504348755, "rewards/margins": 1.4055907726287842, "rewards/rejected": -0.7013154625892639, "step": 350 }, { "epoch": 5.85, "grad_norm": 11.415884017944336, "learning_rate": 9.859154929577466e-08, "logits/chosen": -1.9216959476470947, "logits/rejected": -1.9308369159698486, "logps/chosen": -25.689884185791016, "logps/rejected": -36.790706634521484, "loss": 0.5438, "rewards/accuracies": 0.25, "rewards/chosen": 0.4890199303627014, "rewards/margins": 0.789040207862854, "rewards/rejected": -0.3000202775001526, "step": 360 }, { "epoch": 5.85, "eval_logits/chosen": -1.832722544670105, "eval_logits/rejected": -1.8652076721191406, "eval_logps/chosen": -46.90084457397461, "eval_logps/rejected": -103.44039154052734, "eval_loss": 0.42603132128715515, "eval_rewards/accuracies": 0.4642857015132904, "eval_rewards/chosen": 0.9172464609146118, "eval_rewards/margins": 1.5250685214996338, "eval_rewards/rejected": -0.6078222990036011, "eval_runtime": 8.1439, "eval_samples_per_second": 3.438, "eval_steps_per_second": 1.719, "step": 360 } ], "logging_steps": 10, "max_steps": 366, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 90, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }