diff --git "a/checkpoint-2000/trainer_state.json" "b/checkpoint-2000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-2000/trainer_state.json" @@ -0,0 +1,28021 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.617372812039915, + "eval_steps": 500, + "global_step": 2000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 5.000000000000001e-07, + "logits/chosen": -1.977054476737976, + "logits/rejected": -2.017892599105835, + "logps/chosen": -169.97320556640625, + "logps/rejected": -186.7821807861328, + "loss": 0.7082, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.016062308102846146, + "rewards/margins": -0.02699420601129532, + "rewards/rejected": 0.010931899771094322, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 1.0000000000000002e-06, + "logits/chosen": -1.8305251598358154, + "logits/rejected": -1.8582998514175415, + "logps/chosen": -155.8516082763672, + "logps/rejected": -165.23692321777344, + "loss": 0.6929, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0017804148374125361, + "rewards/margins": 0.004515504464507103, + "rewards/rejected": -0.0027350913733243942, + "step": 2 + }, + { + "epoch": 0.0, + "learning_rate": 1.5e-06, + "logits/chosen": -1.7455682754516602, + "logits/rejected": -1.7944730520248413, + "logps/chosen": -158.9869842529297, + "logps/rejected": -179.4861602783203, + "loss": 0.6886, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.00722665898501873, + "rewards/margins": 0.01148004550486803, + "rewards/rejected": -0.004253389313817024, + "step": 3 + }, + { + "epoch": 0.01, + "learning_rate": 2.0000000000000003e-06, + "logits/chosen": -1.8648240566253662, + "logits/rejected": -1.8583375215530396, + "logps/chosen": -186.27041625976562, + "logps/rejected": -174.98153686523438, + "loss": 0.6867, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.013353967107832432, + "rewards/margins": 0.014140583574771881, + "rewards/rejected": -0.0007866150699555874, + "step": 4 + }, + { + "epoch": 0.01, + "learning_rate": 2.5e-06, + "logits/chosen": -1.5987751483917236, + "logits/rejected": -1.602742075920105, + "logps/chosen": -166.3009796142578, + "logps/rejected": -182.16493225097656, + "loss": 0.6973, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.01031036488711834, + "rewards/margins": -0.007737827021628618, + "rewards/rejected": -0.0025725378654897213, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 3e-06, + "logits/chosen": -1.4330253601074219, + "logits/rejected": -1.4852710962295532, + "logps/chosen": -170.77926635742188, + "logps/rejected": -203.38731384277344, + "loss": 0.6967, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0022835731506347656, + "rewards/margins": -0.005300428252667189, + "rewards/rejected": 0.003016853705048561, + "step": 6 + }, + { + "epoch": 0.01, + "learning_rate": 3.5000000000000004e-06, + "logits/chosen": -1.5559855699539185, + "logits/rejected": -1.5930315256118774, + "logps/chosen": -200.5897979736328, + "logps/rejected": -216.58615112304688, + "loss": 0.6832, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.015020323917269707, + "rewards/margins": 0.02334442362189293, + "rewards/rejected": -0.008324098773300648, + "step": 7 + }, + { + "epoch": 0.01, + "learning_rate": 4.000000000000001e-06, + "logits/chosen": -1.7643598318099976, + "logits/rejected": -1.7637088298797607, + "logps/chosen": -166.2308349609375, + "logps/rejected": -167.15399169921875, + "loss": 0.6973, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.03954277187585831, + "rewards/margins": -0.005299141630530357, + "rewards/rejected": -0.0342436321079731, + "step": 8 + }, + { + "epoch": 0.01, + "learning_rate": 4.5e-06, + "logits/chosen": -2.035048246383667, + "logits/rejected": -2.1090240478515625, + "logps/chosen": -192.5548095703125, + "logps/rejected": -188.7212371826172, + "loss": 0.6788, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.019565440714359283, + "rewards/margins": 0.030419450253248215, + "rewards/rejected": -0.010854003950953484, + "step": 9 + }, + { + "epoch": 0.01, + "learning_rate": 5e-06, + "logits/chosen": -1.78806471824646, + "logits/rejected": -1.771653175354004, + "logps/chosen": -169.3682403564453, + "logps/rejected": -163.67990112304688, + "loss": 0.6861, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0037752394564449787, + "rewards/margins": 0.01528622955083847, + "rewards/rejected": -0.01906147226691246, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 5.500000000000001e-06, + "logits/chosen": -1.6899704933166504, + "logits/rejected": -1.7716753482818604, + "logps/chosen": -197.17576599121094, + "logps/rejected": -232.6337127685547, + "loss": 0.7158, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.06386594474315643, + "rewards/margins": -0.04272947832942009, + "rewards/rejected": -0.02113647386431694, + "step": 11 + }, + { + "epoch": 0.02, + "learning_rate": 6e-06, + "logits/chosen": -1.8938390016555786, + "logits/rejected": -1.8334659337997437, + "logps/chosen": -150.29385375976562, + "logps/rejected": -159.55947875976562, + "loss": 0.6895, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0028306012973189354, + "rewards/margins": 0.01067290361970663, + "rewards/rejected": -0.00784230325371027, + "step": 12 + }, + { + "epoch": 0.02, + "learning_rate": 6.5000000000000004e-06, + "logits/chosen": -1.87659752368927, + "logits/rejected": -1.8801605701446533, + "logps/chosen": -205.460693359375, + "logps/rejected": -202.0462188720703, + "loss": 0.6984, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.0009463531896471977, + "rewards/margins": -0.007184028625488281, + "rewards/rejected": 0.008130382746458054, + "step": 13 + }, + { + "epoch": 0.02, + "learning_rate": 7.000000000000001e-06, + "logits/chosen": -1.9238759279251099, + "logits/rejected": -2.0003201961517334, + "logps/chosen": -184.6726531982422, + "logps/rejected": -177.62509155273438, + "loss": 0.6972, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.01391973439604044, + "rewards/margins": -0.004122593905776739, + "rewards/rejected": -0.009797144681215286, + "step": 14 + }, + { + "epoch": 0.02, + "learning_rate": 7.5e-06, + "logits/chosen": -1.569742202758789, + "logits/rejected": -1.5442943572998047, + "logps/chosen": -171.07496643066406, + "logps/rejected": -170.74981689453125, + "loss": 0.6992, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.020387031137943268, + "rewards/margins": -0.008263109251856804, + "rewards/rejected": -0.012123920023441315, + "step": 15 + }, + { + "epoch": 0.02, + "learning_rate": 8.000000000000001e-06, + "logits/chosen": -1.7774536609649658, + "logits/rejected": -1.8797023296356201, + "logps/chosen": -173.0279541015625, + "logps/rejected": -187.43557739257812, + "loss": 0.6999, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.03834056854248047, + "rewards/margins": -0.01055521797388792, + "rewards/rejected": -0.027785349637269974, + "step": 16 + }, + { + "epoch": 0.02, + "learning_rate": 8.500000000000002e-06, + "logits/chosen": -1.7320338487625122, + "logits/rejected": -1.6945910453796387, + "logps/chosen": -185.27392578125, + "logps/rejected": -189.87738037109375, + "loss": 0.6808, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.007571219466626644, + "rewards/margins": 0.025459958240389824, + "rewards/rejected": -0.017888737842440605, + "step": 17 + }, + { + "epoch": 0.02, + "learning_rate": 9e-06, + "logits/chosen": -1.7604248523712158, + "logits/rejected": -1.7776434421539307, + "logps/chosen": -196.05404663085938, + "logps/rejected": -190.14569091796875, + "loss": 0.6914, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.02323136478662491, + "rewards/margins": 0.00760660320520401, + "rewards/rejected": -0.03083796612918377, + "step": 18 + }, + { + "epoch": 0.02, + "learning_rate": 9.5e-06, + "logits/chosen": -1.59357488155365, + "logits/rejected": -1.5590327978134155, + "logps/chosen": -213.9495391845703, + "logps/rejected": -218.56654357910156, + "loss": 0.7222, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.05001959949731827, + "rewards/margins": -0.05468587949872017, + "rewards/rejected": 0.004666280932724476, + "step": 19 + }, + { + "epoch": 0.03, + "learning_rate": 1e-05, + "logits/chosen": -1.752557635307312, + "logits/rejected": -1.7027242183685303, + "logps/chosen": -213.31336975097656, + "logps/rejected": -204.09646606445312, + "loss": 0.6887, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.017501091584563255, + "rewards/margins": 0.010245682671666145, + "rewards/rejected": -0.027746770530939102, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 1.05e-05, + "logits/chosen": -1.9553875923156738, + "logits/rejected": -1.9184911251068115, + "logps/chosen": -175.35333251953125, + "logps/rejected": -180.54550170898438, + "loss": 0.655, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.021092725917696953, + "rewards/margins": 0.08075069636106491, + "rewards/rejected": -0.059657976031303406, + "step": 21 + }, + { + "epoch": 0.03, + "learning_rate": 1.1000000000000001e-05, + "logits/chosen": -1.8159900903701782, + "logits/rejected": -1.771599531173706, + "logps/chosen": -185.33059692382812, + "logps/rejected": -209.6474609375, + "loss": 0.6913, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.017464350908994675, + "rewards/margins": 0.006189251318573952, + "rewards/rejected": -0.023653600364923477, + "step": 22 + }, + { + "epoch": 0.03, + "learning_rate": 1.1500000000000002e-05, + "logits/chosen": -1.8995972871780396, + "logits/rejected": -1.9293156862258911, + "logps/chosen": -178.39755249023438, + "logps/rejected": -211.63937377929688, + "loss": 0.7191, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.05782761424779892, + "rewards/margins": -0.04498009383678436, + "rewards/rejected": -0.012847519479691982, + "step": 23 + }, + { + "epoch": 0.03, + "learning_rate": 1.2e-05, + "logits/chosen": -1.8924778699874878, + "logits/rejected": -1.8979182243347168, + "logps/chosen": -167.22109985351562, + "logps/rejected": -176.33663940429688, + "loss": 0.7028, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.02767309918999672, + "rewards/margins": -0.016704557463526726, + "rewards/rejected": -0.010968542657792568, + "step": 24 + }, + { + "epoch": 0.03, + "learning_rate": 1.25e-05, + "logits/chosen": -1.840967059135437, + "logits/rejected": -1.797666311264038, + "logps/chosen": -175.57766723632812, + "logps/rejected": -148.11917114257812, + "loss": 0.6967, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.01812169887125492, + "rewards/margins": -0.003334569279104471, + "rewards/rejected": -0.014787126332521439, + "step": 25 + }, + { + "epoch": 0.03, + "learning_rate": 1.3000000000000001e-05, + "logits/chosen": -1.8270690441131592, + "logits/rejected": -1.7811157703399658, + "logps/chosen": -165.993896484375, + "logps/rejected": -165.7574005126953, + "loss": 0.7151, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.06210968643426895, + "rewards/margins": -0.040364596992731094, + "rewards/rejected": -0.02174508571624756, + "step": 26 + }, + { + "epoch": 0.04, + "learning_rate": 1.3500000000000001e-05, + "logits/chosen": -1.877970576286316, + "logits/rejected": -1.9722176790237427, + "logps/chosen": -148.4381103515625, + "logps/rejected": -173.12579345703125, + "loss": 0.6835, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.022617867216467857, + "rewards/margins": 0.021304797381162643, + "rewards/rejected": -0.04392266273498535, + "step": 27 + }, + { + "epoch": 0.04, + "learning_rate": 1.4000000000000001e-05, + "logits/chosen": -1.922934889793396, + "logits/rejected": -1.9657589197158813, + "logps/chosen": -170.66554260253906, + "logps/rejected": -176.7419891357422, + "loss": 0.7316, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.10023985058069229, + "rewards/margins": -0.0725136250257492, + "rewards/rejected": -0.027726221829652786, + "step": 28 + }, + { + "epoch": 0.04, + "learning_rate": 1.45e-05, + "logits/chosen": -1.9069832563400269, + "logits/rejected": -1.8908956050872803, + "logps/chosen": -193.16102600097656, + "logps/rejected": -193.22003173828125, + "loss": 0.6847, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.005608892068266869, + "rewards/margins": 0.020983649417757988, + "rewards/rejected": -0.01537475548684597, + "step": 29 + }, + { + "epoch": 0.04, + "learning_rate": 1.5e-05, + "logits/chosen": -1.7016417980194092, + "logits/rejected": -1.7221649885177612, + "logps/chosen": -188.40786743164062, + "logps/rejected": -175.92909240722656, + "loss": 0.7016, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.09070225059986115, + "rewards/margins": -0.013078359887003899, + "rewards/rejected": -0.0776238888502121, + "step": 30 + }, + { + "epoch": 0.04, + "learning_rate": 1.55e-05, + "logits/chosen": -1.9492610692977905, + "logits/rejected": -1.9104335308074951, + "logps/chosen": -180.67147827148438, + "logps/rejected": -184.8843994140625, + "loss": 0.6708, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.019913675263524055, + "rewards/margins": 0.049428701400756836, + "rewards/rejected": -0.02951502799987793, + "step": 31 + }, + { + "epoch": 0.04, + "learning_rate": 1.6000000000000003e-05, + "logits/chosen": -1.8524138927459717, + "logits/rejected": -1.9383589029312134, + "logps/chosen": -159.52560424804688, + "logps/rejected": -170.27255249023438, + "loss": 0.7146, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.03338008001446724, + "rewards/margins": -0.03860168159008026, + "rewards/rejected": 0.00522160530090332, + "step": 32 + }, + { + "epoch": 0.04, + "learning_rate": 1.65e-05, + "logits/chosen": -1.7788478136062622, + "logits/rejected": -1.835086703300476, + "logps/chosen": -180.18177795410156, + "logps/rejected": -206.07110595703125, + "loss": 0.7059, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.05509199947118759, + "rewards/margins": -0.023075008764863014, + "rewards/rejected": -0.03201699256896973, + "step": 33 + }, + { + "epoch": 0.04, + "learning_rate": 1.7000000000000003e-05, + "logits/chosen": -1.8998254537582397, + "logits/rejected": -1.9138494729995728, + "logps/chosen": -194.03167724609375, + "logps/rejected": -203.6524658203125, + "loss": 0.6915, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.05050516128540039, + "rewards/margins": 0.007271335460245609, + "rewards/rejected": -0.057776499539613724, + "step": 34 + }, + { + "epoch": 0.05, + "learning_rate": 1.75e-05, + "logits/chosen": -1.9758315086364746, + "logits/rejected": -2.0455610752105713, + "logps/chosen": -153.28883361816406, + "logps/rejected": -162.89920043945312, + "loss": 0.7122, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.04079794883728027, + "rewards/margins": -0.034585997462272644, + "rewards/rejected": -0.006211946718394756, + "step": 35 + }, + { + "epoch": 0.05, + "learning_rate": 1.8e-05, + "logits/chosen": -1.597517967224121, + "logits/rejected": -1.631009817123413, + "logps/chosen": -160.9354248046875, + "logps/rejected": -168.85618591308594, + "loss": 0.6918, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.07598643749952316, + "rewards/margins": 0.005950784310698509, + "rewards/rejected": -0.08193722367286682, + "step": 36 + }, + { + "epoch": 0.05, + "learning_rate": 1.85e-05, + "logits/chosen": -1.7368295192718506, + "logits/rejected": -1.7420881986618042, + "logps/chosen": -169.4617462158203, + "logps/rejected": -184.20599365234375, + "loss": 0.6948, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.0290069580078125, + "rewards/margins": -0.0007306085899472237, + "rewards/rejected": -0.0282763484865427, + "step": 37 + }, + { + "epoch": 0.05, + "learning_rate": 1.9e-05, + "logits/chosen": -1.687361717224121, + "logits/rejected": -1.7176148891448975, + "logps/chosen": -174.06297302246094, + "logps/rejected": -192.69715881347656, + "loss": 0.6973, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09517102688550949, + "rewards/margins": -0.0029970891773700714, + "rewards/rejected": -0.09217393398284912, + "step": 38 + }, + { + "epoch": 0.05, + "learning_rate": 1.9500000000000003e-05, + "logits/chosen": -1.8842296600341797, + "logits/rejected": -1.9039793014526367, + "logps/chosen": -162.37741088867188, + "logps/rejected": -176.11697387695312, + "loss": 0.7109, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.13322168588638306, + "rewards/margins": -0.03214216232299805, + "rewards/rejected": -0.10107951611280441, + "step": 39 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "logits/chosen": -1.6896815299987793, + "logits/rejected": -1.7052747011184692, + "logps/chosen": -155.33111572265625, + "logps/rejected": -159.064208984375, + "loss": 0.6846, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.04464542865753174, + "rewards/margins": 0.025189755484461784, + "rewards/rejected": -0.06983518600463867, + "step": 40 + }, + { + "epoch": 0.05, + "learning_rate": 2.05e-05, + "logits/chosen": -1.7584484815597534, + "logits/rejected": -1.6827234029769897, + "logps/chosen": -161.47369384765625, + "logps/rejected": -166.0779571533203, + "loss": 0.6856, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.07150936126708984, + "rewards/margins": 0.019259024411439896, + "rewards/rejected": -0.09076839685440063, + "step": 41 + }, + { + "epoch": 0.05, + "learning_rate": 2.1e-05, + "logits/chosen": -2.0230112075805664, + "logits/rejected": -1.9628382921218872, + "logps/chosen": -160.32073974609375, + "logps/rejected": -182.98248291015625, + "loss": 0.6857, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.09749487042427063, + "rewards/margins": 0.01751875691115856, + "rewards/rejected": -0.11501362919807434, + "step": 42 + }, + { + "epoch": 0.06, + "learning_rate": 2.15e-05, + "logits/chosen": -1.8469973802566528, + "logits/rejected": -1.814754843711853, + "logps/chosen": -194.247314453125, + "logps/rejected": -204.80491638183594, + "loss": 0.7142, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.15520897507667542, + "rewards/margins": -0.03922419250011444, + "rewards/rejected": -0.11598476767539978, + "step": 43 + }, + { + "epoch": 0.06, + "learning_rate": 2.2000000000000003e-05, + "logits/chosen": -1.9133589267730713, + "logits/rejected": -1.9711928367614746, + "logps/chosen": -186.54978942871094, + "logps/rejected": -189.23045349121094, + "loss": 0.7238, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.1358685940504074, + "rewards/margins": -0.05334620922803879, + "rewards/rejected": -0.08252239227294922, + "step": 44 + }, + { + "epoch": 0.06, + "learning_rate": 2.25e-05, + "logits/chosen": -1.6977447271347046, + "logits/rejected": -1.7883001565933228, + "logps/chosen": -178.20858764648438, + "logps/rejected": -201.03770446777344, + "loss": 0.6465, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.053537700325250626, + "rewards/margins": 0.1018117368221283, + "rewards/rejected": -0.15534944832324982, + "step": 45 + }, + { + "epoch": 0.06, + "learning_rate": 2.3000000000000003e-05, + "logits/chosen": -1.9271280765533447, + "logits/rejected": -1.9463679790496826, + "logps/chosen": -176.5238800048828, + "logps/rejected": -166.8386993408203, + "loss": 0.7159, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09149947762489319, + "rewards/margins": -0.04024248570203781, + "rewards/rejected": -0.05125699192285538, + "step": 46 + }, + { + "epoch": 0.06, + "learning_rate": 2.35e-05, + "logits/chosen": -1.7680522203445435, + "logits/rejected": -1.668277621269226, + "logps/chosen": -189.94366455078125, + "logps/rejected": -177.8812255859375, + "loss": 0.6647, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08811293542385101, + "rewards/margins": 0.06799888610839844, + "rewards/rejected": -0.15611180663108826, + "step": 47 + }, + { + "epoch": 0.06, + "learning_rate": 2.4e-05, + "logits/chosen": -1.5737125873565674, + "logits/rejected": -1.626237154006958, + "logps/chosen": -177.7281036376953, + "logps/rejected": -185.34068298339844, + "loss": 0.7038, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.12158739566802979, + "rewards/margins": -0.004673934541642666, + "rewards/rejected": -0.11691347509622574, + "step": 48 + }, + { + "epoch": 0.06, + "learning_rate": 2.45e-05, + "logits/chosen": -1.487099528312683, + "logits/rejected": -1.5200517177581787, + "logps/chosen": -152.00973510742188, + "logps/rejected": -163.13150024414062, + "loss": 0.7211, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.12497053295373917, + "rewards/margins": -0.05033881962299347, + "rewards/rejected": -0.0746317207813263, + "step": 49 + }, + { + "epoch": 0.07, + "learning_rate": 2.5e-05, + "logits/chosen": -1.751338243484497, + "logits/rejected": -1.7326526641845703, + "logps/chosen": -180.03758239746094, + "logps/rejected": -191.03634643554688, + "loss": 0.7029, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.12404017895460129, + "rewards/margins": -0.013475272804498672, + "rewards/rejected": -0.11056490242481232, + "step": 50 + }, + { + "epoch": 0.07, + "learning_rate": 2.5500000000000003e-05, + "logits/chosen": -1.9426243305206299, + "logits/rejected": -1.9601188898086548, + "logps/chosen": -158.4928741455078, + "logps/rejected": -164.06317138671875, + "loss": 0.7142, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.12027917802333832, + "rewards/margins": -0.028645988553762436, + "rewards/rejected": -0.09163318574428558, + "step": 51 + }, + { + "epoch": 0.07, + "learning_rate": 2.6000000000000002e-05, + "logits/chosen": -1.5965681076049805, + "logits/rejected": -1.597716212272644, + "logps/chosen": -195.06454467773438, + "logps/rejected": -193.9747314453125, + "loss": 0.6905, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.14499236643314362, + "rewards/margins": 0.02140347845852375, + "rewards/rejected": -0.16639582812786102, + "step": 52 + }, + { + "epoch": 0.07, + "learning_rate": 2.6500000000000004e-05, + "logits/chosen": -1.9194509983062744, + "logits/rejected": -1.883784294128418, + "logps/chosen": -161.39295959472656, + "logps/rejected": -167.61610412597656, + "loss": 0.6965, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.11441681534051895, + "rewards/margins": -0.00019459612667560577, + "rewards/rejected": -0.1142222210764885, + "step": 53 + }, + { + "epoch": 0.07, + "learning_rate": 2.7000000000000002e-05, + "logits/chosen": -1.7368502616882324, + "logits/rejected": -1.7216427326202393, + "logps/chosen": -205.18130493164062, + "logps/rejected": -191.90237426757812, + "loss": 0.66, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.1574774831533432, + "rewards/margins": 0.07581701874732971, + "rewards/rejected": -0.23329448699951172, + "step": 54 + }, + { + "epoch": 0.07, + "learning_rate": 2.7500000000000004e-05, + "logits/chosen": -2.0303847789764404, + "logits/rejected": -2.0456559658050537, + "logps/chosen": -169.33453369140625, + "logps/rejected": -166.40707397460938, + "loss": 0.6479, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14774441719055176, + "rewards/margins": 0.1020236536860466, + "rewards/rejected": -0.24976806342601776, + "step": 55 + }, + { + "epoch": 0.07, + "learning_rate": 2.8000000000000003e-05, + "logits/chosen": -2.035883903503418, + "logits/rejected": -1.9933511018753052, + "logps/chosen": -190.82186889648438, + "logps/rejected": -190.29147338867188, + "loss": 0.7023, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.11226066946983337, + "rewards/margins": -0.01058507151901722, + "rewards/rejected": -0.1016756072640419, + "step": 56 + }, + { + "epoch": 0.07, + "learning_rate": 2.8499999999999998e-05, + "logits/chosen": -1.9773008823394775, + "logits/rejected": -1.935595989227295, + "logps/chosen": -180.8785400390625, + "logps/rejected": -193.48155212402344, + "loss": 0.6739, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2835652828216553, + "rewards/margins": 0.048453718423843384, + "rewards/rejected": -0.33201897144317627, + "step": 57 + }, + { + "epoch": 0.08, + "learning_rate": 2.9e-05, + "logits/chosen": -1.681299090385437, + "logits/rejected": -1.6699227094650269, + "logps/chosen": -165.8355255126953, + "logps/rejected": -191.2743377685547, + "loss": 0.7356, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.17172232270240784, + "rewards/margins": -0.07655029743909836, + "rewards/rejected": -0.09517201036214828, + "step": 58 + }, + { + "epoch": 0.08, + "learning_rate": 2.95e-05, + "logits/chosen": -1.71488356590271, + "logits/rejected": -1.7937383651733398, + "logps/chosen": -177.51776123046875, + "logps/rejected": -188.60760498046875, + "loss": 0.6358, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1568491905927658, + "rewards/margins": 0.12602630257606506, + "rewards/rejected": -0.2828754782676697, + "step": 59 + }, + { + "epoch": 0.08, + "learning_rate": 3e-05, + "logits/chosen": -2.0625619888305664, + "logits/rejected": -2.1217410564422607, + "logps/chosen": -190.26431274414062, + "logps/rejected": -215.28648376464844, + "loss": 0.7777, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.3512558341026306, + "rewards/margins": -0.13770633935928345, + "rewards/rejected": -0.21354950964450836, + "step": 60 + }, + { + "epoch": 0.08, + "learning_rate": 3.05e-05, + "logits/chosen": -1.5000535249710083, + "logits/rejected": -1.508121371269226, + "logps/chosen": -167.42588806152344, + "logps/rejected": -151.4751739501953, + "loss": 0.692, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.15655343234539032, + "rewards/margins": 0.01617264747619629, + "rewards/rejected": -0.1727260947227478, + "step": 61 + }, + { + "epoch": 0.08, + "learning_rate": 3.1e-05, + "logits/chosen": -1.628549337387085, + "logits/rejected": -1.7045596837997437, + "logps/chosen": -206.2198944091797, + "logps/rejected": -188.40350341796875, + "loss": 0.7688, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.2638259530067444, + "rewards/margins": -0.12546539306640625, + "rewards/rejected": -0.13836055994033813, + "step": 62 + }, + { + "epoch": 0.08, + "learning_rate": 3.15e-05, + "logits/chosen": -1.6634955406188965, + "logits/rejected": -1.641862154006958, + "logps/chosen": -173.11769104003906, + "logps/rejected": -179.14816284179688, + "loss": 0.6446, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1516265571117401, + "rewards/margins": 0.10875654965639114, + "rewards/rejected": -0.26038309931755066, + "step": 63 + }, + { + "epoch": 0.08, + "learning_rate": 3.2000000000000005e-05, + "logits/chosen": -1.90239417552948, + "logits/rejected": -1.846576452255249, + "logps/chosen": -185.16522216796875, + "logps/rejected": -187.98654174804688, + "loss": 0.696, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.26641684770584106, + "rewards/margins": 0.004999059252440929, + "rewards/rejected": -0.27141591906547546, + "step": 64 + }, + { + "epoch": 0.09, + "learning_rate": 3.2500000000000004e-05, + "logits/chosen": -1.9015129804611206, + "logits/rejected": -1.9043163061141968, + "logps/chosen": -166.89393615722656, + "logps/rejected": -185.8280487060547, + "loss": 0.7096, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.3341715931892395, + "rewards/margins": -0.005350928753614426, + "rewards/rejected": -0.32882067561149597, + "step": 65 + }, + { + "epoch": 0.09, + "learning_rate": 3.3e-05, + "logits/chosen": -1.7935004234313965, + "logits/rejected": -1.8295319080352783, + "logps/chosen": -145.04159545898438, + "logps/rejected": -152.42388916015625, + "loss": 0.7515, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.3907526433467865, + "rewards/margins": -0.10265941917896271, + "rewards/rejected": -0.2880932092666626, + "step": 66 + }, + { + "epoch": 0.09, + "learning_rate": 3.35e-05, + "logits/chosen": -1.6959490776062012, + "logits/rejected": -1.680245280265808, + "logps/chosen": -165.76852416992188, + "logps/rejected": -161.328857421875, + "loss": 0.7212, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3491905629634857, + "rewards/margins": -0.03104216605424881, + "rewards/rejected": -0.3181484043598175, + "step": 67 + }, + { + "epoch": 0.09, + "learning_rate": 3.4000000000000007e-05, + "logits/chosen": -1.5831743478775024, + "logits/rejected": -1.5631486177444458, + "logps/chosen": -182.02952575683594, + "logps/rejected": -183.06907653808594, + "loss": 0.6674, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.22372691333293915, + "rewards/margins": 0.07622986286878586, + "rewards/rejected": -0.2999567687511444, + "step": 68 + }, + { + "epoch": 0.09, + "learning_rate": 3.45e-05, + "logits/chosen": -1.7518221139907837, + "logits/rejected": -1.7015327215194702, + "logps/chosen": -181.39324951171875, + "logps/rejected": -181.88720703125, + "loss": 0.6668, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.3097344934940338, + "rewards/margins": 0.0728234276175499, + "rewards/rejected": -0.3825579285621643, + "step": 69 + }, + { + "epoch": 0.09, + "learning_rate": 3.5e-05, + "logits/chosen": -1.8494486808776855, + "logits/rejected": -1.80762779712677, + "logps/chosen": -181.36915588378906, + "logps/rejected": -186.561279296875, + "loss": 0.7449, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.47243672609329224, + "rewards/margins": -0.07020688056945801, + "rewards/rejected": -0.40222981572151184, + "step": 70 + }, + { + "epoch": 0.09, + "learning_rate": 3.55e-05, + "logits/chosen": -1.701026201248169, + "logits/rejected": -1.6742680072784424, + "logps/chosen": -157.6632843017578, + "logps/rejected": -161.87045288085938, + "loss": 0.6544, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.27613064646720886, + "rewards/margins": 0.10120917111635208, + "rewards/rejected": -0.37733981013298035, + "step": 71 + }, + { + "epoch": 0.09, + "learning_rate": 3.6e-05, + "logits/chosen": -1.7675864696502686, + "logits/rejected": -1.8016642332077026, + "logps/chosen": -185.2778778076172, + "logps/rejected": -212.45452880859375, + "loss": 0.6621, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.3811972141265869, + "rewards/margins": 0.07623375207185745, + "rewards/rejected": -0.45743098855018616, + "step": 72 + }, + { + "epoch": 0.1, + "learning_rate": 3.65e-05, + "logits/chosen": -1.8149230480194092, + "logits/rejected": -1.8410683870315552, + "logps/chosen": -167.20913696289062, + "logps/rejected": -197.37989807128906, + "loss": 0.7319, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.48692482709884644, + "rewards/margins": -0.050217654556035995, + "rewards/rejected": -0.43670713901519775, + "step": 73 + }, + { + "epoch": 0.1, + "learning_rate": 3.7e-05, + "logits/chosen": -1.8524658679962158, + "logits/rejected": -1.8538126945495605, + "logps/chosen": -164.4295654296875, + "logps/rejected": -180.02191162109375, + "loss": 0.7363, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.4177883267402649, + "rewards/margins": -0.050662752240896225, + "rewards/rejected": -0.36712557077407837, + "step": 74 + }, + { + "epoch": 0.1, + "learning_rate": 3.7500000000000003e-05, + "logits/chosen": -1.2946817874908447, + "logits/rejected": -1.3125836849212646, + "logps/chosen": -225.81192016601562, + "logps/rejected": -252.005859375, + "loss": 0.7049, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.37055703997612, + "rewards/margins": 0.016286462545394897, + "rewards/rejected": -0.3868435025215149, + "step": 75 + }, + { + "epoch": 0.1, + "learning_rate": 3.8e-05, + "logits/chosen": -1.7864320278167725, + "logits/rejected": -1.8551700115203857, + "logps/chosen": -176.35296630859375, + "logps/rejected": -182.03231811523438, + "loss": 0.7029, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.49798864126205444, + "rewards/margins": 0.02277611568570137, + "rewards/rejected": -0.5207647681236267, + "step": 76 + }, + { + "epoch": 0.1, + "learning_rate": 3.85e-05, + "logits/chosen": -1.7414928674697876, + "logits/rejected": -1.7363300323486328, + "logps/chosen": -191.74154663085938, + "logps/rejected": -199.81724548339844, + "loss": 0.6928, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.4908958375453949, + "rewards/margins": 0.02561158686876297, + "rewards/rejected": -0.5165074467658997, + "step": 77 + }, + { + "epoch": 0.1, + "learning_rate": 3.9000000000000006e-05, + "logits/chosen": -1.835726261138916, + "logits/rejected": -1.840294599533081, + "logps/chosen": -191.46029663085938, + "logps/rejected": -168.4354248046875, + "loss": 0.7986, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5988375544548035, + "rewards/margins": -0.1457357108592987, + "rewards/rejected": -0.45310184359550476, + "step": 78 + }, + { + "epoch": 0.1, + "learning_rate": 3.9500000000000005e-05, + "logits/chosen": -1.6384897232055664, + "logits/rejected": -1.5781760215759277, + "logps/chosen": -217.35772705078125, + "logps/rejected": -217.51126098632812, + "loss": 0.7354, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6283512115478516, + "rewards/margins": -0.027955979108810425, + "rewards/rejected": -0.6003952026367188, + "step": 79 + }, + { + "epoch": 0.1, + "learning_rate": 4e-05, + "logits/chosen": -1.7660375833511353, + "logits/rejected": -1.7639034986495972, + "logps/chosen": -152.1668243408203, + "logps/rejected": -141.91021728515625, + "loss": 0.7167, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5395210385322571, + "rewards/margins": -0.012110946699976921, + "rewards/rejected": -0.527410089969635, + "step": 80 + }, + { + "epoch": 0.11, + "learning_rate": 4.05e-05, + "logits/chosen": -1.5262843370437622, + "logits/rejected": -1.534919261932373, + "logps/chosen": -182.58859252929688, + "logps/rejected": -200.82281494140625, + "loss": 0.6912, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6189998388290405, + "rewards/margins": 0.02953934296965599, + "rewards/rejected": -0.6485391855239868, + "step": 81 + }, + { + "epoch": 0.11, + "learning_rate": 4.1e-05, + "logits/chosen": -1.6617029905319214, + "logits/rejected": -1.6961578130722046, + "logps/chosen": -177.42286682128906, + "logps/rejected": -168.7061767578125, + "loss": 0.7955, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7245805859565735, + "rewards/margins": -0.15311545133590698, + "rewards/rejected": -0.5714651942253113, + "step": 82 + }, + { + "epoch": 0.11, + "learning_rate": 4.15e-05, + "logits/chosen": -1.6676827669143677, + "logits/rejected": -1.5490450859069824, + "logps/chosen": -171.3192596435547, + "logps/rejected": -173.60470581054688, + "loss": 0.7403, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5414891242980957, + "rewards/margins": -0.05256550386548042, + "rewards/rejected": -0.4889236092567444, + "step": 83 + }, + { + "epoch": 0.11, + "learning_rate": 4.2e-05, + "logits/chosen": -1.8843668699264526, + "logits/rejected": -1.9513225555419922, + "logps/chosen": -172.60548400878906, + "logps/rejected": -164.8987579345703, + "loss": 0.7633, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5382730960845947, + "rewards/margins": -0.10875138640403748, + "rewards/rejected": -0.42952167987823486, + "step": 84 + }, + { + "epoch": 0.11, + "learning_rate": 4.25e-05, + "logits/chosen": -1.9940603971481323, + "logits/rejected": -1.9973390102386475, + "logps/chosen": -161.12863159179688, + "logps/rejected": -164.32958984375, + "loss": 0.7724, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5231802463531494, + "rewards/margins": -0.11174039542675018, + "rewards/rejected": -0.41143983602523804, + "step": 85 + }, + { + "epoch": 0.11, + "learning_rate": 4.3e-05, + "logits/chosen": -1.8103983402252197, + "logits/rejected": -1.7585985660552979, + "logps/chosen": -195.51132202148438, + "logps/rejected": -189.4246826171875, + "loss": 0.747, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.7077435255050659, + "rewards/margins": -0.08058477193117142, + "rewards/rejected": -0.6271587610244751, + "step": 86 + }, + { + "epoch": 0.11, + "learning_rate": 4.35e-05, + "logits/chosen": -1.9289149045944214, + "logits/rejected": -1.982129693031311, + "logps/chosen": -170.78253173828125, + "logps/rejected": -172.57882690429688, + "loss": 0.8229, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.6636737585067749, + "rewards/margins": -0.2039366364479065, + "rewards/rejected": -0.4597371816635132, + "step": 87 + }, + { + "epoch": 0.12, + "learning_rate": 4.4000000000000006e-05, + "logits/chosen": -1.875556230545044, + "logits/rejected": -1.8859914541244507, + "logps/chosen": -160.60577392578125, + "logps/rejected": -171.32774353027344, + "loss": 0.6905, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.3266763389110565, + "rewards/margins": 0.03434550017118454, + "rewards/rejected": -0.36102184653282166, + "step": 88 + }, + { + "epoch": 0.12, + "learning_rate": 4.4500000000000004e-05, + "logits/chosen": -1.7824954986572266, + "logits/rejected": -1.7617722749710083, + "logps/chosen": -177.59042358398438, + "logps/rejected": -200.9052276611328, + "loss": 0.6799, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.4084916114807129, + "rewards/margins": 0.0701964944601059, + "rewards/rejected": -0.47868812084198, + "step": 89 + }, + { + "epoch": 0.12, + "learning_rate": 4.5e-05, + "logits/chosen": -1.7419114112854004, + "logits/rejected": -1.7621021270751953, + "logps/chosen": -185.32742309570312, + "logps/rejected": -171.69085693359375, + "loss": 0.8305, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.3024221956729889, + "rewards/margins": -0.20739878714084625, + "rewards/rejected": -0.09502339363098145, + "step": 90 + }, + { + "epoch": 0.12, + "learning_rate": 4.55e-05, + "logits/chosen": -1.83467435836792, + "logits/rejected": -1.8328973054885864, + "logps/chosen": -207.63388061523438, + "logps/rejected": -210.3101043701172, + "loss": 0.6628, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.3415074646472931, + "rewards/margins": 0.07163538783788681, + "rewards/rejected": -0.4131428897380829, + "step": 91 + }, + { + "epoch": 0.12, + "learning_rate": 4.600000000000001e-05, + "logits/chosen": -1.6864506006240845, + "logits/rejected": -1.7266168594360352, + "logps/chosen": -180.49522399902344, + "logps/rejected": -198.40599060058594, + "loss": 0.6523, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.27834925055503845, + "rewards/margins": 0.10665541142225266, + "rewards/rejected": -0.3850046396255493, + "step": 92 + }, + { + "epoch": 0.12, + "learning_rate": 4.6500000000000005e-05, + "logits/chosen": -1.4600155353546143, + "logits/rejected": -1.4545408487319946, + "logps/chosen": -191.01947021484375, + "logps/rejected": -180.70298767089844, + "loss": 0.7165, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1802830994129181, + "rewards/margins": -0.027431445196270943, + "rewards/rejected": -0.1528516411781311, + "step": 93 + }, + { + "epoch": 0.12, + "learning_rate": 4.7e-05, + "logits/chosen": -1.8134263753890991, + "logits/rejected": -1.8928412199020386, + "logps/chosen": -184.84503173828125, + "logps/rejected": -193.90377807617188, + "loss": 0.6644, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3032827377319336, + "rewards/margins": 0.0916454941034317, + "rewards/rejected": -0.3949282467365265, + "step": 94 + }, + { + "epoch": 0.12, + "learning_rate": 4.75e-05, + "logits/chosen": -1.8253490924835205, + "logits/rejected": -1.810834527015686, + "logps/chosen": -208.2128448486328, + "logps/rejected": -175.4803009033203, + "loss": 0.7444, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5496629476547241, + "rewards/margins": -0.08057989180088043, + "rewards/rejected": -0.4690830111503601, + "step": 95 + }, + { + "epoch": 0.13, + "learning_rate": 4.8e-05, + "logits/chosen": -1.908506989479065, + "logits/rejected": -1.9602031707763672, + "logps/chosen": -184.815185546875, + "logps/rejected": -176.07138061523438, + "loss": 0.6982, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3538370728492737, + "rewards/margins": 0.013028910383582115, + "rewards/rejected": -0.36686599254608154, + "step": 96 + }, + { + "epoch": 0.13, + "learning_rate": 4.85e-05, + "logits/chosen": -1.7022223472595215, + "logits/rejected": -1.6424753665924072, + "logps/chosen": -198.0614776611328, + "logps/rejected": -215.3161163330078, + "loss": 0.7737, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.2290499359369278, + "rewards/margins": -0.12730106711387634, + "rewards/rejected": -0.10174884647130966, + "step": 97 + }, + { + "epoch": 0.13, + "learning_rate": 4.9e-05, + "logits/chosen": -2.018606662750244, + "logits/rejected": -2.027151584625244, + "logps/chosen": -167.92147827148438, + "logps/rejected": -166.90982055664062, + "loss": 0.6961, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3297194838523865, + "rewards/margins": 0.02775608003139496, + "rewards/rejected": -0.35747551918029785, + "step": 98 + }, + { + "epoch": 0.13, + "learning_rate": 4.9500000000000004e-05, + "logits/chosen": -1.7240030765533447, + "logits/rejected": -1.7241712808609009, + "logps/chosen": -180.00389099121094, + "logps/rejected": -189.3558349609375, + "loss": 0.7948, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.4574447572231293, + "rewards/margins": -0.12154103070497513, + "rewards/rejected": -0.33590370416641235, + "step": 99 + }, + { + "epoch": 0.13, + "learning_rate": 5e-05, + "logits/chosen": -1.7809938192367554, + "logits/rejected": -1.9036985635757446, + "logps/chosen": -157.1652069091797, + "logps/rejected": -173.80288696289062, + "loss": 0.6839, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2328178882598877, + "rewards/margins": 0.05165515094995499, + "rewards/rejected": -0.2844730615615845, + "step": 100 + }, + { + "epoch": 0.13, + "learning_rate": 4.999997432392803e-05, + "logits/chosen": -1.9480023384094238, + "logits/rejected": -1.9346106052398682, + "logps/chosen": -197.60128784179688, + "logps/rejected": -193.88124084472656, + "loss": 0.6427, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.26800671219825745, + "rewards/margins": 0.13769717514514923, + "rewards/rejected": -0.40570390224456787, + "step": 101 + }, + { + "epoch": 0.13, + "learning_rate": 4.9999897295764844e-05, + "logits/chosen": -2.048476457595825, + "logits/rejected": -2.0353472232818604, + "logps/chosen": -180.3016357421875, + "logps/rejected": -180.51600646972656, + "loss": 0.6366, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.35791218280792236, + "rewards/margins": 0.15290379524230957, + "rewards/rejected": -0.5108159780502319, + "step": 102 + }, + { + "epoch": 0.13, + "learning_rate": 4.9999768915668665e-05, + "logits/chosen": -1.9134316444396973, + "logits/rejected": -1.8900353908538818, + "logps/chosen": -152.71189880371094, + "logps/rejected": -153.96690368652344, + "loss": 0.6941, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.18337132036685944, + "rewards/margins": 0.03930587321519852, + "rewards/rejected": -0.22267718613147736, + "step": 103 + }, + { + "epoch": 0.14, + "learning_rate": 4.999958918390321e-05, + "logits/chosen": -1.8933653831481934, + "logits/rejected": -1.8493753671646118, + "logps/chosen": -188.6655731201172, + "logps/rejected": -183.1341552734375, + "loss": 0.7168, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.38441047072410583, + "rewards/margins": -0.010426240041851997, + "rewards/rejected": -0.3739842474460602, + "step": 104 + }, + { + "epoch": 0.14, + "learning_rate": 4.999935810083766e-05, + "logits/chosen": -1.7264684438705444, + "logits/rejected": -1.694319486618042, + "logps/chosen": -156.22084045410156, + "logps/rejected": -152.80894470214844, + "loss": 0.6763, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.28453487157821655, + "rewards/margins": 0.052657999098300934, + "rewards/rejected": -0.3371928334236145, + "step": 105 + }, + { + "epoch": 0.14, + "learning_rate": 4.999907566694667e-05, + "logits/chosen": -1.8885689973831177, + "logits/rejected": -1.9364815950393677, + "logps/chosen": -167.39117431640625, + "logps/rejected": -191.052001953125, + "loss": 0.6963, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.21066321432590485, + "rewards/margins": 0.03920959681272507, + "rewards/rejected": -0.2498728483915329, + "step": 106 + }, + { + "epoch": 0.14, + "learning_rate": 4.9998741882810384e-05, + "logits/chosen": -1.7698872089385986, + "logits/rejected": -1.7441281080245972, + "logps/chosen": -178.68572998046875, + "logps/rejected": -174.1964569091797, + "loss": 0.7459, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.24953651428222656, + "rewards/margins": -0.08057431131601334, + "rewards/rejected": -0.16896219551563263, + "step": 107 + }, + { + "epoch": 0.14, + "learning_rate": 4.999835674911443e-05, + "logits/chosen": -1.812888741493225, + "logits/rejected": -1.7789666652679443, + "logps/chosen": -228.23631286621094, + "logps/rejected": -203.85357666015625, + "loss": 0.6838, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.10872795432806015, + "rewards/margins": 0.05488254129886627, + "rewards/rejected": -0.16361048817634583, + "step": 108 + }, + { + "epoch": 0.14, + "learning_rate": 4.999792026664991e-05, + "logits/chosen": -1.6739952564239502, + "logits/rejected": -1.6701843738555908, + "logps/chosen": -203.80810546875, + "logps/rejected": -211.82334899902344, + "loss": 0.6853, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.3305152356624603, + "rewards/margins": 0.05953298509120941, + "rewards/rejected": -0.39004823565483093, + "step": 109 + }, + { + "epoch": 0.14, + "learning_rate": 4.9997432436313384e-05, + "logits/chosen": -1.6255407333374023, + "logits/rejected": -1.586310625076294, + "logps/chosen": -165.31040954589844, + "logps/rejected": -186.821044921875, + "loss": 0.646, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.3459918200969696, + "rewards/margins": 0.1395949125289917, + "rewards/rejected": -0.4855867028236389, + "step": 110 + }, + { + "epoch": 0.15, + "learning_rate": 4.99968932591069e-05, + "logits/chosen": -1.8984012603759766, + "logits/rejected": -1.8569310903549194, + "logps/chosen": -182.6356201171875, + "logps/rejected": -176.14752197265625, + "loss": 0.7585, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.3858329653739929, + "rewards/margins": -0.05486001819372177, + "rewards/rejected": -0.33097296953201294, + "step": 111 + }, + { + "epoch": 0.15, + "learning_rate": 4.999630273613799e-05, + "logits/chosen": -1.8298226594924927, + "logits/rejected": -1.812253475189209, + "logps/chosen": -163.0826416015625, + "logps/rejected": -200.62098693847656, + "loss": 0.7425, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.2680785357952118, + "rewards/margins": -0.04629645124077797, + "rewards/rejected": -0.22178205847740173, + "step": 112 + }, + { + "epoch": 0.15, + "learning_rate": 4.999566086861961e-05, + "logits/chosen": -1.6931625604629517, + "logits/rejected": -1.7089687585830688, + "logps/chosen": -147.44491577148438, + "logps/rejected": -150.4454345703125, + "loss": 0.7146, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.224314883351326, + "rewards/margins": 0.05584639310836792, + "rewards/rejected": -0.2801613211631775, + "step": 113 + }, + { + "epoch": 0.15, + "learning_rate": 4.999496765787024e-05, + "logits/chosen": -1.7311415672302246, + "logits/rejected": -1.612362027168274, + "logps/chosen": -195.65594482421875, + "logps/rejected": -194.56130981445312, + "loss": 0.7017, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.28545114398002625, + "rewards/margins": 0.020438771694898605, + "rewards/rejected": -0.30588990449905396, + "step": 114 + }, + { + "epoch": 0.15, + "learning_rate": 4.9994223105313774e-05, + "logits/chosen": -1.9310710430145264, + "logits/rejected": -1.9658077955245972, + "logps/chosen": -179.00779724121094, + "logps/rejected": -181.42135620117188, + "loss": 0.6605, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.2686367928981781, + "rewards/margins": 0.10176892578601837, + "rewards/rejected": -0.37040573358535767, + "step": 115 + }, + { + "epoch": 0.15, + "learning_rate": 4.9993427212479606e-05, + "logits/chosen": -1.7969621419906616, + "logits/rejected": -1.8055509328842163, + "logps/chosen": -176.26036071777344, + "logps/rejected": -171.2470703125, + "loss": 0.6656, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.33693569898605347, + "rewards/margins": 0.08709227293729782, + "rewards/rejected": -0.4240279793739319, + "step": 116 + }, + { + "epoch": 0.15, + "learning_rate": 4.999257998100254e-05, + "logits/chosen": -1.5152311325073242, + "logits/rejected": -1.553884506225586, + "logps/chosen": -179.60372924804688, + "logps/rejected": -164.34481811523438, + "loss": 0.7563, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3611801564693451, + "rewards/margins": -0.08427368104457855, + "rewards/rejected": -0.27690649032592773, + "step": 117 + }, + { + "epoch": 0.15, + "learning_rate": 4.999168141262289e-05, + "logits/chosen": -1.7877562046051025, + "logits/rejected": -1.7855968475341797, + "logps/chosen": -165.04220581054688, + "logps/rejected": -179.1771240234375, + "loss": 0.7999, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.3364974856376648, + "rewards/margins": -0.1316596120595932, + "rewards/rejected": -0.20483790338039398, + "step": 118 + }, + { + "epoch": 0.16, + "learning_rate": 4.9990731509186376e-05, + "logits/chosen": -1.4499574899673462, + "logits/rejected": -1.4440600872039795, + "logps/chosen": -183.58778381347656, + "logps/rejected": -198.231689453125, + "loss": 0.7216, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3034099340438843, + "rewards/margins": -0.00025239214301109314, + "rewards/rejected": -0.3031575381755829, + "step": 119 + }, + { + "epoch": 0.16, + "learning_rate": 4.998973027264419e-05, + "logits/chosen": -1.6583937406539917, + "logits/rejected": -1.689673900604248, + "logps/chosen": -187.60508728027344, + "logps/rejected": -212.7592315673828, + "loss": 0.6404, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.3053140342235565, + "rewards/margins": 0.17152619361877441, + "rewards/rejected": -0.47684019804000854, + "step": 120 + }, + { + "epoch": 0.16, + "learning_rate": 4.998867770505295e-05, + "logits/chosen": -1.6554609537124634, + "logits/rejected": -1.6350326538085938, + "logps/chosen": -181.46209716796875, + "logps/rejected": -173.83157348632812, + "loss": 0.726, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.24238371849060059, + "rewards/margins": 0.002255776897072792, + "rewards/rejected": -0.24463950097560883, + "step": 121 + }, + { + "epoch": 0.16, + "learning_rate": 4.9987573808574726e-05, + "logits/chosen": -1.8908902406692505, + "logits/rejected": -1.858853816986084, + "logps/chosen": -174.7786865234375, + "logps/rejected": -177.30545043945312, + "loss": 0.7269, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0712527185678482, + "rewards/margins": -0.02764507755637169, + "rewards/rejected": 0.09889779984951019, + "step": 122 + }, + { + "epoch": 0.16, + "learning_rate": 4.9986418585477016e-05, + "logits/chosen": -1.7081693410873413, + "logits/rejected": -1.7438371181488037, + "logps/chosen": -191.28123474121094, + "logps/rejected": -197.5807647705078, + "loss": 0.6859, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2962351143360138, + "rewards/margins": 0.06765662133693695, + "rewards/rejected": -0.36389175057411194, + "step": 123 + }, + { + "epoch": 0.16, + "learning_rate": 4.998521203813274e-05, + "logits/chosen": -1.805833101272583, + "logits/rejected": -1.7511969804763794, + "logps/chosen": -166.08702087402344, + "logps/rejected": -159.9141082763672, + "loss": 0.6387, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.16753454506397247, + "rewards/margins": 0.16043387353420258, + "rewards/rejected": -0.32796838879585266, + "step": 124 + }, + { + "epoch": 0.16, + "learning_rate": 4.9983954169020256e-05, + "logits/chosen": -1.5241700410842896, + "logits/rejected": -1.6004612445831299, + "logps/chosen": -189.8046875, + "logps/rejected": -198.48001098632812, + "loss": 0.7803, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.35226038098335266, + "rewards/margins": -0.09872373938560486, + "rewards/rejected": -0.2535366714000702, + "step": 125 + }, + { + "epoch": 0.16, + "learning_rate": 4.9982644980723334e-05, + "logits/chosen": -1.3276153802871704, + "logits/rejected": -1.372768759727478, + "logps/chosen": -179.91940307617188, + "logps/rejected": -180.60751342773438, + "loss": 0.7245, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.4333783686161041, + "rewards/margins": -0.014091454446315765, + "rewards/rejected": -0.41928690671920776, + "step": 126 + }, + { + "epoch": 0.17, + "learning_rate": 4.998128447593117e-05, + "logits/chosen": -1.5195338726043701, + "logits/rejected": -1.4305707216262817, + "logps/chosen": -179.16712951660156, + "logps/rejected": -162.93356323242188, + "loss": 0.7394, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.3149658441543579, + "rewards/margins": -0.04299226403236389, + "rewards/rejected": -0.2719736099243164, + "step": 127 + }, + { + "epoch": 0.17, + "learning_rate": 4.997987265743834e-05, + "logits/chosen": -1.8512688875198364, + "logits/rejected": -1.791621446609497, + "logps/chosen": -171.71990966796875, + "logps/rejected": -177.26954650878906, + "loss": 0.7217, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.2529350221157074, + "rewards/margins": -0.008312180638313293, + "rewards/rejected": -0.2446228414773941, + "step": 128 + }, + { + "epoch": 0.17, + "learning_rate": 4.997840952814484e-05, + "logits/chosen": -1.8096928596496582, + "logits/rejected": -1.7790553569793701, + "logps/chosen": -174.4140167236328, + "logps/rejected": -176.01708984375, + "loss": 0.7997, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.24751965701580048, + "rewards/margins": -0.1494641900062561, + "rewards/rejected": -0.09805545210838318, + "step": 129 + }, + { + "epoch": 0.17, + "learning_rate": 4.9976895091056075e-05, + "logits/chosen": -1.7724186182022095, + "logits/rejected": -1.777117371559143, + "logps/chosen": -171.97897338867188, + "logps/rejected": -197.2012939453125, + "loss": 0.6544, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3243841528892517, + "rewards/margins": 0.10846509784460068, + "rewards/rejected": -0.432849258184433, + "step": 130 + }, + { + "epoch": 0.17, + "learning_rate": 4.9975329349282826e-05, + "logits/chosen": -1.6474465131759644, + "logits/rejected": -1.632018804550171, + "logps/chosen": -176.29461669921875, + "logps/rejected": -191.7507781982422, + "loss": 0.7379, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5088250637054443, + "rewards/margins": -0.030252262949943542, + "rewards/rejected": -0.4785728454589844, + "step": 131 + }, + { + "epoch": 0.17, + "learning_rate": 4.9973712306041256e-05, + "logits/chosen": -2.033618927001953, + "logits/rejected": -2.0342512130737305, + "logps/chosen": -198.94796752929688, + "logps/rejected": -192.28482055664062, + "loss": 0.826, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.34583836793899536, + "rewards/margins": -0.18120835721492767, + "rewards/rejected": -0.1646299660205841, + "step": 132 + }, + { + "epoch": 0.17, + "learning_rate": 4.997204396465292e-05, + "logits/chosen": -1.5918235778808594, + "logits/rejected": -1.5706799030303955, + "logps/chosen": -157.1723175048828, + "logps/rejected": -182.7075958251953, + "loss": 0.8134, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5003759860992432, + "rewards/margins": -0.07385056465864182, + "rewards/rejected": -0.42652541399002075, + "step": 133 + }, + { + "epoch": 0.18, + "learning_rate": 4.997032432854472e-05, + "logits/chosen": -1.6958372592926025, + "logits/rejected": -1.7072336673736572, + "logps/chosen": -182.0128173828125, + "logps/rejected": -202.56802368164062, + "loss": 0.52, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08301500976085663, + "rewards/margins": 0.4688724875450134, + "rewards/rejected": -0.5518875122070312, + "step": 134 + }, + { + "epoch": 0.18, + "learning_rate": 4.996855340124894e-05, + "logits/chosen": -1.7832672595977783, + "logits/rejected": -1.767440915107727, + "logps/chosen": -193.7747344970703, + "logps/rejected": -197.20289611816406, + "loss": 0.7392, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.39458268880844116, + "rewards/margins": -0.038821250200271606, + "rewards/rejected": -0.35576146841049194, + "step": 135 + }, + { + "epoch": 0.18, + "learning_rate": 4.996673118640323e-05, + "logits/chosen": -1.7633535861968994, + "logits/rejected": -1.7420529127120972, + "logps/chosen": -154.8060760498047, + "logps/rejected": -170.86996459960938, + "loss": 0.6108, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.15521948039531708, + "rewards/margins": 0.2389230877161026, + "rewards/rejected": -0.3941425681114197, + "step": 136 + }, + { + "epoch": 0.18, + "learning_rate": 4.996485768775055e-05, + "logits/chosen": -1.7529706954956055, + "logits/rejected": -1.7141683101654053, + "logps/chosen": -233.09747314453125, + "logps/rejected": -237.85906982421875, + "loss": 0.7094, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4181414544582367, + "rewards/margins": 0.051600366830825806, + "rewards/rejected": -0.4697418212890625, + "step": 137 + }, + { + "epoch": 0.18, + "learning_rate": 4.996293290913926e-05, + "logits/chosen": -1.6701140403747559, + "logits/rejected": -1.684870719909668, + "logps/chosen": -192.6127166748047, + "logps/rejected": -188.32798767089844, + "loss": 0.5773, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0851680114865303, + "rewards/margins": 0.31171905994415283, + "rewards/rejected": -0.39688706398010254, + "step": 138 + }, + { + "epoch": 0.18, + "learning_rate": 4.9960956854522986e-05, + "logits/chosen": -1.7777843475341797, + "logits/rejected": -1.73331618309021, + "logps/chosen": -199.59889221191406, + "logps/rejected": -174.27069091796875, + "loss": 0.785, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5378831028938293, + "rewards/margins": -0.13611721992492676, + "rewards/rejected": -0.4017658531665802, + "step": 139 + }, + { + "epoch": 0.18, + "learning_rate": 4.995892952796074e-05, + "logits/chosen": -1.7307249307632446, + "logits/rejected": -1.7584043741226196, + "logps/chosen": -184.14364624023438, + "logps/rejected": -196.90878295898438, + "loss": 0.6489, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4751961827278137, + "rewards/margins": 0.1301158368587494, + "rewards/rejected": -0.6053119897842407, + "step": 140 + }, + { + "epoch": 0.18, + "learning_rate": 4.995685093361682e-05, + "logits/chosen": -1.770018219947815, + "logits/rejected": -1.832364797592163, + "logps/chosen": -183.0137939453125, + "logps/rejected": -204.83804321289062, + "loss": 0.7497, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4263853132724762, + "rewards/margins": -0.025808706879615784, + "rewards/rejected": -0.4005766212940216, + "step": 141 + }, + { + "epoch": 0.19, + "learning_rate": 4.9954721075760824e-05, + "logits/chosen": -1.7604196071624756, + "logits/rejected": -1.7073791027069092, + "logps/chosen": -191.46923828125, + "logps/rejected": -201.73545837402344, + "loss": 0.8181, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5646533370018005, + "rewards/margins": -0.1740642488002777, + "rewards/rejected": -0.3905891180038452, + "step": 142 + }, + { + "epoch": 0.19, + "learning_rate": 4.995253995876767e-05, + "logits/chosen": -1.5798883438110352, + "logits/rejected": -1.5167430639266968, + "logps/chosen": -235.8765106201172, + "logps/rejected": -222.53204345703125, + "loss": 0.7115, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5441933870315552, + "rewards/margins": 0.09400075674057007, + "rewards/rejected": -0.6381941437721252, + "step": 143 + }, + { + "epoch": 0.19, + "learning_rate": 4.995030758711756e-05, + "logits/chosen": -1.932177186012268, + "logits/rejected": -1.8748353719711304, + "logps/chosen": -182.81915283203125, + "logps/rejected": -170.09844970703125, + "loss": 0.6448, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.48413196206092834, + "rewards/margins": 0.19957152009010315, + "rewards/rejected": -0.6837034821510315, + "step": 144 + }, + { + "epoch": 0.19, + "learning_rate": 4.994802396539598e-05, + "logits/chosen": -1.8008129596710205, + "logits/rejected": -1.7928048372268677, + "logps/chosen": -202.68096923828125, + "logps/rejected": -199.8201904296875, + "loss": 0.8172, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.0157395601272583, + "rewards/margins": -0.10564298927783966, + "rewards/rejected": -0.9100965857505798, + "step": 145 + }, + { + "epoch": 0.19, + "learning_rate": 4.994568909829368e-05, + "logits/chosen": -1.7543656826019287, + "logits/rejected": -1.7212865352630615, + "logps/chosen": -201.3524932861328, + "logps/rejected": -218.5485382080078, + "loss": 0.9316, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.7368103265762329, + "rewards/margins": -0.3777569532394409, + "rewards/rejected": -0.359053373336792, + "step": 146 + }, + { + "epoch": 0.19, + "learning_rate": 4.9943302990606684e-05, + "logits/chosen": -1.7704360485076904, + "logits/rejected": -1.6632460355758667, + "logps/chosen": -187.3475341796875, + "logps/rejected": -180.01144409179688, + "loss": 0.751, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4657554626464844, + "rewards/margins": 0.006175771355628967, + "rewards/rejected": -0.47193124890327454, + "step": 147 + }, + { + "epoch": 0.19, + "learning_rate": 4.994086564723626e-05, + "logits/chosen": -1.9261763095855713, + "logits/rejected": -1.9572409391403198, + "logps/chosen": -171.56101989746094, + "logps/rejected": -182.58717346191406, + "loss": 0.7349, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.7470525503158569, + "rewards/margins": -0.043389588594436646, + "rewards/rejected": -0.7036629915237427, + "step": 148 + }, + { + "epoch": 0.19, + "learning_rate": 4.9938377073188905e-05, + "logits/chosen": -1.9480628967285156, + "logits/rejected": -2.002164363861084, + "logps/chosen": -197.41912841796875, + "logps/rejected": -184.93325805664062, + "loss": 0.813, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6927478909492493, + "rewards/margins": -0.12049313634634018, + "rewards/rejected": -0.5722547769546509, + "step": 149 + }, + { + "epoch": 0.2, + "learning_rate": 4.993583727357638e-05, + "logits/chosen": -1.6262449026107788, + "logits/rejected": -1.640842080116272, + "logps/chosen": -205.38461303710938, + "logps/rejected": -213.60650634765625, + "loss": 0.7821, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.7866306304931641, + "rewards/margins": -0.11763662099838257, + "rewards/rejected": -0.6689940094947815, + "step": 150 + }, + { + "epoch": 0.2, + "learning_rate": 4.993324625361565e-05, + "logits/chosen": -1.8480533361434937, + "logits/rejected": -1.8557144403457642, + "logps/chosen": -158.26290893554688, + "logps/rejected": -169.06105041503906, + "loss": 0.7438, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6200124025344849, + "rewards/margins": -0.012443792074918747, + "rewards/rejected": -0.607568621635437, + "step": 151 + }, + { + "epoch": 0.2, + "learning_rate": 4.993060401862888e-05, + "logits/chosen": -1.8685041666030884, + "logits/rejected": -1.8648606538772583, + "logps/chosen": -176.7852020263672, + "logps/rejected": -183.40328979492188, + "loss": 0.6935, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7892077565193176, + "rewards/margins": 0.08869240432977676, + "rewards/rejected": -0.877900242805481, + "step": 152 + }, + { + "epoch": 0.2, + "learning_rate": 4.9927910574043465e-05, + "logits/chosen": -1.9234154224395752, + "logits/rejected": -1.904573917388916, + "logps/chosen": -159.97625732421875, + "logps/rejected": -152.645263671875, + "loss": 0.7778, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6230319142341614, + "rewards/margins": -0.06979034841060638, + "rewards/rejected": -0.5532415509223938, + "step": 153 + }, + { + "epoch": 0.2, + "learning_rate": 4.992516592539196e-05, + "logits/chosen": -1.6896902322769165, + "logits/rejected": -1.7036737203598022, + "logps/chosen": -148.6313018798828, + "logps/rejected": -164.7644500732422, + "loss": 0.5546, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.29331690073013306, + "rewards/margins": 0.44352981448173523, + "rewards/rejected": -0.7368468046188354, + "step": 154 + }, + { + "epoch": 0.2, + "learning_rate": 4.9922370078312105e-05, + "logits/chosen": -2.013890266418457, + "logits/rejected": -1.9424934387207031, + "logps/chosen": -215.90118408203125, + "logps/rejected": -209.59071350097656, + "loss": 0.4938, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2655085027217865, + "rewards/margins": 0.5123203992843628, + "rewards/rejected": -0.7778289318084717, + "step": 155 + }, + { + "epoch": 0.2, + "learning_rate": 4.991952303854682e-05, + "logits/chosen": -1.8328962326049805, + "logits/rejected": -1.8138638734817505, + "logps/chosen": -170.13475036621094, + "logps/rejected": -176.11810302734375, + "loss": 0.6684, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.3248033821582794, + "rewards/margins": 0.12224595248699188, + "rewards/rejected": -0.4470493197441101, + "step": 156 + }, + { + "epoch": 0.21, + "learning_rate": 4.9916624811944175e-05, + "logits/chosen": -1.9051162004470825, + "logits/rejected": -1.9407715797424316, + "logps/chosen": -177.2139434814453, + "logps/rejected": -185.92947387695312, + "loss": 0.6297, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6396099328994751, + "rewards/margins": 0.1931779384613037, + "rewards/rejected": -0.832787811756134, + "step": 157 + }, + { + "epoch": 0.21, + "learning_rate": 4.991367540445735e-05, + "logits/chosen": -1.7430989742279053, + "logits/rejected": -1.7986749410629272, + "logps/chosen": -199.38021850585938, + "logps/rejected": -195.27647399902344, + "loss": 0.6881, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8835601210594177, + "rewards/margins": 0.10807879269123077, + "rewards/rejected": -0.9916388392448425, + "step": 158 + }, + { + "epoch": 0.21, + "learning_rate": 4.991067482214471e-05, + "logits/chosen": -1.868577241897583, + "logits/rejected": -1.799201488494873, + "logps/chosen": -177.93130493164062, + "logps/rejected": -164.038330078125, + "loss": 0.7179, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.7006940245628357, + "rewards/margins": 0.020756253972649574, + "rewards/rejected": -0.7214502096176147, + "step": 159 + }, + { + "epoch": 0.21, + "learning_rate": 4.9907623071169686e-05, + "logits/chosen": -1.8050721883773804, + "logits/rejected": -1.7359880208969116, + "logps/chosen": -197.66583251953125, + "logps/rejected": -172.34146118164062, + "loss": 0.6379, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7713190317153931, + "rewards/margins": 0.19513630867004395, + "rewards/rejected": -0.966455340385437, + "step": 160 + }, + { + "epoch": 0.21, + "learning_rate": 4.990452015780085e-05, + "logits/chosen": -1.74982750415802, + "logits/rejected": -1.719763159751892, + "logps/chosen": -204.21517944335938, + "logps/rejected": -196.62576293945312, + "loss": 0.7434, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.35949403047561646, + "rewards/margins": 0.03506145626306534, + "rewards/rejected": -0.3945554494857788, + "step": 161 + }, + { + "epoch": 0.21, + "learning_rate": 4.9901366088411846e-05, + "logits/chosen": -1.6477172374725342, + "logits/rejected": -1.622018814086914, + "logps/chosen": -207.30174255371094, + "logps/rejected": -229.1028289794922, + "loss": 0.9207, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.1269769668579102, + "rewards/margins": -0.2236841320991516, + "rewards/rejected": -0.9032928943634033, + "step": 162 + }, + { + "epoch": 0.21, + "learning_rate": 4.98981608694814e-05, + "logits/chosen": -1.8223265409469604, + "logits/rejected": -1.8162957429885864, + "logps/chosen": -171.70675659179688, + "logps/rejected": -171.83108520507812, + "loss": 0.9174, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8722136616706848, + "rewards/margins": -0.21637174487113953, + "rewards/rejected": -0.6558419466018677, + "step": 163 + }, + { + "epoch": 0.21, + "learning_rate": 4.9894904507593316e-05, + "logits/chosen": -1.9424258470535278, + "logits/rejected": -1.828155279159546, + "logps/chosen": -196.1715545654297, + "logps/rejected": -192.08624267578125, + "loss": 0.7506, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5839821696281433, + "rewards/margins": 0.0038854647427797318, + "rewards/rejected": -0.5878676772117615, + "step": 164 + }, + { + "epoch": 0.22, + "learning_rate": 4.989159700943643e-05, + "logits/chosen": -1.623518705368042, + "logits/rejected": -1.679386854171753, + "logps/chosen": -175.6849365234375, + "logps/rejected": -179.52059936523438, + "loss": 0.9491, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.7352603673934937, + "rewards/margins": -0.2515270411968231, + "rewards/rejected": -0.4837333559989929, + "step": 165 + }, + { + "epoch": 0.22, + "learning_rate": 4.988823838180464e-05, + "logits/chosen": -1.997894048690796, + "logits/rejected": -1.9666211605072021, + "logps/chosen": -167.2881622314453, + "logps/rejected": -183.72474670410156, + "loss": 0.6663, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5555349588394165, + "rewards/margins": 0.17635540664196014, + "rewards/rejected": -0.7318904399871826, + "step": 166 + }, + { + "epoch": 0.22, + "learning_rate": 4.988482863159684e-05, + "logits/chosen": -1.8912848234176636, + "logits/rejected": -1.968542218208313, + "logps/chosen": -174.55911254882812, + "logps/rejected": -169.36610412597656, + "loss": 0.7905, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.227503702044487, + "rewards/margins": -0.08394889533519745, + "rewards/rejected": -0.14355483651161194, + "step": 167 + }, + { + "epoch": 0.22, + "learning_rate": 4.988136776581696e-05, + "logits/chosen": -2.151402711868286, + "logits/rejected": -2.186088800430298, + "logps/chosen": -157.3328094482422, + "logps/rejected": -150.9196319580078, + "loss": 0.6989, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.29456794261932373, + "rewards/margins": 0.04511295258998871, + "rewards/rejected": -0.33968091011047363, + "step": 168 + }, + { + "epoch": 0.22, + "learning_rate": 4.9877855791573915e-05, + "logits/chosen": -1.8164244890213013, + "logits/rejected": -1.8539032936096191, + "logps/chosen": -188.2481689453125, + "logps/rejected": -171.02090454101562, + "loss": 0.9341, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5969239473342896, + "rewards/margins": -0.33455953001976013, + "rewards/rejected": -0.2623644471168518, + "step": 169 + }, + { + "epoch": 0.22, + "learning_rate": 4.9874292716081595e-05, + "logits/chosen": -1.7419726848602295, + "logits/rejected": -1.7444337606430054, + "logps/chosen": -169.46658325195312, + "logps/rejected": -173.33348083496094, + "loss": 0.6817, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.4356076121330261, + "rewards/margins": 0.14688673615455627, + "rewards/rejected": -0.5824943780899048, + "step": 170 + }, + { + "epoch": 0.22, + "learning_rate": 4.9870678546658865e-05, + "logits/chosen": -1.6884466409683228, + "logits/rejected": -1.7295485734939575, + "logps/chosen": -160.8187713623047, + "logps/rejected": -176.16746520996094, + "loss": 0.9101, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.6096308827400208, + "rewards/margins": -0.28187263011932373, + "rewards/rejected": -0.3277583122253418, + "step": 171 + }, + { + "epoch": 0.23, + "learning_rate": 4.9867013290729535e-05, + "logits/chosen": -1.932777762413025, + "logits/rejected": -1.8993282318115234, + "logps/chosen": -198.2738037109375, + "logps/rejected": -197.27252197265625, + "loss": 0.7183, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.19178181886672974, + "rewards/margins": 0.006053738296031952, + "rewards/rejected": -0.1978355497121811, + "step": 172 + }, + { + "epoch": 0.23, + "learning_rate": 4.986329695582237e-05, + "logits/chosen": -2.03489351272583, + "logits/rejected": -2.076827049255371, + "logps/chosen": -179.49679565429688, + "logps/rejected": -177.2965545654297, + "loss": 0.7798, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6112560629844666, + "rewards/margins": -0.09085651487112045, + "rewards/rejected": -0.5203995704650879, + "step": 173 + }, + { + "epoch": 0.23, + "learning_rate": 4.985952954957103e-05, + "logits/chosen": -1.8877630233764648, + "logits/rejected": -1.8290894031524658, + "logps/chosen": -233.42221069335938, + "logps/rejected": -228.90179443359375, + "loss": 0.7997, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.4796496629714966, + "rewards/margins": 0.0218522846698761, + "rewards/rejected": -0.5015019774436951, + "step": 174 + }, + { + "epoch": 0.23, + "learning_rate": 4.985571107971408e-05, + "logits/chosen": -1.8358758687973022, + "logits/rejected": -1.8217942714691162, + "logps/chosen": -173.35556030273438, + "logps/rejected": -176.17031860351562, + "loss": 0.6678, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03331407904624939, + "rewards/margins": 0.18146347999572754, + "rewards/rejected": -0.14814940094947815, + "step": 175 + }, + { + "epoch": 0.23, + "learning_rate": 4.9851841554095e-05, + "logits/chosen": -1.9195314645767212, + "logits/rejected": -1.9220951795578003, + "logps/chosen": -233.7957305908203, + "logps/rejected": -216.81581115722656, + "loss": 0.5707, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0034320950508117676, + "rewards/margins": 0.3490698039531708, + "rewards/rejected": -0.35250189900398254, + "step": 176 + }, + { + "epoch": 0.23, + "learning_rate": 4.9847920980662134e-05, + "logits/chosen": -1.598222255706787, + "logits/rejected": -1.591965913772583, + "logps/chosen": -226.130615234375, + "logps/rejected": -236.63760375976562, + "loss": 0.6588, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.06091824173927307, + "rewards/margins": 0.13529105484485626, + "rewards/rejected": -0.07437281310558319, + "step": 177 + }, + { + "epoch": 0.23, + "learning_rate": 4.984394936746865e-05, + "logits/chosen": -1.7949796915054321, + "logits/rejected": -1.805631399154663, + "logps/chosen": -232.09498596191406, + "logps/rejected": -235.5016632080078, + "loss": 0.709, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5284535884857178, + "rewards/margins": 0.1780376434326172, + "rewards/rejected": -0.706491231918335, + "step": 178 + }, + { + "epoch": 0.23, + "learning_rate": 4.98399267226726e-05, + "logits/chosen": -2.0285134315490723, + "logits/rejected": -2.060884714126587, + "logps/chosen": -183.62449645996094, + "logps/rejected": -172.28872680664062, + "loss": 0.7795, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.42201852798461914, + "rewards/margins": -0.029273340478539467, + "rewards/rejected": -0.3927451968193054, + "step": 179 + }, + { + "epoch": 0.24, + "learning_rate": 4.9835853054536846e-05, + "logits/chosen": -1.7763793468475342, + "logits/rejected": -1.8531831502914429, + "logps/chosen": -182.07516479492188, + "logps/rejected": -205.60504150390625, + "loss": 0.9431, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.2773244380950928, + "rewards/margins": -0.36518311500549316, + "rewards/rejected": 0.08785867691040039, + "step": 180 + }, + { + "epoch": 0.24, + "learning_rate": 4.9831728371429046e-05, + "logits/chosen": -1.8723288774490356, + "logits/rejected": -1.9752717018127441, + "logps/chosen": -200.94346618652344, + "logps/rejected": -210.53933715820312, + "loss": 0.7466, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.15453863143920898, + "rewards/margins": -0.037006717175245285, + "rewards/rejected": -0.117531917989254, + "step": 181 + }, + { + "epoch": 0.24, + "learning_rate": 4.982755268182164e-05, + "logits/chosen": -1.7186784744262695, + "logits/rejected": -1.752870798110962, + "logps/chosen": -168.73394775390625, + "logps/rejected": -177.32054138183594, + "loss": 0.7175, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.1720157265663147, + "rewards/margins": -0.019572071731090546, + "rewards/rejected": -0.15244367718696594, + "step": 182 + }, + { + "epoch": 0.24, + "learning_rate": 4.982332599429187e-05, + "logits/chosen": -1.9437085390090942, + "logits/rejected": -1.9051276445388794, + "logps/chosen": -168.2744903564453, + "logps/rejected": -158.18289184570312, + "loss": 0.8465, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.40080204606056213, + "rewards/margins": -0.24280087649822235, + "rewards/rejected": -0.15800118446350098, + "step": 183 + }, + { + "epoch": 0.24, + "learning_rate": 4.981904831752171e-05, + "logits/chosen": -1.9985157251358032, + "logits/rejected": -1.9973095655441284, + "logps/chosen": -182.0140380859375, + "logps/rejected": -169.61883544921875, + "loss": 0.6244, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.13748487830162048, + "rewards/margins": 0.31921717524528503, + "rewards/rejected": -0.4567020535469055, + "step": 184 + }, + { + "epoch": 0.24, + "learning_rate": 4.981471966029787e-05, + "logits/chosen": -1.9101628065109253, + "logits/rejected": -1.9257937669754028, + "logps/chosen": -192.24288940429688, + "logps/rejected": -188.32144165039062, + "loss": 0.6148, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.2866635024547577, + "rewards/margins": 0.2923651337623596, + "rewards/rejected": -0.00570157915353775, + "step": 185 + }, + { + "epoch": 0.24, + "learning_rate": 4.981034003151178e-05, + "logits/chosen": -1.8881627321243286, + "logits/rejected": -1.9550986289978027, + "logps/chosen": -195.71372985839844, + "logps/rejected": -210.1089324951172, + "loss": 0.8689, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.2077367603778839, + "rewards/margins": -0.24497191607952118, + "rewards/rejected": 0.03723515570163727, + "step": 186 + }, + { + "epoch": 0.24, + "learning_rate": 4.980590944015958e-05, + "logits/chosen": -1.7422230243682861, + "logits/rejected": -1.7998031377792358, + "logps/chosen": -216.39398193359375, + "logps/rejected": -222.33880615234375, + "loss": 0.7159, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.11723195761442184, + "rewards/margins": 0.26700979471206665, + "rewards/rejected": -0.14977779984474182, + "step": 187 + }, + { + "epoch": 0.25, + "learning_rate": 4.98014278953421e-05, + "logits/chosen": -1.8071612119674683, + "logits/rejected": -1.8180828094482422, + "logps/chosen": -176.4374542236328, + "logps/rejected": -189.68438720703125, + "loss": 0.8578, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.02567705512046814, + "rewards/margins": -0.21896688640117645, + "rewards/rejected": 0.2446439117193222, + "step": 188 + }, + { + "epoch": 0.25, + "learning_rate": 4.979689540626479e-05, + "logits/chosen": -1.7617331743240356, + "logits/rejected": -1.8094758987426758, + "logps/chosen": -158.13441467285156, + "logps/rejected": -177.64797973632812, + "loss": 0.5662, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3027116656303406, + "rewards/margins": 0.35941970348358154, + "rewards/rejected": -0.05670810118317604, + "step": 189 + }, + { + "epoch": 0.25, + "learning_rate": 4.9792311982237774e-05, + "logits/chosen": -1.528577208518982, + "logits/rejected": -1.5104334354400635, + "logps/chosen": -164.3386993408203, + "logps/rejected": -167.84840393066406, + "loss": 0.7506, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.04946248233318329, + "rewards/margins": -0.02245260775089264, + "rewards/rejected": -0.027009889483451843, + "step": 190 + }, + { + "epoch": 0.25, + "learning_rate": 4.9787677632675825e-05, + "logits/chosen": -1.86935555934906, + "logits/rejected": -1.8780128955841064, + "logps/chosen": -167.26119995117188, + "logps/rejected": -187.39584350585938, + "loss": 0.6634, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.09376392513513565, + "rewards/margins": 0.16032886505126953, + "rewards/rejected": -0.06656493991613388, + "step": 191 + }, + { + "epoch": 0.25, + "learning_rate": 4.978299236709826e-05, + "logits/chosen": -1.79931640625, + "logits/rejected": -1.7840967178344727, + "logps/chosen": -173.83987426757812, + "logps/rejected": -174.4556427001953, + "loss": 0.8509, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.17128746211528778, + "rewards/margins": -0.21691852807998657, + "rewards/rejected": 0.04563106596469879, + "step": 192 + }, + { + "epoch": 0.25, + "learning_rate": 4.977825619512904e-05, + "logits/chosen": -1.9714162349700928, + "logits/rejected": -1.9255212545394897, + "logps/chosen": -216.35391235351562, + "logps/rejected": -213.25570678710938, + "loss": 0.787, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.07267741113901138, + "rewards/margins": -0.07426212728023529, + "rewards/rejected": 0.0015847217291593552, + "step": 193 + }, + { + "epoch": 0.25, + "learning_rate": 4.977346912649666e-05, + "logits/chosen": -1.8389030694961548, + "logits/rejected": -1.872612476348877, + "logps/chosen": -199.1173095703125, + "logps/rejected": -170.51205444335938, + "loss": 0.6682, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.054506294429302216, + "rewards/margins": 0.1410951465368271, + "rewards/rejected": -0.1956014782190323, + "step": 194 + }, + { + "epoch": 0.26, + "learning_rate": 4.9768631171034175e-05, + "logits/chosen": -1.6105570793151855, + "logits/rejected": -1.655145525932312, + "logps/chosen": -183.67723083496094, + "logps/rejected": -176.943115234375, + "loss": 0.8043, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.09822743386030197, + "rewards/margins": -0.16278451681137085, + "rewards/rejected": 0.06455708295106888, + "step": 195 + }, + { + "epoch": 0.26, + "learning_rate": 4.9763742338679145e-05, + "logits/chosen": -1.589104413986206, + "logits/rejected": -1.544286847114563, + "logps/chosen": -188.53167724609375, + "logps/rejected": -190.18521118164062, + "loss": 0.8959, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.08583441376686096, + "rewards/margins": -0.30193185806274414, + "rewards/rejected": 0.21609747409820557, + "step": 196 + }, + { + "epoch": 0.26, + "learning_rate": 4.975880263947367e-05, + "logits/chosen": -1.5240559577941895, + "logits/rejected": -1.5749508142471313, + "logps/chosen": -173.70506286621094, + "logps/rejected": -174.54681396484375, + "loss": 0.708, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.030573375523090363, + "rewards/margins": 0.030332941561937332, + "rewards/rejected": 0.00024041905999183655, + "step": 197 + }, + { + "epoch": 0.26, + "learning_rate": 4.9753812083564304e-05, + "logits/chosen": -1.879758596420288, + "logits/rejected": -1.8182477951049805, + "logps/chosen": -186.97525024414062, + "logps/rejected": -159.67259216308594, + "loss": 0.7133, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.18908704817295074, + "rewards/margins": 0.015400439500808716, + "rewards/rejected": 0.17368660867214203, + "step": 198 + }, + { + "epoch": 0.26, + "learning_rate": 4.974877068120208e-05, + "logits/chosen": -1.8003509044647217, + "logits/rejected": -1.8137016296386719, + "logps/chosen": -180.23757934570312, + "logps/rejected": -191.359375, + "loss": 0.7853, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0054572634398937225, + "rewards/margins": -0.046278372406959534, + "rewards/rejected": 0.05173564702272415, + "step": 199 + }, + { + "epoch": 0.26, + "learning_rate": 4.974367844274248e-05, + "logits/chosen": -1.7015259265899658, + "logits/rejected": -1.6682251691818237, + "logps/chosen": -155.98707580566406, + "logps/rejected": -146.8739776611328, + "loss": 0.7147, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.09909596294164658, + "rewards/margins": 0.09801648557186127, + "rewards/rejected": 0.001079469919204712, + "step": 200 + }, + { + "epoch": 0.26, + "learning_rate": 4.973853537864538e-05, + "logits/chosen": -1.9078797101974487, + "logits/rejected": -1.8878577947616577, + "logps/chosen": -213.63034057617188, + "logps/rejected": -196.01133728027344, + "loss": 0.7176, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07858332991600037, + "rewards/margins": 0.04805755987763405, + "rewards/rejected": -0.12664087116718292, + "step": 201 + }, + { + "epoch": 0.26, + "learning_rate": 4.973334149947508e-05, + "logits/chosen": -1.8100643157958984, + "logits/rejected": -1.8916385173797607, + "logps/chosen": -154.20033264160156, + "logps/rejected": -180.57833862304688, + "loss": 0.9816, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.11480588465929031, + "rewards/margins": -0.3577505350112915, + "rewards/rejected": 0.24294468760490417, + "step": 202 + }, + { + "epoch": 0.27, + "learning_rate": 4.972809681590026e-05, + "logits/chosen": -1.6414841413497925, + "logits/rejected": -1.6490445137023926, + "logps/chosen": -186.58531188964844, + "logps/rejected": -188.83343505859375, + "loss": 0.7131, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0989757627248764, + "rewards/margins": 0.02306460589170456, + "rewards/rejected": 0.07591113448143005, + "step": 203 + }, + { + "epoch": 0.27, + "learning_rate": 4.972280133869396e-05, + "logits/chosen": -1.7980338335037231, + "logits/rejected": -1.8528972864151, + "logps/chosen": -194.47787475585938, + "logps/rejected": -186.64306640625, + "loss": 0.7408, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.09773577749729156, + "rewards/margins": 0.03546319156885147, + "rewards/rejected": 0.06227259710431099, + "step": 204 + }, + { + "epoch": 0.27, + "learning_rate": 4.971745507873352e-05, + "logits/chosen": -1.8588396310806274, + "logits/rejected": -1.7567483186721802, + "logps/chosen": -181.69345092773438, + "logps/rejected": -187.5961151123047, + "loss": 0.8451, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.1902497261762619, + "rewards/margins": -0.21404078602790833, + "rewards/rejected": 0.023791024461388588, + "step": 205 + }, + { + "epoch": 0.27, + "learning_rate": 4.971205804700063e-05, + "logits/chosen": -1.8144092559814453, + "logits/rejected": -1.8621768951416016, + "logps/chosen": -143.86961364746094, + "logps/rejected": -168.99295043945312, + "loss": 0.8074, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.11274349689483643, + "rewards/margins": -0.13580113649368286, + "rewards/rejected": 0.023057660087943077, + "step": 206 + }, + { + "epoch": 0.27, + "learning_rate": 4.970661025458125e-05, + "logits/chosen": -1.573486089706421, + "logits/rejected": -1.5931901931762695, + "logps/chosen": -157.45480346679688, + "logps/rejected": -169.86550903320312, + "loss": 0.7474, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.007056853733956814, + "rewards/margins": -0.06664810329675674, + "rewards/rejected": 0.07370495796203613, + "step": 207 + }, + { + "epoch": 0.27, + "learning_rate": 4.9701111712665625e-05, + "logits/chosen": -2.0106117725372314, + "logits/rejected": -1.9366306066513062, + "logps/chosen": -190.40353393554688, + "logps/rejected": -176.56820678710938, + "loss": 0.6494, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.12490460276603699, + "rewards/margins": 0.2138100564479828, + "rewards/rejected": -0.08890549838542938, + "step": 208 + }, + { + "epoch": 0.27, + "learning_rate": 4.969556243254822e-05, + "logits/chosen": -1.7928516864776611, + "logits/rejected": -1.8632910251617432, + "logps/chosen": -232.3812255859375, + "logps/rejected": -237.18328857421875, + "loss": 0.725, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.20639315247535706, + "rewards/margins": 0.010047540068626404, + "rewards/rejected": 0.19634561240673065, + "step": 209 + }, + { + "epoch": 0.27, + "learning_rate": 4.968996242562774e-05, + "logits/chosen": -1.8890585899353027, + "logits/rejected": -1.8688653707504272, + "logps/chosen": -191.57362365722656, + "logps/rejected": -199.00550842285156, + "loss": 0.7262, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.2404320389032364, + "rewards/margins": -0.03374467045068741, + "rewards/rejected": -0.2066873461008072, + "step": 210 + }, + { + "epoch": 0.28, + "learning_rate": 4.968431170340706e-05, + "logits/chosen": -1.6715140342712402, + "logits/rejected": -1.619262456893921, + "logps/chosen": -178.09326171875, + "logps/rejected": -181.5880126953125, + "loss": 0.7146, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.08340275287628174, + "rewards/margins": 0.0807909369468689, + "rewards/rejected": 0.0026118261739611626, + "step": 211 + }, + { + "epoch": 0.28, + "learning_rate": 4.9678610277493275e-05, + "logits/chosen": -1.6335276365280151, + "logits/rejected": -1.6984977722167969, + "logps/chosen": -181.51470947265625, + "logps/rejected": -188.1268768310547, + "loss": 0.6818, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.13411936163902283, + "rewards/margins": 0.05135034769773483, + "rewards/rejected": 0.0827689841389656, + "step": 212 + }, + { + "epoch": 0.28, + "learning_rate": 4.967285815959759e-05, + "logits/chosen": -1.5702835321426392, + "logits/rejected": -1.6215555667877197, + "logps/chosen": -177.0418701171875, + "logps/rejected": -186.50494384765625, + "loss": 0.6603, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.04579095542430878, + "rewards/margins": 0.188707172870636, + "rewards/rejected": -0.23449814319610596, + "step": 213 + }, + { + "epoch": 0.28, + "learning_rate": 4.9667055361535354e-05, + "logits/chosen": -1.7180269956588745, + "logits/rejected": -1.7135778665542603, + "logps/chosen": -195.20785522460938, + "logps/rejected": -210.96878051757812, + "loss": 0.9002, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.03346429392695427, + "rewards/margins": -0.26186707615852356, + "rewards/rejected": 0.29533132910728455, + "step": 214 + }, + { + "epoch": 0.28, + "learning_rate": 4.9661201895226e-05, + "logits/chosen": -1.7542705535888672, + "logits/rejected": -1.7284282445907593, + "logps/chosen": -173.01751708984375, + "logps/rejected": -157.28419494628906, + "loss": 0.6185, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.02107839845120907, + "rewards/margins": 0.2878818213939667, + "rewards/rejected": -0.26680341362953186, + "step": 215 + }, + { + "epoch": 0.28, + "learning_rate": 4.965529777269306e-05, + "logits/chosen": -1.736549973487854, + "logits/rejected": -1.771423578262329, + "logps/chosen": -158.92160034179688, + "logps/rejected": -163.87283325195312, + "loss": 0.7139, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.006686069071292877, + "rewards/margins": 0.04404951259493828, + "rewards/rejected": -0.0373634397983551, + "step": 216 + }, + { + "epoch": 0.28, + "learning_rate": 4.964934300606411e-05, + "logits/chosen": -1.511568307876587, + "logits/rejected": -1.5145093202590942, + "logps/chosen": -170.07809448242188, + "logps/rejected": -186.4696807861328, + "loss": 0.607, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.2800918221473694, + "rewards/margins": 0.30444180965423584, + "rewards/rejected": -0.02434997633099556, + "step": 217 + }, + { + "epoch": 0.29, + "learning_rate": 4.964333760757074e-05, + "logits/chosen": -1.436962366104126, + "logits/rejected": -1.4119391441345215, + "logps/chosen": -309.4395446777344, + "logps/rejected": -291.1634521484375, + "loss": 0.6898, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0010376125574111938, + "rewards/margins": 0.03738358989357948, + "rewards/rejected": -0.036345988512039185, + "step": 218 + }, + { + "epoch": 0.29, + "learning_rate": 4.963728158954856e-05, + "logits/chosen": -1.891182541847229, + "logits/rejected": -1.8770077228546143, + "logps/chosen": -162.81988525390625, + "logps/rejected": -169.5299072265625, + "loss": 0.8258, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.07328500598669052, + "rewards/margins": -0.13723579049110413, + "rewards/rejected": 0.21052080392837524, + "step": 219 + }, + { + "epoch": 0.29, + "learning_rate": 4.963117496443715e-05, + "logits/chosen": -1.8470525741577148, + "logits/rejected": -1.8625476360321045, + "logps/chosen": -166.7591552734375, + "logps/rejected": -194.91290283203125, + "loss": 0.9473, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.4374409317970276, + "rewards/margins": -0.35452863574028015, + "rewards/rejected": -0.08291231095790863, + "step": 220 + }, + { + "epoch": 0.29, + "learning_rate": 4.9625017744780045e-05, + "logits/chosen": -1.5161206722259521, + "logits/rejected": -1.4952480792999268, + "logps/chosen": -173.3487548828125, + "logps/rejected": -167.93292236328125, + "loss": 0.8109, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.0004083700478076935, + "rewards/margins": -0.171901136636734, + "rewards/rejected": 0.1714927703142166, + "step": 221 + }, + { + "epoch": 0.29, + "learning_rate": 4.96188099432247e-05, + "logits/chosen": -1.617663025856018, + "logits/rejected": -1.6117898225784302, + "logps/chosen": -194.1741180419922, + "logps/rejected": -188.07025146484375, + "loss": 0.7477, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.21349965035915375, + "rewards/margins": 0.016070939600467682, + "rewards/rejected": -0.22957059741020203, + "step": 222 + }, + { + "epoch": 0.29, + "learning_rate": 4.9612551572522464e-05, + "logits/chosen": -1.899290680885315, + "logits/rejected": -1.8964145183563232, + "logps/chosen": -192.67469787597656, + "logps/rejected": -179.21112060546875, + "loss": 1.0079, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.24318143725395203, + "rewards/margins": -0.3611811697483063, + "rewards/rejected": 0.11799970269203186, + "step": 223 + }, + { + "epoch": 0.29, + "learning_rate": 4.960624264552858e-05, + "logits/chosen": -1.6361061334609985, + "logits/rejected": -1.6821738481521606, + "logps/chosen": -194.6638641357422, + "logps/rejected": -179.90225219726562, + "loss": 0.6424, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.19569191336631775, + "rewards/margins": 0.2166380137205124, + "rewards/rejected": -0.020946092903614044, + "step": 224 + }, + { + "epoch": 0.29, + "learning_rate": 4.9599883175202124e-05, + "logits/chosen": -1.522665023803711, + "logits/rejected": -1.4538843631744385, + "logps/chosen": -222.08251953125, + "logps/rejected": -200.12472534179688, + "loss": 0.8261, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.26220959424972534, + "rewards/margins": -0.16530472040176392, + "rewards/rejected": -0.09690490365028381, + "step": 225 + }, + { + "epoch": 0.3, + "learning_rate": 4.9593473174605974e-05, + "logits/chosen": -1.4936704635620117, + "logits/rejected": -1.5721888542175293, + "logps/chosen": -182.029541015625, + "logps/rejected": -204.53567504882812, + "loss": 0.6881, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.3360684812068939, + "rewards/margins": 0.14660386741161346, + "rewards/rejected": -0.4826723635196686, + "step": 226 + }, + { + "epoch": 0.3, + "learning_rate": 4.958701265690685e-05, + "logits/chosen": -1.6544736623764038, + "logits/rejected": -1.672118902206421, + "logps/chosen": -174.0331268310547, + "logps/rejected": -203.7425079345703, + "loss": 0.7058, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.34017136693000793, + "rewards/margins": 0.01865684613585472, + "rewards/rejected": -0.3588281571865082, + "step": 227 + }, + { + "epoch": 0.3, + "learning_rate": 4.958050163537519e-05, + "logits/chosen": -1.8430697917938232, + "logits/rejected": -1.7734485864639282, + "logps/chosen": -208.96421813964844, + "logps/rejected": -212.52711486816406, + "loss": 0.8319, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.6053764820098877, + "rewards/margins": -0.1353476196527481, + "rewards/rejected": -0.4700288772583008, + "step": 228 + }, + { + "epoch": 0.3, + "learning_rate": 4.957394012338519e-05, + "logits/chosen": -1.9725829362869263, + "logits/rejected": -1.9301397800445557, + "logps/chosen": -229.4776153564453, + "logps/rejected": -215.60470581054688, + "loss": 0.7281, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.046676263213157654, + "rewards/margins": 0.04501792788505554, + "rewards/rejected": 0.0016583409160375595, + "step": 229 + }, + { + "epoch": 0.3, + "learning_rate": 4.956732813441477e-05, + "logits/chosen": -1.733205795288086, + "logits/rejected": -1.617655873298645, + "logps/chosen": -174.42959594726562, + "logps/rejected": -154.43499755859375, + "loss": 0.9236, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.30461782217025757, + "rewards/margins": -0.30838024616241455, + "rewards/rejected": 0.0037624058313667774, + "step": 230 + }, + { + "epoch": 0.3, + "learning_rate": 4.956066568204552e-05, + "logits/chosen": -1.6661994457244873, + "logits/rejected": -1.7003294229507446, + "logps/chosen": -179.96853637695312, + "logps/rejected": -187.9959716796875, + "loss": 0.8541, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.47250720858573914, + "rewards/margins": -0.12853975594043732, + "rewards/rejected": -0.3439674377441406, + "step": 231 + }, + { + "epoch": 0.3, + "learning_rate": 4.955395277996268e-05, + "logits/chosen": -1.786563754081726, + "logits/rejected": -1.7648732662200928, + "logps/chosen": -193.4469757080078, + "logps/rejected": -202.775146484375, + "loss": 0.5666, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.182941734790802, + "rewards/margins": 0.3387344479560852, + "rewards/rejected": -0.5216761231422424, + "step": 232 + }, + { + "epoch": 0.3, + "learning_rate": 4.954718944195512e-05, + "logits/chosen": -1.7109639644622803, + "logits/rejected": -1.6762810945510864, + "logps/chosen": -154.25538635253906, + "logps/rejected": -158.89683532714844, + "loss": 0.7073, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.25806286931037903, + "rewards/margins": 0.007834136486053467, + "rewards/rejected": -0.2658970057964325, + "step": 233 + }, + { + "epoch": 0.31, + "learning_rate": 4.954037568191534e-05, + "logits/chosen": -1.7765631675720215, + "logits/rejected": -1.8208155632019043, + "logps/chosen": -189.78883361816406, + "logps/rejected": -214.85067749023438, + "loss": 0.7049, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3182518482208252, + "rewards/margins": 0.08276516944169998, + "rewards/rejected": -0.4010169804096222, + "step": 234 + }, + { + "epoch": 0.31, + "learning_rate": 4.9533511513839384e-05, + "logits/chosen": -1.6697825193405151, + "logits/rejected": -1.688278079032898, + "logps/chosen": -174.92227172851562, + "logps/rejected": -173.88099670410156, + "loss": 0.579, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08302205801010132, + "rewards/margins": 0.34601137042045593, + "rewards/rejected": -0.42903345823287964, + "step": 235 + }, + { + "epoch": 0.31, + "learning_rate": 4.9526596951826824e-05, + "logits/chosen": -1.9389506578445435, + "logits/rejected": -1.8745498657226562, + "logps/chosen": -193.29092407226562, + "logps/rejected": -187.18719482421875, + "loss": 0.8118, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6870115995407104, + "rewards/margins": -0.025065027177333832, + "rewards/rejected": -0.6619465351104736, + "step": 236 + }, + { + "epoch": 0.31, + "learning_rate": 4.951963201008076e-05, + "logits/chosen": -1.9140545129776, + "logits/rejected": -1.8272314071655273, + "logps/chosen": -154.29287719726562, + "logps/rejected": -149.56021118164062, + "loss": 0.8096, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.10280251502990723, + "rewards/margins": -0.07113885879516602, + "rewards/rejected": -0.03166365623474121, + "step": 237 + }, + { + "epoch": 0.31, + "learning_rate": 4.951261670290781e-05, + "logits/chosen": -2.0082364082336426, + "logits/rejected": -2.00789737701416, + "logps/chosen": -186.12057495117188, + "logps/rejected": -193.4689483642578, + "loss": 0.7454, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.14050771296024323, + "rewards/margins": 0.02027921937406063, + "rewards/rejected": -0.16078691184520721, + "step": 238 + }, + { + "epoch": 0.31, + "learning_rate": 4.950555104471799e-05, + "logits/chosen": -1.851813554763794, + "logits/rejected": -1.8402018547058105, + "logps/chosen": -157.9239959716797, + "logps/rejected": -147.2781524658203, + "loss": 0.7481, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.21430522203445435, + "rewards/margins": -0.030568838119506836, + "rewards/rejected": -0.1837363839149475, + "step": 239 + }, + { + "epoch": 0.31, + "learning_rate": 4.949843505002477e-05, + "logits/chosen": -1.9467929601669312, + "logits/rejected": -1.976270079612732, + "logps/chosen": -170.30682373046875, + "logps/rejected": -167.61927795410156, + "loss": 0.7485, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.2596588134765625, + "rewards/margins": -0.021375911310315132, + "rewards/rejected": -0.23828287422657013, + "step": 240 + }, + { + "epoch": 0.32, + "learning_rate": 4.9491268733445034e-05, + "logits/chosen": -1.724785327911377, + "logits/rejected": -1.7340233325958252, + "logps/chosen": -204.80548095703125, + "logps/rejected": -209.44329833984375, + "loss": 0.7051, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.09321151673793793, + "rewards/margins": 0.0742495059967041, + "rewards/rejected": -0.16746100783348083, + "step": 241 + }, + { + "epoch": 0.32, + "learning_rate": 4.9484052109698984e-05, + "logits/chosen": -1.7430789470672607, + "logits/rejected": -1.7313511371612549, + "logps/chosen": -181.70632934570312, + "logps/rejected": -162.24334716796875, + "loss": 0.844, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.014489106833934784, + "rewards/margins": -0.029044844210147858, + "rewards/rejected": 0.014555716887116432, + "step": 242 + }, + { + "epoch": 0.32, + "learning_rate": 4.947678519361021e-05, + "logits/chosen": -1.9160277843475342, + "logits/rejected": -1.8753935098648071, + "logps/chosen": -175.2951202392578, + "logps/rejected": -161.4536590576172, + "loss": 0.6499, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1920948028564453, + "rewards/margins": 0.18029190599918365, + "rewards/rejected": -0.3723866939544678, + "step": 243 + }, + { + "epoch": 0.32, + "learning_rate": 4.946946800010556e-05, + "logits/chosen": -1.788377046585083, + "logits/rejected": -1.804762601852417, + "logps/chosen": -190.5827178955078, + "logps/rejected": -207.48460388183594, + "loss": 0.7442, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.20643703639507294, + "rewards/margins": 0.0016913870349526405, + "rewards/rejected": -0.20812839269638062, + "step": 244 + }, + { + "epoch": 0.32, + "learning_rate": 4.946210054421518e-05, + "logits/chosen": -1.943693995475769, + "logits/rejected": -1.9860758781433105, + "logps/chosen": -162.47232055664062, + "logps/rejected": -187.59640502929688, + "loss": 0.5544, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.04131259024143219, + "rewards/margins": 0.34646379947662354, + "rewards/rejected": -0.3877764344215393, + "step": 245 + }, + { + "epoch": 0.32, + "learning_rate": 4.945468284107246e-05, + "logits/chosen": -1.7154016494750977, + "logits/rejected": -1.729323387145996, + "logps/chosen": -151.67153930664062, + "logps/rejected": -175.7374725341797, + "loss": 0.7351, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.3676840364933014, + "rewards/margins": -0.05210195109248161, + "rewards/rejected": -0.3155820667743683, + "step": 246 + }, + { + "epoch": 0.32, + "learning_rate": 4.944721490591401e-05, + "logits/chosen": -1.5419683456420898, + "logits/rejected": -1.5722306966781616, + "logps/chosen": -158.3173065185547, + "logps/rejected": -168.21975708007812, + "loss": 0.7106, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.040096037089824677, + "rewards/margins": 0.040343452244997025, + "rewards/rejected": -0.00024740397930145264, + "step": 247 + }, + { + "epoch": 0.32, + "learning_rate": 4.9439696754079595e-05, + "logits/chosen": -1.8851487636566162, + "logits/rejected": -1.927181601524353, + "logps/chosen": -163.97447204589844, + "logps/rejected": -171.12020874023438, + "loss": 0.6697, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4691685140132904, + "rewards/margins": 0.14997676014900208, + "rewards/rejected": -0.6191452741622925, + "step": 248 + }, + { + "epoch": 0.33, + "learning_rate": 4.9432128401012144e-05, + "logits/chosen": -1.5929148197174072, + "logits/rejected": -1.5544054508209229, + "logps/chosen": -143.14022827148438, + "logps/rejected": -158.71368408203125, + "loss": 0.6763, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0918157696723938, + "rewards/margins": 0.08131375163793564, + "rewards/rejected": -0.17312952876091003, + "step": 249 + }, + { + "epoch": 0.33, + "learning_rate": 4.9424509862257706e-05, + "logits/chosen": -1.599873423576355, + "logits/rejected": -1.5568993091583252, + "logps/chosen": -197.35276794433594, + "logps/rejected": -228.1996307373047, + "loss": 0.6008, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.22382640838623047, + "rewards/margins": 0.30180901288986206, + "rewards/rejected": -0.5256354808807373, + "step": 250 + }, + { + "epoch": 0.33, + "learning_rate": 4.941684115346541e-05, + "logits/chosen": -1.9682908058166504, + "logits/rejected": -1.9601702690124512, + "logps/chosen": -178.14833068847656, + "logps/rejected": -180.44769287109375, + "loss": 0.6096, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2652210593223572, + "rewards/margins": 0.3119816184043884, + "rewards/rejected": -0.5772026181221008, + "step": 251 + }, + { + "epoch": 0.33, + "learning_rate": 4.940912229038745e-05, + "logits/chosen": -1.7443188428878784, + "logits/rejected": -1.720470666885376, + "logps/chosen": -176.2379150390625, + "logps/rejected": -166.10626220703125, + "loss": 0.8199, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5284540057182312, + "rewards/margins": -0.06340186297893524, + "rewards/rejected": -0.46505218744277954, + "step": 252 + }, + { + "epoch": 0.33, + "learning_rate": 4.9401353288879024e-05, + "logits/chosen": -1.8005255460739136, + "logits/rejected": -1.814915657043457, + "logps/chosen": -173.22021484375, + "logps/rejected": -187.9818878173828, + "loss": 0.6487, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.36548957228660583, + "rewards/margins": 0.12811800837516785, + "rewards/rejected": -0.49360761046409607, + "step": 253 + }, + { + "epoch": 0.33, + "learning_rate": 4.9393534164898335e-05, + "logits/chosen": -1.8766534328460693, + "logits/rejected": -1.9305753707885742, + "logps/chosen": -166.86866760253906, + "logps/rejected": -195.80569458007812, + "loss": 0.6937, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2575337588787079, + "rewards/margins": 0.2505089044570923, + "rewards/rejected": -0.5080426931381226, + "step": 254 + }, + { + "epoch": 0.33, + "learning_rate": 4.9385664934506526e-05, + "logits/chosen": -1.7149075269699097, + "logits/rejected": -1.7573699951171875, + "logps/chosen": -169.73626708984375, + "logps/rejected": -178.3174591064453, + "loss": 0.6638, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4145450294017792, + "rewards/margins": 0.1643792688846588, + "rewards/rejected": -0.578924298286438, + "step": 255 + }, + { + "epoch": 0.34, + "learning_rate": 4.937774561386768e-05, + "logits/chosen": -1.8144739866256714, + "logits/rejected": -1.8054416179656982, + "logps/chosen": -201.68247985839844, + "logps/rejected": -208.70188903808594, + "loss": 0.7625, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.2262319028377533, + "rewards/margins": -0.053480371832847595, + "rewards/rejected": -0.1727515161037445, + "step": 256 + }, + { + "epoch": 0.34, + "learning_rate": 4.936977621924875e-05, + "logits/chosen": -1.721892237663269, + "logits/rejected": -1.7585711479187012, + "logps/chosen": -193.45179748535156, + "logps/rejected": -200.13726806640625, + "loss": 0.798, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.7297480702400208, + "rewards/margins": -0.1376451551914215, + "rewards/rejected": -0.5921030044555664, + "step": 257 + }, + { + "epoch": 0.34, + "learning_rate": 4.9361756767019564e-05, + "logits/chosen": -1.8132922649383545, + "logits/rejected": -1.8062866926193237, + "logps/chosen": -204.11619567871094, + "logps/rejected": -196.605224609375, + "loss": 0.802, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.10710492730140686, + "rewards/margins": -0.09782031178474426, + "rewards/rejected": -0.009284593164920807, + "step": 258 + }, + { + "epoch": 0.34, + "learning_rate": 4.935368727365276e-05, + "logits/chosen": -1.6550960540771484, + "logits/rejected": -1.6377525329589844, + "logps/chosen": -191.87579345703125, + "logps/rejected": -181.06930541992188, + "loss": 0.7402, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.8034918308258057, + "rewards/margins": -0.04004772752523422, + "rewards/rejected": -0.763444185256958, + "step": 259 + }, + { + "epoch": 0.34, + "learning_rate": 4.934556775572377e-05, + "logits/chosen": -1.9349067211151123, + "logits/rejected": -1.9205700159072876, + "logps/chosen": -173.06373596191406, + "logps/rejected": -173.32766723632812, + "loss": 0.6361, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.24924635887145996, + "rewards/margins": 0.23114144802093506, + "rewards/rejected": -0.480387806892395, + "step": 260 + }, + { + "epoch": 0.34, + "learning_rate": 4.9337398229910784e-05, + "logits/chosen": -1.8233386278152466, + "logits/rejected": -1.8753117322921753, + "logps/chosen": -189.73959350585938, + "logps/rejected": -197.85728454589844, + "loss": 0.7225, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.24001392722129822, + "rewards/margins": 0.08805333077907562, + "rewards/rejected": -0.32806724309921265, + "step": 261 + }, + { + "epoch": 0.34, + "learning_rate": 4.932917871299471e-05, + "logits/chosen": -1.540401816368103, + "logits/rejected": -1.5170302391052246, + "logps/chosen": -205.3408203125, + "logps/rejected": -206.5533905029297, + "loss": 0.8948, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.7126679420471191, + "rewards/margins": -0.22102710604667664, + "rewards/rejected": -0.4916408061981201, + "step": 262 + }, + { + "epoch": 0.34, + "learning_rate": 4.9320909221859134e-05, + "logits/chosen": -1.934309482574463, + "logits/rejected": -1.945433497428894, + "logps/chosen": -170.4419708251953, + "logps/rejected": -165.6936492919922, + "loss": 0.7461, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.23198306560516357, + "rewards/margins": -0.018536821007728577, + "rewards/rejected": -0.213446244597435, + "step": 263 + }, + { + "epoch": 0.35, + "learning_rate": 4.9312589773490304e-05, + "logits/chosen": -2.026982545852661, + "logits/rejected": -1.9359885454177856, + "logps/chosen": -185.02920532226562, + "logps/rejected": -173.5999298095703, + "loss": 0.6839, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.39162588119506836, + "rewards/margins": 0.09019112586975098, + "rewards/rejected": -0.48181700706481934, + "step": 264 + }, + { + "epoch": 0.35, + "learning_rate": 4.930422038497708e-05, + "logits/chosen": -1.9103275537490845, + "logits/rejected": -1.8527649641036987, + "logps/chosen": -167.06378173828125, + "logps/rejected": -153.1953125, + "loss": 0.6232, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.32273101806640625, + "rewards/margins": 0.2372804582118988, + "rewards/rejected": -0.5600115060806274, + "step": 265 + }, + { + "epoch": 0.35, + "learning_rate": 4.92958010735109e-05, + "logits/chosen": -1.9541754722595215, + "logits/rejected": -2.0632808208465576, + "logps/chosen": -181.56781005859375, + "logps/rejected": -199.48483276367188, + "loss": 0.5323, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.1671663373708725, + "rewards/margins": 0.5313636064529419, + "rewards/rejected": -0.6985299587249756, + "step": 266 + }, + { + "epoch": 0.35, + "learning_rate": 4.928733185638575e-05, + "logits/chosen": -1.7843657732009888, + "logits/rejected": -1.8627678155899048, + "logps/chosen": -167.1883087158203, + "logps/rejected": -172.48223876953125, + "loss": 0.7816, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.2200300097465515, + "rewards/margins": -0.0496581606566906, + "rewards/rejected": -0.1703718900680542, + "step": 267 + }, + { + "epoch": 0.35, + "learning_rate": 4.927881275099815e-05, + "logits/chosen": -1.713842511177063, + "logits/rejected": -1.805971622467041, + "logps/chosen": -192.92250061035156, + "logps/rejected": -212.14866638183594, + "loss": 0.6343, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.17002353072166443, + "rewards/margins": 0.28741562366485596, + "rewards/rejected": -0.4574391543865204, + "step": 268 + }, + { + "epoch": 0.35, + "learning_rate": 4.927024377484705e-05, + "logits/chosen": -1.682020664215088, + "logits/rejected": -1.7268104553222656, + "logps/chosen": -156.587158203125, + "logps/rejected": -159.53341674804688, + "loss": 0.6657, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.16453158855438232, + "rewards/margins": 0.15859198570251465, + "rewards/rejected": -0.323123574256897, + "step": 269 + }, + { + "epoch": 0.35, + "learning_rate": 4.9261624945533855e-05, + "logits/chosen": -1.8595139980316162, + "logits/rejected": -1.8612048625946045, + "logps/chosen": -163.1502685546875, + "logps/rejected": -193.11166381835938, + "loss": 0.6646, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2516610622406006, + "rewards/margins": 0.18862426280975342, + "rewards/rejected": -0.440285325050354, + "step": 270 + }, + { + "epoch": 0.35, + "learning_rate": 4.925295628076241e-05, + "logits/chosen": -1.8986504077911377, + "logits/rejected": -1.951588749885559, + "logps/chosen": -161.01625061035156, + "logps/rejected": -171.39744567871094, + "loss": 0.7413, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18460389971733093, + "rewards/margins": -0.004900887608528137, + "rewards/rejected": -0.1797029972076416, + "step": 271 + }, + { + "epoch": 0.36, + "learning_rate": 4.9244237798338866e-05, + "logits/chosen": -1.7942367792129517, + "logits/rejected": -1.8609907627105713, + "logps/chosen": -190.5836181640625, + "logps/rejected": -181.18942260742188, + "loss": 0.8213, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.7920368909835815, + "rewards/margins": -0.13006603717803955, + "rewards/rejected": -0.6619707942008972, + "step": 272 + }, + { + "epoch": 0.36, + "learning_rate": 4.923546951617175e-05, + "logits/chosen": -1.7586820125579834, + "logits/rejected": -1.8338139057159424, + "logps/chosen": -161.4608154296875, + "logps/rejected": -174.6810302734375, + "loss": 0.7838, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.15520796179771423, + "rewards/margins": 0.007751762866973877, + "rewards/rejected": -0.1629597246646881, + "step": 273 + }, + { + "epoch": 0.36, + "learning_rate": 4.922665145227187e-05, + "logits/chosen": -1.999558925628662, + "logits/rejected": -1.9667410850524902, + "logps/chosen": -177.38986206054688, + "logps/rejected": -182.5379638671875, + "loss": 0.8666, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.26553717255592346, + "rewards/margins": -0.1137295514345169, + "rewards/rejected": -0.15180760622024536, + "step": 274 + }, + { + "epoch": 0.36, + "learning_rate": 4.9217783624752266e-05, + "logits/chosen": -1.7799978256225586, + "logits/rejected": -1.800316572189331, + "logps/chosen": -189.352783203125, + "logps/rejected": -173.5353546142578, + "loss": 0.7974, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.6156832575798035, + "rewards/margins": -0.09660260379314423, + "rewards/rejected": -0.519080638885498, + "step": 275 + }, + { + "epoch": 0.36, + "learning_rate": 4.920886605182823e-05, + "logits/chosen": -1.7154024839401245, + "logits/rejected": -1.7451473474502563, + "logps/chosen": -162.1999053955078, + "logps/rejected": -177.0426483154297, + "loss": 0.6234, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3276807367801666, + "rewards/margins": 0.274338036775589, + "rewards/rejected": -0.6020187139511108, + "step": 276 + }, + { + "epoch": 0.36, + "learning_rate": 4.919989875181722e-05, + "logits/chosen": -1.769112467765808, + "logits/rejected": -1.759423851966858, + "logps/chosen": -170.71876525878906, + "logps/rejected": -173.2998809814453, + "loss": 0.7953, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.2965158224105835, + "rewards/margins": 0.05386320501565933, + "rewards/rejected": -0.3503790497779846, + "step": 277 + }, + { + "epoch": 0.36, + "learning_rate": 4.919088174313884e-05, + "logits/chosen": -1.2533071041107178, + "logits/rejected": -1.3566581010818481, + "logps/chosen": -200.62548828125, + "logps/rejected": -190.6791534423828, + "loss": 0.7712, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7463827729225159, + "rewards/margins": -0.011319484561681747, + "rewards/rejected": -0.7350633144378662, + "step": 278 + }, + { + "epoch": 0.37, + "learning_rate": 4.91818150443148e-05, + "logits/chosen": -1.958874225616455, + "logits/rejected": -1.8372151851654053, + "logps/chosen": -184.42295837402344, + "logps/rejected": -178.30995178222656, + "loss": 0.9, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.6464177370071411, + "rewards/margins": -0.2897469401359558, + "rewards/rejected": -0.3566707670688629, + "step": 279 + }, + { + "epoch": 0.37, + "learning_rate": 4.917269867396886e-05, + "logits/chosen": -1.731322169303894, + "logits/rejected": -1.8198059797286987, + "logps/chosen": -157.6179962158203, + "logps/rejected": -162.81597900390625, + "loss": 0.7364, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6854066848754883, + "rewards/margins": 0.03314337879419327, + "rewards/rejected": -0.7185500860214233, + "step": 280 + }, + { + "epoch": 0.37, + "learning_rate": 4.916353265082686e-05, + "logits/chosen": -1.5918539762496948, + "logits/rejected": -1.5675849914550781, + "logps/chosen": -229.21499633789062, + "logps/rejected": -248.54562377929688, + "loss": 0.6943, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4582862854003906, + "rewards/margins": 0.10020212829113007, + "rewards/rejected": -0.5584883689880371, + "step": 281 + }, + { + "epoch": 0.37, + "learning_rate": 4.9154316993716565e-05, + "logits/chosen": -1.9066779613494873, + "logits/rejected": -1.9495766162872314, + "logps/chosen": -149.90614318847656, + "logps/rejected": -146.74632263183594, + "loss": 0.7492, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.44106772541999817, + "rewards/margins": -0.07195230573415756, + "rewards/rejected": -0.36911541223526, + "step": 282 + }, + { + "epoch": 0.37, + "learning_rate": 4.9145051721567734e-05, + "logits/chosen": -2.0382936000823975, + "logits/rejected": -1.9569990634918213, + "logps/chosen": -166.32168579101562, + "logps/rejected": -162.79673767089844, + "loss": 0.6561, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.3699617087841034, + "rewards/margins": 0.14387893676757812, + "rewards/rejected": -0.5138406157493591, + "step": 283 + }, + { + "epoch": 0.37, + "learning_rate": 4.913573685341205e-05, + "logits/chosen": -1.474026083946228, + "logits/rejected": -1.5045154094696045, + "logps/chosen": -229.38619995117188, + "logps/rejected": -236.83164978027344, + "loss": 0.8663, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.9608985781669617, + "rewards/margins": -0.17912134528160095, + "rewards/rejected": -0.7817772626876831, + "step": 284 + }, + { + "epoch": 0.37, + "learning_rate": 4.9126372408383025e-05, + "logits/chosen": -1.8263182640075684, + "logits/rejected": -1.8128119707107544, + "logps/chosen": -190.57106018066406, + "logps/rejected": -223.3215789794922, + "loss": 0.7163, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5443102717399597, + "rewards/margins": 0.06481970846652985, + "rewards/rejected": -0.6091300249099731, + "step": 285 + }, + { + "epoch": 0.37, + "learning_rate": 4.911695840571605e-05, + "logits/chosen": -1.7644751071929932, + "logits/rejected": -1.8120529651641846, + "logps/chosen": -205.31759643554688, + "logps/rejected": -199.3420867919922, + "loss": 0.8917, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6110701560974121, + "rewards/margins": -0.2775843143463135, + "rewards/rejected": -0.33348581194877625, + "step": 286 + }, + { + "epoch": 0.38, + "learning_rate": 4.910749486474828e-05, + "logits/chosen": -1.6636385917663574, + "logits/rejected": -1.5808690786361694, + "logps/chosen": -176.58572387695312, + "logps/rejected": -195.53761291503906, + "loss": 0.7043, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6308318972587585, + "rewards/margins": 0.1960841417312622, + "rewards/rejected": -0.8269160985946655, + "step": 287 + }, + { + "epoch": 0.38, + "learning_rate": 4.909798180491865e-05, + "logits/chosen": -1.946243166923523, + "logits/rejected": -2.0143542289733887, + "logps/chosen": -178.8447265625, + "logps/rejected": -188.88853454589844, + "loss": 0.796, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4359901547431946, + "rewards/margins": -0.07123897969722748, + "rewards/rejected": -0.3647511601448059, + "step": 288 + }, + { + "epoch": 0.38, + "learning_rate": 4.9088419245767803e-05, + "logits/chosen": -2.0332159996032715, + "logits/rejected": -1.9962480068206787, + "logps/chosen": -181.74835205078125, + "logps/rejected": -198.86822509765625, + "loss": 0.8508, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.9367510676383972, + "rewards/margins": -0.1084718257188797, + "rewards/rejected": -0.8282791972160339, + "step": 289 + }, + { + "epoch": 0.38, + "learning_rate": 4.907880720693804e-05, + "logits/chosen": -2.006517171859741, + "logits/rejected": -1.8688626289367676, + "logps/chosen": -176.2784423828125, + "logps/rejected": -171.2430419921875, + "loss": 0.8853, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.8897907733917236, + "rewards/margins": -0.2713664770126343, + "rewards/rejected": -0.6184243559837341, + "step": 290 + }, + { + "epoch": 0.38, + "learning_rate": 4.9069145708173324e-05, + "logits/chosen": -1.914872169494629, + "logits/rejected": -1.9809991121292114, + "logps/chosen": -152.42398071289062, + "logps/rejected": -172.4163360595703, + "loss": 0.7878, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.8594350814819336, + "rewards/margins": -0.11893659830093384, + "rewards/rejected": -0.740498423576355, + "step": 291 + }, + { + "epoch": 0.38, + "learning_rate": 4.9059434769319205e-05, + "logits/chosen": -1.4920971393585205, + "logits/rejected": -1.479290246963501, + "logps/chosen": -221.7364501953125, + "logps/rejected": -217.07138061523438, + "loss": 0.8203, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.8598781824111938, + "rewards/margins": -0.09480879455804825, + "rewards/rejected": -0.7650693655014038, + "step": 292 + }, + { + "epoch": 0.38, + "learning_rate": 4.904967441032278e-05, + "logits/chosen": -1.6795321702957153, + "logits/rejected": -1.7278181314468384, + "logps/chosen": -176.05941772460938, + "logps/rejected": -176.9445037841797, + "loss": 0.6673, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6722793579101562, + "rewards/margins": 0.1413556933403015, + "rewards/rejected": -0.8136351108551025, + "step": 293 + }, + { + "epoch": 0.38, + "learning_rate": 4.903986465123266e-05, + "logits/chosen": -1.8752367496490479, + "logits/rejected": -1.815263032913208, + "logps/chosen": -163.13931274414062, + "logps/rejected": -167.08154296875, + "loss": 0.6859, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.587963879108429, + "rewards/margins": 0.14399470388889313, + "rewards/rejected": -0.7319585084915161, + "step": 294 + }, + { + "epoch": 0.39, + "learning_rate": 4.903000551219894e-05, + "logits/chosen": -2.049884796142578, + "logits/rejected": -2.096831798553467, + "logps/chosen": -159.4933624267578, + "logps/rejected": -172.0717010498047, + "loss": 0.7986, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.7208172082901001, + "rewards/margins": -0.12478935718536377, + "rewards/rejected": -0.5960277915000916, + "step": 295 + }, + { + "epoch": 0.39, + "learning_rate": 4.902009701347313e-05, + "logits/chosen": -1.733120322227478, + "logits/rejected": -1.7238028049468994, + "logps/chosen": -207.2619171142578, + "logps/rejected": -196.6807098388672, + "loss": 0.7436, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.46741175651550293, + "rewards/margins": 0.01438647136092186, + "rewards/rejected": -0.4817982316017151, + "step": 296 + }, + { + "epoch": 0.39, + "learning_rate": 4.901013917540814e-05, + "logits/chosen": -2.027272939682007, + "logits/rejected": -1.9993281364440918, + "logps/chosen": -184.6562042236328, + "logps/rejected": -183.0182647705078, + "loss": 0.7772, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6997889280319214, + "rewards/margins": 0.0013962779194116592, + "rewards/rejected": -0.7011851668357849, + "step": 297 + }, + { + "epoch": 0.39, + "learning_rate": 4.900013201845821e-05, + "logits/chosen": -1.5796035528182983, + "logits/rejected": -1.3949412107467651, + "logps/chosen": -175.8470458984375, + "logps/rejected": -212.80296325683594, + "loss": 0.7523, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.37455612421035767, + "rewards/margins": -0.005637466907501221, + "rewards/rejected": -0.36891865730285645, + "step": 298 + }, + { + "epoch": 0.39, + "learning_rate": 4.899007556317893e-05, + "logits/chosen": -1.9345701932907104, + "logits/rejected": -2.001033306121826, + "logps/chosen": -250.92816162109375, + "logps/rejected": -241.9100341796875, + "loss": 0.7686, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.4188327193260193, + "rewards/margins": -0.09854313731193542, + "rewards/rejected": -0.32028958201408386, + "step": 299 + }, + { + "epoch": 0.39, + "learning_rate": 4.8979969830227086e-05, + "logits/chosen": -1.9917688369750977, + "logits/rejected": -2.0171382427215576, + "logps/chosen": -177.34434509277344, + "logps/rejected": -170.58657836914062, + "loss": 0.8629, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5179021954536438, + "rewards/margins": -0.09615220129489899, + "rewards/rejected": -0.4217500388622284, + "step": 300 + }, + { + "epoch": 0.39, + "learning_rate": 4.896981484036074e-05, + "logits/chosen": -2.010779619216919, + "logits/rejected": -2.0212411880493164, + "logps/chosen": -190.38487243652344, + "logps/rejected": -189.54226684570312, + "loss": 0.6184, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.45494896173477173, + "rewards/margins": 0.2873493432998657, + "rewards/rejected": -0.7422983050346375, + "step": 301 + }, + { + "epoch": 0.4, + "learning_rate": 4.895961061443911e-05, + "logits/chosen": -1.8409286737442017, + "logits/rejected": -1.7947205305099487, + "logps/chosen": -189.4046173095703, + "logps/rejected": -178.43617248535156, + "loss": 0.8209, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7244110107421875, + "rewards/margins": -0.0501336008310318, + "rewards/rejected": -0.6742774248123169, + "step": 302 + }, + { + "epoch": 0.4, + "learning_rate": 4.894935717342255e-05, + "logits/chosen": -1.7337801456451416, + "logits/rejected": -1.6998471021652222, + "logps/chosen": -228.77264404296875, + "logps/rejected": -214.67515563964844, + "loss": 0.7953, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.7624901533126831, + "rewards/margins": -0.0503702238202095, + "rewards/rejected": -0.7121198773384094, + "step": 303 + }, + { + "epoch": 0.4, + "learning_rate": 4.8939054538372496e-05, + "logits/chosen": -1.7520679235458374, + "logits/rejected": -1.752877116203308, + "logps/chosen": -186.9480743408203, + "logps/rejected": -195.37429809570312, + "loss": 0.8588, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6405542492866516, + "rewards/margins": -0.206780344247818, + "rewards/rejected": -0.4337739050388336, + "step": 304 + }, + { + "epoch": 0.4, + "learning_rate": 4.8928702730451456e-05, + "logits/chosen": -1.9656537771224976, + "logits/rejected": -2.027155876159668, + "logps/chosen": -199.00657653808594, + "logps/rejected": -212.56494140625, + "loss": 0.7799, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6149024963378906, + "rewards/margins": -0.018680021166801453, + "rewards/rejected": -0.5962225198745728, + "step": 305 + }, + { + "epoch": 0.4, + "learning_rate": 4.891830177092294e-05, + "logits/chosen": -1.629424810409546, + "logits/rejected": -1.6878198385238647, + "logps/chosen": -170.93138122558594, + "logps/rejected": -171.99090576171875, + "loss": 0.6828, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.673701822757721, + "rewards/margins": 0.13800469040870667, + "rewards/rejected": -0.8117064237594604, + "step": 306 + }, + { + "epoch": 0.4, + "learning_rate": 4.8907851681151396e-05, + "logits/chosen": -1.7744640111923218, + "logits/rejected": -1.855400800704956, + "logps/chosen": -162.42169189453125, + "logps/rejected": -166.64620971679688, + "loss": 0.9846, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.8018124103546143, + "rewards/margins": -0.3769915699958801, + "rewards/rejected": -0.42482078075408936, + "step": 307 + }, + { + "epoch": 0.4, + "learning_rate": 4.889735248260221e-05, + "logits/chosen": -1.889973521232605, + "logits/rejected": -1.907044768333435, + "logps/chosen": -166.85736083984375, + "logps/rejected": -188.35101318359375, + "loss": 0.6128, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3316620886325836, + "rewards/margins": 0.2465287297964096, + "rewards/rejected": -0.578190803527832, + "step": 308 + }, + { + "epoch": 0.4, + "learning_rate": 4.8886804196841626e-05, + "logits/chosen": -2.047497034072876, + "logits/rejected": -2.0145740509033203, + "logps/chosen": -201.215087890625, + "logps/rejected": -203.3246307373047, + "loss": 0.7635, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.49714764952659607, + "rewards/margins": 0.03132334351539612, + "rewards/rejected": -0.5284709930419922, + "step": 309 + }, + { + "epoch": 0.41, + "learning_rate": 4.887620684553674e-05, + "logits/chosen": -1.831432580947876, + "logits/rejected": -1.826611876487732, + "logps/chosen": -170.79603576660156, + "logps/rejected": -184.5007781982422, + "loss": 0.734, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7777528166770935, + "rewards/margins": 0.054428160190582275, + "rewards/rejected": -0.832180917263031, + "step": 310 + }, + { + "epoch": 0.41, + "learning_rate": 4.886556045045542e-05, + "logits/chosen": -2.050309896469116, + "logits/rejected": -2.0222089290618896, + "logps/chosen": -186.81155395507812, + "logps/rejected": -177.9705810546875, + "loss": 0.7883, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6639207601547241, + "rewards/margins": 0.03769933432340622, + "rewards/rejected": -0.7016200423240662, + "step": 311 + }, + { + "epoch": 0.41, + "learning_rate": 4.8854865033466275e-05, + "logits/chosen": -2.076099395751953, + "logits/rejected": -2.0824105739593506, + "logps/chosen": -179.39295959472656, + "logps/rejected": -174.53573608398438, + "loss": 0.7306, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6415868401527405, + "rewards/margins": 0.04155872389674187, + "rewards/rejected": -0.6831455230712891, + "step": 312 + }, + { + "epoch": 0.41, + "learning_rate": 4.88441206165386e-05, + "logits/chosen": -1.6709840297698975, + "logits/rejected": -1.8081105947494507, + "logps/chosen": -166.87539672851562, + "logps/rejected": -183.02621459960938, + "loss": 0.7271, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.518951952457428, + "rewards/margins": 0.06794089823961258, + "rewards/rejected": -0.58689284324646, + "step": 313 + }, + { + "epoch": 0.41, + "learning_rate": 4.8833327221742356e-05, + "logits/chosen": -1.9388179779052734, + "logits/rejected": -1.9303326606750488, + "logps/chosen": -229.31259155273438, + "logps/rejected": -218.38491821289062, + "loss": 0.9369, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.7209200263023376, + "rewards/margins": -0.36736997961997986, + "rewards/rejected": -0.3535500466823578, + "step": 314 + }, + { + "epoch": 0.41, + "learning_rate": 4.88224848712481e-05, + "logits/chosen": -1.8626773357391357, + "logits/rejected": -1.9242452383041382, + "logps/chosen": -190.93325805664062, + "logps/rejected": -220.92437744140625, + "loss": 0.7958, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9179306030273438, + "rewards/margins": -0.005557693541049957, + "rewards/rejected": -0.9123728275299072, + "step": 315 + }, + { + "epoch": 0.41, + "learning_rate": 4.881159358732694e-05, + "logits/chosen": -1.9244499206542969, + "logits/rejected": -1.9016033411026, + "logps/chosen": -208.19638061523438, + "logps/rejected": -194.61898803710938, + "loss": 0.6003, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5419009923934937, + "rewards/margins": 0.4854595363140106, + "rewards/rejected": -1.0273605585098267, + "step": 316 + }, + { + "epoch": 0.41, + "learning_rate": 4.8800653392350526e-05, + "logits/chosen": -2.065157413482666, + "logits/rejected": -1.979295015335083, + "logps/chosen": -167.4358367919922, + "logps/rejected": -148.04000854492188, + "loss": 0.79, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6589381098747253, + "rewards/margins": -0.009491220116615295, + "rewards/rejected": -0.6494468450546265, + "step": 317 + }, + { + "epoch": 0.42, + "learning_rate": 4.8789664308790936e-05, + "logits/chosen": -1.8643383979797363, + "logits/rejected": -1.801065444946289, + "logps/chosen": -163.1219482421875, + "logps/rejected": -162.10537719726562, + "loss": 0.6148, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.19017738103866577, + "rewards/margins": 0.2843154966831207, + "rewards/rejected": -0.4744928777217865, + "step": 318 + }, + { + "epoch": 0.42, + "learning_rate": 4.8778626359220715e-05, + "logits/chosen": -1.7437247037887573, + "logits/rejected": -1.704676628112793, + "logps/chosen": -152.5662078857422, + "logps/rejected": -159.8699493408203, + "loss": 0.7078, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5272018909454346, + "rewards/margins": 0.054309070110321045, + "rewards/rejected": -0.5815109014511108, + "step": 319 + }, + { + "epoch": 0.42, + "learning_rate": 4.8767539566312734e-05, + "logits/chosen": -1.884958028793335, + "logits/rejected": -1.849034070968628, + "logps/chosen": -162.9790802001953, + "logps/rejected": -154.30078125, + "loss": 0.7916, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.5811585187911987, + "rewards/margins": -0.11695164442062378, + "rewards/rejected": -0.46420690417289734, + "step": 320 + }, + { + "epoch": 0.42, + "learning_rate": 4.875640395284023e-05, + "logits/chosen": -1.9072563648223877, + "logits/rejected": -1.8811615705490112, + "logps/chosen": -173.24008178710938, + "logps/rejected": -179.58273315429688, + "loss": 0.7763, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.484822154045105, + "rewards/margins": -0.08394555747509003, + "rewards/rejected": -0.40087658166885376, + "step": 321 + }, + { + "epoch": 0.42, + "learning_rate": 4.874521954167671e-05, + "logits/chosen": -1.8919446468353271, + "logits/rejected": -1.9220166206359863, + "logps/chosen": -158.00631713867188, + "logps/rejected": -158.1962890625, + "loss": 0.6527, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2469102144241333, + "rewards/margins": 0.14598041772842407, + "rewards/rejected": -0.39289066195487976, + "step": 322 + }, + { + "epoch": 0.42, + "learning_rate": 4.8733986355795905e-05, + "logits/chosen": -2.0576484203338623, + "logits/rejected": -1.957137107849121, + "logps/chosen": -183.29859924316406, + "logps/rejected": -164.59544372558594, + "loss": 0.6596, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.19953250885009766, + "rewards/margins": 0.1992800384759903, + "rewards/rejected": -0.39881253242492676, + "step": 323 + }, + { + "epoch": 0.42, + "learning_rate": 4.8722704418271745e-05, + "logits/chosen": -1.9749754667282104, + "logits/rejected": -1.918540120124817, + "logps/chosen": -167.0876922607422, + "logps/rejected": -155.88064575195312, + "loss": 0.8313, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.398946613073349, + "rewards/margins": -0.19024060666561127, + "rewards/rejected": -0.20870603621006012, + "step": 324 + }, + { + "epoch": 0.43, + "learning_rate": 4.871137375227829e-05, + "logits/chosen": -1.6178803443908691, + "logits/rejected": -1.73251211643219, + "logps/chosen": -267.42718505859375, + "logps/rejected": -300.83477783203125, + "loss": 0.8896, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.6622705459594727, + "rewards/margins": -0.21010492742061615, + "rewards/rejected": -0.4521656334400177, + "step": 325 + }, + { + "epoch": 0.43, + "learning_rate": 4.869999438108971e-05, + "logits/chosen": -1.888526439666748, + "logits/rejected": -1.8802413940429688, + "logps/chosen": -190.6936798095703, + "logps/rejected": -198.58245849609375, + "loss": 0.8107, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6914330124855042, + "rewards/margins": -0.09704277664422989, + "rewards/rejected": -0.5943902134895325, + "step": 326 + }, + { + "epoch": 0.43, + "learning_rate": 4.8688566328080215e-05, + "logits/chosen": -1.5124105215072632, + "logits/rejected": -1.5067615509033203, + "logps/chosen": -282.9752502441406, + "logps/rejected": -281.37933349609375, + "loss": 0.7879, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5186710953712463, + "rewards/margins": -0.08422104269266129, + "rewards/rejected": -0.4344501197338104, + "step": 327 + }, + { + "epoch": 0.43, + "learning_rate": 4.867708961672399e-05, + "logits/chosen": -1.8901722431182861, + "logits/rejected": -1.8916290998458862, + "logps/chosen": -194.58010864257812, + "logps/rejected": -206.20706176757812, + "loss": 0.7058, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2762334942817688, + "rewards/margins": 0.04776221513748169, + "rewards/rejected": -0.3239956796169281, + "step": 328 + }, + { + "epoch": 0.43, + "learning_rate": 4.866556427059519e-05, + "logits/chosen": -1.9781274795532227, + "logits/rejected": -1.991908073425293, + "logps/chosen": -190.02462768554688, + "logps/rejected": -172.36788940429688, + "loss": 0.6999, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3898717761039734, + "rewards/margins": 0.08414015173912048, + "rewards/rejected": -0.47401195764541626, + "step": 329 + }, + { + "epoch": 0.43, + "learning_rate": 4.865399031336787e-05, + "logits/chosen": -1.7196893692016602, + "logits/rejected": -1.7024250030517578, + "logps/chosen": -183.65408325195312, + "logps/rejected": -183.18309020996094, + "loss": 0.6556, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18018794059753418, + "rewards/margins": 0.1877795308828354, + "rewards/rejected": -0.3679674565792084, + "step": 330 + }, + { + "epoch": 0.43, + "learning_rate": 4.8642367768815936e-05, + "logits/chosen": -1.82463800907135, + "logits/rejected": -1.9342741966247559, + "logps/chosen": -136.54745483398438, + "logps/rejected": -149.12637329101562, + "loss": 0.7994, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.28317660093307495, + "rewards/margins": -0.111660435795784, + "rewards/rejected": -0.17151619493961334, + "step": 331 + }, + { + "epoch": 0.43, + "learning_rate": 4.863069666081307e-05, + "logits/chosen": -1.9666064977645874, + "logits/rejected": -1.9600831270217896, + "logps/chosen": -160.50009155273438, + "logps/rejected": -172.79641723632812, + "loss": 0.7938, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.3549930453300476, + "rewards/margins": -0.036753974854946136, + "rewards/rejected": -0.3182390332221985, + "step": 332 + }, + { + "epoch": 0.44, + "learning_rate": 4.861897701333274e-05, + "logits/chosen": -1.7229609489440918, + "logits/rejected": -1.7512197494506836, + "logps/chosen": -160.8597869873047, + "logps/rejected": -182.94717407226562, + "loss": 0.7614, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5264466404914856, + "rewards/margins": -0.05138474702835083, + "rewards/rejected": -0.4750618636608124, + "step": 333 + }, + { + "epoch": 0.44, + "learning_rate": 4.86072088504481e-05, + "logits/chosen": -1.8726041316986084, + "logits/rejected": -1.8346372842788696, + "logps/chosen": -193.1851348876953, + "logps/rejected": -175.16256713867188, + "loss": 0.7167, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04130769148468971, + "rewards/margins": 0.13345830142498016, + "rewards/rejected": -0.09215061366558075, + "step": 334 + }, + { + "epoch": 0.44, + "learning_rate": 4.859539219633199e-05, + "logits/chosen": -1.6277759075164795, + "logits/rejected": -1.650618553161621, + "logps/chosen": -192.23538208007812, + "logps/rejected": -195.0891876220703, + "loss": 0.9001, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2764393389225006, + "rewards/margins": -0.1653566211462021, + "rewards/rejected": -0.11108270287513733, + "step": 335 + }, + { + "epoch": 0.44, + "learning_rate": 4.8583527075256804e-05, + "logits/chosen": -1.9752824306488037, + "logits/rejected": -1.976406455039978, + "logps/chosen": -215.763916015625, + "logps/rejected": -222.56468200683594, + "loss": 0.6023, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3105733394622803, + "rewards/margins": 0.3618737459182739, + "rewards/rejected": -0.6724470853805542, + "step": 336 + }, + { + "epoch": 0.44, + "learning_rate": 4.857161351159454e-05, + "logits/chosen": -1.7939121723175049, + "logits/rejected": -1.8076589107513428, + "logps/chosen": -175.3466033935547, + "logps/rejected": -176.5845947265625, + "loss": 0.6957, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.10015758872032166, + "rewards/margins": 0.10513995587825775, + "rewards/rejected": -0.2052975744009018, + "step": 337 + }, + { + "epoch": 0.44, + "learning_rate": 4.8559651529816664e-05, + "logits/chosen": -1.7438242435455322, + "logits/rejected": -1.7907154560089111, + "logps/chosen": -190.70948791503906, + "logps/rejected": -204.99143981933594, + "loss": 0.6727, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.06472301483154297, + "rewards/margins": 0.1332576423883438, + "rewards/rejected": -0.19798064231872559, + "step": 338 + }, + { + "epoch": 0.44, + "learning_rate": 4.854764115449411e-05, + "logits/chosen": -1.7970941066741943, + "logits/rejected": -1.8352383375167847, + "logps/chosen": -167.88230895996094, + "logps/rejected": -166.8907928466797, + "loss": 0.6771, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2522534728050232, + "rewards/margins": 0.06870199739933014, + "rewards/rejected": -0.3209554851055145, + "step": 339 + }, + { + "epoch": 0.44, + "learning_rate": 4.853558241029723e-05, + "logits/chosen": -1.9054516553878784, + "logits/rejected": -1.9092512130737305, + "logps/chosen": -173.30734252929688, + "logps/rejected": -189.3043975830078, + "loss": 0.7071, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2111372947692871, + "rewards/margins": 0.07802311331033707, + "rewards/rejected": -0.2891604006290436, + "step": 340 + }, + { + "epoch": 0.45, + "learning_rate": 4.8523475321995715e-05, + "logits/chosen": -1.6928297281265259, + "logits/rejected": -1.7166639566421509, + "logps/chosen": -163.96253967285156, + "logps/rejected": -157.01870727539062, + "loss": 0.6892, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.023365147411823273, + "rewards/margins": 0.09150275588035583, + "rewards/rejected": -0.06813760101795197, + "step": 341 + }, + { + "epoch": 0.45, + "learning_rate": 4.8511319914458555e-05, + "logits/chosen": -1.5925623178482056, + "logits/rejected": -1.6342376470565796, + "logps/chosen": -169.8107452392578, + "logps/rejected": -174.4763641357422, + "loss": 0.7952, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6270018815994263, + "rewards/margins": -0.09866765886545181, + "rewards/rejected": -0.5283341407775879, + "step": 342 + }, + { + "epoch": 0.45, + "learning_rate": 4.849911621265401e-05, + "logits/chosen": -1.6875545978546143, + "logits/rejected": -1.6878042221069336, + "logps/chosen": -160.625732421875, + "logps/rejected": -186.27923583984375, + "loss": 0.6322, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.33780720829963684, + "rewards/margins": 0.20755568146705627, + "rewards/rejected": -0.5453628301620483, + "step": 343 + }, + { + "epoch": 0.45, + "learning_rate": 4.848686424164953e-05, + "logits/chosen": -1.846010684967041, + "logits/rejected": -1.860701084136963, + "logps/chosen": -181.06878662109375, + "logps/rejected": -178.49310302734375, + "loss": 0.7739, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3998520076274872, + "rewards/margins": -0.03460027277469635, + "rewards/rejected": -0.36525171995162964, + "step": 344 + }, + { + "epoch": 0.45, + "learning_rate": 4.84745640266117e-05, + "logits/chosen": -1.8941820859909058, + "logits/rejected": -1.8956482410430908, + "logps/chosen": -228.6375732421875, + "logps/rejected": -222.01824951171875, + "loss": 0.7173, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.3104371130466461, + "rewards/margins": 0.0530361533164978, + "rewards/rejected": -0.3634732663631439, + "step": 345 + }, + { + "epoch": 0.45, + "learning_rate": 4.846221559280624e-05, + "logits/chosen": -1.8459906578063965, + "logits/rejected": -1.9024940729141235, + "logps/chosen": -159.93936157226562, + "logps/rejected": -165.43307495117188, + "loss": 0.896, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.23201912641525269, + "rewards/margins": -0.12730032205581665, + "rewards/rejected": -0.10471877455711365, + "step": 346 + }, + { + "epoch": 0.45, + "learning_rate": 4.844981896559787e-05, + "logits/chosen": -2.048933506011963, + "logits/rejected": -2.061128616333008, + "logps/chosen": -181.51718139648438, + "logps/rejected": -185.25296020507812, + "loss": 0.8408, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.32012802362442017, + "rewards/margins": -0.20446370542049408, + "rewards/rejected": -0.1156642958521843, + "step": 347 + }, + { + "epoch": 0.46, + "learning_rate": 4.8437374170450344e-05, + "logits/chosen": -1.9143999814987183, + "logits/rejected": -1.9218837022781372, + "logps/chosen": -182.51480102539062, + "logps/rejected": -182.2652130126953, + "loss": 0.8078, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6757728457450867, + "rewards/margins": -0.08970170468091965, + "rewards/rejected": -0.5860711932182312, + "step": 348 + }, + { + "epoch": 0.46, + "learning_rate": 4.842488123292632e-05, + "logits/chosen": -1.736176609992981, + "logits/rejected": -1.7229468822479248, + "logps/chosen": -159.5717315673828, + "logps/rejected": -189.70269775390625, + "loss": 0.6977, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2442682683467865, + "rewards/margins": 0.0333622470498085, + "rewards/rejected": -0.2776305675506592, + "step": 349 + }, + { + "epoch": 0.46, + "learning_rate": 4.8412340178687374e-05, + "logits/chosen": -1.6423401832580566, + "logits/rejected": -1.689012050628662, + "logps/chosen": -151.13458251953125, + "logps/rejected": -175.60679626464844, + "loss": 0.8116, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.33638978004455566, + "rewards/margins": -0.15178707242012024, + "rewards/rejected": -0.18460272252559662, + "step": 350 + }, + { + "epoch": 0.46, + "learning_rate": 4.839975103349391e-05, + "logits/chosen": -1.8096094131469727, + "logits/rejected": -1.8605788946151733, + "logps/chosen": -158.09568786621094, + "logps/rejected": -172.2019805908203, + "loss": 0.8539, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.17084676027297974, + "rewards/margins": -0.16634216904640198, + "rewards/rejected": -0.004504583775997162, + "step": 351 + }, + { + "epoch": 0.46, + "learning_rate": 4.8387113823205096e-05, + "logits/chosen": -1.8915197849273682, + "logits/rejected": -1.9206541776657104, + "logps/chosen": -173.2298583984375, + "logps/rejected": -181.8549346923828, + "loss": 0.6712, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.22103241086006165, + "rewards/margins": 0.12150835990905762, + "rewards/rejected": -0.34254080057144165, + "step": 352 + }, + { + "epoch": 0.46, + "learning_rate": 4.8374428573778864e-05, + "logits/chosen": -1.9554250240325928, + "logits/rejected": -2.0212881565093994, + "logps/chosen": -188.2921142578125, + "logps/rejected": -199.60987854003906, + "loss": 0.7256, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0011289417743682861, + "rewards/margins": 0.15477600693702698, + "rewards/rejected": -0.1536470353603363, + "step": 353 + }, + { + "epoch": 0.46, + "learning_rate": 4.8361695311271795e-05, + "logits/chosen": -1.626081943511963, + "logits/rejected": -1.5311224460601807, + "logps/chosen": -185.79702758789062, + "logps/rejected": -203.40957641601562, + "loss": 0.8635, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.516851007938385, + "rewards/margins": -0.19476839900016785, + "rewards/rejected": -0.32208263874053955, + "step": 354 + }, + { + "epoch": 0.46, + "learning_rate": 4.83489140618391e-05, + "logits/chosen": -1.7895288467407227, + "logits/rejected": -1.753225564956665, + "logps/chosen": -217.21707153320312, + "logps/rejected": -191.97915649414062, + "loss": 0.7744, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.4739922285079956, + "rewards/margins": -0.06083906441926956, + "rewards/rejected": -0.41315317153930664, + "step": 355 + }, + { + "epoch": 0.47, + "learning_rate": 4.833608485173457e-05, + "logits/chosen": -1.9408211708068848, + "logits/rejected": -1.916908860206604, + "logps/chosen": -210.69586181640625, + "logps/rejected": -233.63662719726562, + "loss": 0.7485, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3478991985321045, + "rewards/margins": -0.0014675185084342957, + "rewards/rejected": -0.3464316725730896, + "step": 356 + }, + { + "epoch": 0.47, + "learning_rate": 4.8323207707310496e-05, + "logits/chosen": -2.0299384593963623, + "logits/rejected": -2.0607504844665527, + "logps/chosen": -186.0093231201172, + "logps/rejected": -179.7775115966797, + "loss": 0.6698, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.22601480782032013, + "rewards/margins": 0.12358909845352173, + "rewards/rejected": -0.34960389137268066, + "step": 357 + }, + { + "epoch": 0.47, + "learning_rate": 4.831028265501764e-05, + "logits/chosen": -1.7695857286453247, + "logits/rejected": -1.7599815130233765, + "logps/chosen": -161.50375366210938, + "logps/rejected": -168.04541015625, + "loss": 0.6868, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.32897791266441345, + "rewards/margins": 0.10846008360385895, + "rewards/rejected": -0.437438040971756, + "step": 358 + }, + { + "epoch": 0.47, + "learning_rate": 4.829730972140517e-05, + "logits/chosen": -1.8524454832077026, + "logits/rejected": -1.9345438480377197, + "logps/chosen": -149.3410186767578, + "logps/rejected": -160.43539428710938, + "loss": 0.7965, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.08119592815637589, + "rewards/margins": -0.04433928430080414, + "rewards/rejected": -0.03685663640499115, + "step": 359 + }, + { + "epoch": 0.47, + "learning_rate": 4.8284288933120594e-05, + "logits/chosen": -1.8411493301391602, + "logits/rejected": -1.8524004220962524, + "logps/chosen": -175.48532104492188, + "logps/rejected": -182.57582092285156, + "loss": 0.5989, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.46560171246528625, + "rewards/margins": 0.3886818289756775, + "rewards/rejected": -0.8542835712432861, + "step": 360 + }, + { + "epoch": 0.47, + "learning_rate": 4.8271220316909735e-05, + "logits/chosen": -1.687551498413086, + "logits/rejected": -1.722497582435608, + "logps/chosen": -167.26939392089844, + "logps/rejected": -181.43814086914062, + "loss": 0.9314, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5299715399742126, + "rewards/margins": -0.17927514016628265, + "rewards/rejected": -0.35069650411605835, + "step": 361 + }, + { + "epoch": 0.47, + "learning_rate": 4.825810389961666e-05, + "logits/chosen": -1.916797399520874, + "logits/rejected": -1.9123202562332153, + "logps/chosen": -198.55821228027344, + "logps/rejected": -225.2833709716797, + "loss": 0.8431, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.3632412254810333, + "rewards/margins": -0.22463062405586243, + "rewards/rejected": -0.1386105716228485, + "step": 362 + }, + { + "epoch": 0.48, + "learning_rate": 4.8244939708183596e-05, + "logits/chosen": -1.6189442873001099, + "logits/rejected": -1.6897720098495483, + "logps/chosen": -186.33004760742188, + "logps/rejected": -181.720947265625, + "loss": 0.8053, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.27750128507614136, + "rewards/margins": -0.1327960044145584, + "rewards/rejected": 0.41029733419418335, + "step": 363 + }, + { + "epoch": 0.48, + "learning_rate": 4.823172776965094e-05, + "logits/chosen": -2.003798246383667, + "logits/rejected": -1.9069428443908691, + "logps/chosen": -206.22410583496094, + "logps/rejected": -202.70343017578125, + "loss": 0.7931, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.07843560725450516, + "rewards/margins": 0.052309781312942505, + "rewards/rejected": -0.13074536621570587, + "step": 364 + }, + { + "epoch": 0.48, + "learning_rate": 4.821846811115713e-05, + "logits/chosen": -1.4173839092254639, + "logits/rejected": -1.469193696975708, + "logps/chosen": -252.58900451660156, + "logps/rejected": -266.8291015625, + "loss": 0.7776, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.2866368591785431, + "rewards/margins": -0.01211586594581604, + "rewards/rejected": -0.27452099323272705, + "step": 365 + }, + { + "epoch": 0.48, + "learning_rate": 4.820516075993865e-05, + "logits/chosen": -1.9226216077804565, + "logits/rejected": -1.8626333475112915, + "logps/chosen": -189.990966796875, + "logps/rejected": -212.71446228027344, + "loss": 0.7835, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2961033880710602, + "rewards/margins": -0.048593662679195404, + "rewards/rejected": -0.24750974774360657, + "step": 366 + }, + { + "epoch": 0.48, + "learning_rate": 4.819180574332994e-05, + "logits/chosen": -2.1082520484924316, + "logits/rejected": -2.0929677486419678, + "logps/chosen": -172.19317626953125, + "logps/rejected": -166.5411376953125, + "loss": 0.7558, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.3426204323768616, + "rewards/margins": 0.029606737196445465, + "rewards/rejected": -0.37222716212272644, + "step": 367 + }, + { + "epoch": 0.48, + "learning_rate": 4.8178403088763355e-05, + "logits/chosen": -1.8143612146377563, + "logits/rejected": -1.8444868326187134, + "logps/chosen": -219.51095581054688, + "logps/rejected": -223.00332641601562, + "loss": 0.7669, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5216841697692871, + "rewards/margins": -0.06673192977905273, + "rewards/rejected": -0.454952210187912, + "step": 368 + }, + { + "epoch": 0.48, + "learning_rate": 4.8164952823769085e-05, + "logits/chosen": -2.0486927032470703, + "logits/rejected": -1.986092209815979, + "logps/chosen": -179.5719757080078, + "logps/rejected": -176.9426727294922, + "loss": 0.7705, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.12122049927711487, + "rewards/margins": -0.017412271350622177, + "rewards/rejected": -0.1038082093000412, + "step": 369 + }, + { + "epoch": 0.48, + "learning_rate": 4.815145497597514e-05, + "logits/chosen": -1.6093693971633911, + "logits/rejected": -1.6691244840621948, + "logps/chosen": -220.64230346679688, + "logps/rejected": -226.87054443359375, + "loss": 0.739, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.48053669929504395, + "rewards/margins": 0.01233639195561409, + "rewards/rejected": -0.4928731620311737, + "step": 370 + }, + { + "epoch": 0.49, + "learning_rate": 4.8137909573107246e-05, + "logits/chosen": -1.5354715585708618, + "logits/rejected": -1.4911106824874878, + "logps/chosen": -173.0819549560547, + "logps/rejected": -171.91053771972656, + "loss": 0.7285, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.09934857487678528, + "rewards/margins": 0.08546656370162964, + "rewards/rejected": -0.18481513857841492, + "step": 371 + }, + { + "epoch": 0.49, + "learning_rate": 4.812431664298883e-05, + "logits/chosen": -1.8645007610321045, + "logits/rejected": -1.8654489517211914, + "logps/chosen": -174.5074005126953, + "logps/rejected": -174.8561248779297, + "loss": 0.7322, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.34228798747062683, + "rewards/margins": 0.07575173676013947, + "rewards/rejected": -0.4180397093296051, + "step": 372 + }, + { + "epoch": 0.49, + "learning_rate": 4.811067621354094e-05, + "logits/chosen": -1.672195315361023, + "logits/rejected": -1.7209053039550781, + "logps/chosen": -179.9447784423828, + "logps/rejected": -163.33111572265625, + "loss": 0.9538, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.3875757157802582, + "rewards/margins": -0.4043459892272949, + "rewards/rejected": 0.016770271584391594, + "step": 373 + }, + { + "epoch": 0.49, + "learning_rate": 4.8096988312782174e-05, + "logits/chosen": -2.031759023666382, + "logits/rejected": -2.071578025817871, + "logps/chosen": -179.43516540527344, + "logps/rejected": -177.50064086914062, + "loss": 0.8834, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.4397561550140381, + "rewards/margins": -0.1974533647298813, + "rewards/rejected": -0.2423027753829956, + "step": 374 + }, + { + "epoch": 0.49, + "learning_rate": 4.8083252968828665e-05, + "logits/chosen": -1.9421418905258179, + "logits/rejected": -1.922428846359253, + "logps/chosen": -142.18792724609375, + "logps/rejected": -137.76077270507812, + "loss": 0.5961, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.12917383015155792, + "rewards/margins": 0.23805946111679077, + "rewards/rejected": -0.3672332763671875, + "step": 375 + }, + { + "epoch": 0.49, + "learning_rate": 4.8069470209893974e-05, + "logits/chosen": -1.8579202890396118, + "logits/rejected": -1.7998918294906616, + "logps/chosen": -155.51712036132812, + "logps/rejected": -156.0476531982422, + "loss": 0.7183, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.02486548200249672, + "rewards/margins": 0.029943522065877914, + "rewards/rejected": -0.054809004068374634, + "step": 376 + }, + { + "epoch": 0.49, + "learning_rate": 4.8055640064289086e-05, + "logits/chosen": -1.9409297704696655, + "logits/rejected": -1.911987066268921, + "logps/chosen": -245.75701904296875, + "logps/rejected": -248.6239013671875, + "loss": 0.8102, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.4732462763786316, + "rewards/margins": -0.1539333313703537, + "rewards/rejected": -0.3193129599094391, + "step": 377 + }, + { + "epoch": 0.49, + "learning_rate": 4.80417625604223e-05, + "logits/chosen": -1.8907678127288818, + "logits/rejected": -1.8695294857025146, + "logps/chosen": -177.34124755859375, + "logps/rejected": -179.99476623535156, + "loss": 0.6441, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.10845950245857239, + "rewards/margins": 0.19328270852565765, + "rewards/rejected": -0.3017422556877136, + "step": 378 + }, + { + "epoch": 0.5, + "learning_rate": 4.8027837726799205e-05, + "logits/chosen": -1.8234997987747192, + "logits/rejected": -1.8477734327316284, + "logps/chosen": -152.91793823242188, + "logps/rejected": -166.2785186767578, + "loss": 0.7019, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.19078156352043152, + "rewards/margins": 0.07663966715335846, + "rewards/rejected": -0.26742124557495117, + "step": 379 + }, + { + "epoch": 0.5, + "learning_rate": 4.801386559202259e-05, + "logits/chosen": -1.9331165552139282, + "logits/rejected": -1.9343795776367188, + "logps/chosen": -197.23309326171875, + "logps/rejected": -216.14816284179688, + "loss": 0.6427, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1457238346338272, + "rewards/margins": 0.22981032729148865, + "rewards/rejected": -0.37553414702415466, + "step": 380 + }, + { + "epoch": 0.5, + "learning_rate": 4.799984618479242e-05, + "logits/chosen": -1.7535991668701172, + "logits/rejected": -1.8327221870422363, + "logps/chosen": -170.40121459960938, + "logps/rejected": -194.4297332763672, + "loss": 0.7553, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2612544000148773, + "rewards/margins": 0.023236550390720367, + "rewards/rejected": -0.2844909727573395, + "step": 381 + }, + { + "epoch": 0.5, + "learning_rate": 4.798577953390577e-05, + "logits/chosen": -1.9297330379486084, + "logits/rejected": -1.8824340105056763, + "logps/chosen": -192.4176788330078, + "logps/rejected": -204.23397827148438, + "loss": 0.7058, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.016188140958547592, + "rewards/margins": 0.1552063524723053, + "rewards/rejected": -0.1713944971561432, + "step": 382 + }, + { + "epoch": 0.5, + "learning_rate": 4.797166566825675e-05, + "logits/chosen": -1.983964204788208, + "logits/rejected": -2.0162336826324463, + "logps/chosen": -165.31370544433594, + "logps/rejected": -175.11459350585938, + "loss": 0.8134, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.43316060304641724, + "rewards/margins": -0.14317026734352112, + "rewards/rejected": -0.2899903357028961, + "step": 383 + }, + { + "epoch": 0.5, + "learning_rate": 4.795750461683644e-05, + "logits/chosen": -1.7382255792617798, + "logits/rejected": -1.7152175903320312, + "logps/chosen": -162.63970947265625, + "logps/rejected": -167.93568420410156, + "loss": 0.8503, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3506108224391937, + "rewards/margins": -0.19645802676677704, + "rewards/rejected": -0.1541527807712555, + "step": 384 + }, + { + "epoch": 0.5, + "learning_rate": 4.794329640873285e-05, + "logits/chosen": -1.9835039377212524, + "logits/rejected": -1.9371455907821655, + "logps/chosen": -164.90518188476562, + "logps/rejected": -153.89732360839844, + "loss": 0.8365, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.10440421104431152, + "rewards/margins": -0.1423448920249939, + "rewards/rejected": 0.03794068843126297, + "step": 385 + }, + { + "epoch": 0.51, + "learning_rate": 4.7929041073130867e-05, + "logits/chosen": -1.6812117099761963, + "logits/rejected": -1.7789117097854614, + "logps/chosen": -171.525390625, + "logps/rejected": -194.28512573242188, + "loss": 0.6979, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.21003414690494537, + "rewards/margins": 0.1351175457239151, + "rewards/rejected": 0.07491665333509445, + "step": 386 + }, + { + "epoch": 0.51, + "learning_rate": 4.7914738639312165e-05, + "logits/chosen": -1.9188036918640137, + "logits/rejected": -1.9109784364700317, + "logps/chosen": -189.7833251953125, + "logps/rejected": -164.029541015625, + "loss": 0.8337, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.3247818648815155, + "rewards/margins": -0.18027785420417786, + "rewards/rejected": -0.14450398087501526, + "step": 387 + }, + { + "epoch": 0.51, + "learning_rate": 4.790038913665519e-05, + "logits/chosen": -2.0011377334594727, + "logits/rejected": -2.060800552368164, + "logps/chosen": -176.84857177734375, + "logps/rejected": -189.64443969726562, + "loss": 0.7481, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.09154945611953735, + "rewards/margins": -0.03873196616768837, + "rewards/rejected": -0.052817486226558685, + "step": 388 + }, + { + "epoch": 0.51, + "learning_rate": 4.788599259463502e-05, + "logits/chosen": -1.8452333211898804, + "logits/rejected": -1.8431695699691772, + "logps/chosen": -154.12435913085938, + "logps/rejected": -155.32220458984375, + "loss": 0.7097, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.14383897185325623, + "rewards/margins": 0.18797016143798828, + "rewards/rejected": -0.3318091332912445, + "step": 389 + }, + { + "epoch": 0.51, + "learning_rate": 4.787154904282341e-05, + "logits/chosen": -1.3743209838867188, + "logits/rejected": -1.4174141883850098, + "logps/chosen": -177.5303192138672, + "logps/rejected": -211.40457153320312, + "loss": 0.5768, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.2827177047729492, + "rewards/margins": 0.42551764845848083, + "rewards/rejected": -0.1427999585866928, + "step": 390 + }, + { + "epoch": 0.51, + "learning_rate": 4.7857058510888645e-05, + "logits/chosen": -2.1841466426849365, + "logits/rejected": -2.124525547027588, + "logps/chosen": -246.4276885986328, + "logps/rejected": -244.47906494140625, + "loss": 0.6631, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.038991779088974, + "rewards/margins": 0.20248231291770935, + "rewards/rejected": -0.24147410690784454, + "step": 391 + }, + { + "epoch": 0.51, + "learning_rate": 4.7842521028595526e-05, + "logits/chosen": -1.8262689113616943, + "logits/rejected": -1.8402464389801025, + "logps/chosen": -158.2376708984375, + "logps/rejected": -177.70938110351562, + "loss": 0.7674, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.19041013717651367, + "rewards/margins": 0.02704358845949173, + "rewards/rejected": -0.2174537032842636, + "step": 392 + }, + { + "epoch": 0.51, + "learning_rate": 4.7827936625805284e-05, + "logits/chosen": -1.9320769309997559, + "logits/rejected": -1.9378974437713623, + "logps/chosen": -162.44107055664062, + "logps/rejected": -162.588623046875, + "loss": 0.702, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.22483624517917633, + "rewards/margins": 0.04632706940174103, + "rewards/rejected": 0.1785091906785965, + "step": 393 + }, + { + "epoch": 0.52, + "learning_rate": 4.7813305332475535e-05, + "logits/chosen": -2.023815870285034, + "logits/rejected": -2.1101415157318115, + "logps/chosen": -161.48867797851562, + "logps/rejected": -177.5762939453125, + "loss": 0.7641, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2686167359352112, + "rewards/margins": -0.034188684076070786, + "rewards/rejected": -0.2344280332326889, + "step": 394 + }, + { + "epoch": 0.52, + "learning_rate": 4.77986271786602e-05, + "logits/chosen": -1.9036113023757935, + "logits/rejected": -1.8756999969482422, + "logps/chosen": -200.5419464111328, + "logps/rejected": -212.63906860351562, + "loss": 0.6506, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.06710982322692871, + "rewards/margins": 0.18568173050880432, + "rewards/rejected": -0.11857189238071442, + "step": 395 + }, + { + "epoch": 0.52, + "learning_rate": 4.778390219450949e-05, + "logits/chosen": -1.8086354732513428, + "logits/rejected": -1.8471179008483887, + "logps/chosen": -152.69277954101562, + "logps/rejected": -143.47000122070312, + "loss": 0.6566, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.007534712553024292, + "rewards/margins": 0.15669256448745728, + "rewards/rejected": -0.14915785193443298, + "step": 396 + }, + { + "epoch": 0.52, + "learning_rate": 4.776913041026976e-05, + "logits/chosen": -2.1575872898101807, + "logits/rejected": -2.189612627029419, + "logps/chosen": -178.6571044921875, + "logps/rejected": -187.46389770507812, + "loss": 0.8387, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.3838292360305786, + "rewards/margins": -0.1892194300889969, + "rewards/rejected": -0.19460979104042053, + "step": 397 + }, + { + "epoch": 0.52, + "learning_rate": 4.775431185628353e-05, + "logits/chosen": -2.0314245223999023, + "logits/rejected": -2.0670695304870605, + "logps/chosen": -139.46705627441406, + "logps/rejected": -137.02342224121094, + "loss": 0.799, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.3406837582588196, + "rewards/margins": -0.09202456474304199, + "rewards/rejected": -0.24865922331809998, + "step": 398 + }, + { + "epoch": 0.52, + "learning_rate": 4.7739446562989384e-05, + "logits/chosen": -1.7543888092041016, + "logits/rejected": -1.8098934888839722, + "logps/chosen": -171.0781707763672, + "logps/rejected": -192.0042724609375, + "loss": 0.7873, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.4363712966442108, + "rewards/margins": 0.039258234202861786, + "rewards/rejected": -0.4756295382976532, + "step": 399 + }, + { + "epoch": 0.52, + "learning_rate": 4.772453456092191e-05, + "logits/chosen": -1.8036949634552002, + "logits/rejected": -1.8187798261642456, + "logps/chosen": -176.441650390625, + "logps/rejected": -187.6487274169922, + "loss": 0.7976, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.22391349077224731, + "rewards/margins": -0.052816301584243774, + "rewards/rejected": -0.17109718918800354, + "step": 400 + }, + { + "epoch": 0.52, + "learning_rate": 4.7709575880711634e-05, + "logits/chosen": -2.0721547603607178, + "logits/rejected": -2.05245041847229, + "logps/chosen": -181.74232482910156, + "logps/rejected": -187.65249633789062, + "loss": 0.5921, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2778853476047516, + "rewards/margins": 0.33293941617012024, + "rewards/rejected": -0.05505405738949776, + "step": 401 + }, + { + "epoch": 0.53, + "learning_rate": 4.769457055308497e-05, + "logits/chosen": -2.026765823364258, + "logits/rejected": -2.018843412399292, + "logps/chosen": -197.64260864257812, + "logps/rejected": -179.19317626953125, + "loss": 0.9551, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.30001845955848694, + "rewards/margins": -0.15433457493782043, + "rewards/rejected": -0.1456838697195053, + "step": 402 + }, + { + "epoch": 0.53, + "learning_rate": 4.767951860886415e-05, + "logits/chosen": -1.7545514106750488, + "logits/rejected": -1.7311463356018066, + "logps/chosen": -189.0639190673828, + "logps/rejected": -202.9300537109375, + "loss": 0.7032, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.16693828999996185, + "rewards/margins": 0.15034297108650208, + "rewards/rejected": -0.3172812759876251, + "step": 403 + }, + { + "epoch": 0.53, + "learning_rate": 4.766442007896715e-05, + "logits/chosen": -1.3602584600448608, + "logits/rejected": -1.3138482570648193, + "logps/chosen": -229.68812561035156, + "logps/rejected": -228.81280517578125, + "loss": 0.7063, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.30803146958351135, + "rewards/margins": 0.05981824919581413, + "rewards/rejected": -0.3678497076034546, + "step": 404 + }, + { + "epoch": 0.53, + "learning_rate": 4.764927499440767e-05, + "logits/chosen": -1.2695108652114868, + "logits/rejected": -1.2994788885116577, + "logps/chosen": -178.60507202148438, + "logps/rejected": -209.27049255371094, + "loss": 0.9922, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.590323805809021, + "rewards/margins": -0.39020806550979614, + "rewards/rejected": -0.20011577010154724, + "step": 405 + }, + { + "epoch": 0.53, + "learning_rate": 4.763408338629498e-05, + "logits/chosen": -2.1045475006103516, + "logits/rejected": -2.1285929679870605, + "logps/chosen": -230.02374267578125, + "logps/rejected": -225.12106323242188, + "loss": 0.8737, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.33914846181869507, + "rewards/margins": -0.23797425627708435, + "rewards/rejected": -0.10117418318986893, + "step": 406 + }, + { + "epoch": 0.53, + "learning_rate": 4.761884528583396e-05, + "logits/chosen": -1.4888020753860474, + "logits/rejected": -1.4826213121414185, + "logps/chosen": -221.777587890625, + "logps/rejected": -241.30885314941406, + "loss": 0.7111, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6717454791069031, + "rewards/margins": 0.279751718044281, + "rewards/rejected": -0.9514971375465393, + "step": 407 + }, + { + "epoch": 0.53, + "learning_rate": 4.760356072432498e-05, + "logits/chosen": -1.8832398653030396, + "logits/rejected": -2.041220188140869, + "logps/chosen": -290.4461364746094, + "logps/rejected": -303.868408203125, + "loss": 0.747, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.1965964436531067, + "rewards/margins": 0.01566828042268753, + "rewards/rejected": 0.1809280961751938, + "step": 408 + }, + { + "epoch": 0.54, + "learning_rate": 4.7588229733163834e-05, + "logits/chosen": -1.9910494089126587, + "logits/rejected": -2.065354585647583, + "logps/chosen": -189.50331115722656, + "logps/rejected": -204.8780975341797, + "loss": 0.7243, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7861883640289307, + "rewards/margins": 0.04858472943305969, + "rewards/rejected": -0.834773063659668, + "step": 409 + }, + { + "epoch": 0.54, + "learning_rate": 4.757285234384169e-05, + "logits/chosen": -1.94736909866333, + "logits/rejected": -2.0690090656280518, + "logps/chosen": -183.47666931152344, + "logps/rejected": -199.96681213378906, + "loss": 0.7047, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5590149164199829, + "rewards/margins": 0.15124721825122833, + "rewards/rejected": -0.7102621793746948, + "step": 410 + }, + { + "epoch": 0.54, + "learning_rate": 4.755742858794503e-05, + "logits/chosen": -2.1221137046813965, + "logits/rejected": -2.069035291671753, + "logps/chosen": -203.06430053710938, + "logps/rejected": -187.47093200683594, + "loss": 0.7289, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4738054871559143, + "rewards/margins": 0.03721272572875023, + "rewards/rejected": -0.5110181570053101, + "step": 411 + }, + { + "epoch": 0.54, + "learning_rate": 4.754195849715557e-05, + "logits/chosen": -1.9132802486419678, + "logits/rejected": -1.976714849472046, + "logps/chosen": -162.88668823242188, + "logps/rejected": -180.09291076660156, + "loss": 0.658, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.10689959675073624, + "rewards/margins": 0.1677176058292389, + "rewards/rejected": -0.2746172249317169, + "step": 412 + }, + { + "epoch": 0.54, + "learning_rate": 4.75264421032502e-05, + "logits/chosen": -1.8963744640350342, + "logits/rejected": -1.8926461935043335, + "logps/chosen": -199.32736206054688, + "logps/rejected": -198.4470672607422, + "loss": 0.6689, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6300160884857178, + "rewards/margins": 0.09289233386516571, + "rewards/rejected": -0.7229084968566895, + "step": 413 + }, + { + "epoch": 0.54, + "learning_rate": 4.751087943810093e-05, + "logits/chosen": -1.8022470474243164, + "logits/rejected": -1.797208547592163, + "logps/chosen": -184.97244262695312, + "logps/rejected": -180.94895935058594, + "loss": 0.5437, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.35914158821105957, + "rewards/margins": 0.4712386727333069, + "rewards/rejected": -0.8303802609443665, + "step": 414 + }, + { + "epoch": 0.54, + "learning_rate": 4.749527053367481e-05, + "logits/chosen": -1.7871997356414795, + "logits/rejected": -1.7747814655303955, + "logps/chosen": -189.15310668945312, + "logps/rejected": -178.39805603027344, + "loss": 0.842, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.7546498775482178, + "rewards/margins": -0.17995727062225342, + "rewards/rejected": -0.5746926069259644, + "step": 415 + }, + { + "epoch": 0.54, + "learning_rate": 4.747961542203386e-05, + "logits/chosen": -1.872157335281372, + "logits/rejected": -1.9486424922943115, + "logps/chosen": -172.46145629882812, + "logps/rejected": -195.35421752929688, + "loss": 0.7544, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.42608222365379333, + "rewards/margins": -0.038050394505262375, + "rewards/rejected": -0.38803184032440186, + "step": 416 + }, + { + "epoch": 0.55, + "learning_rate": 4.746391413533503e-05, + "logits/chosen": -1.9934085607528687, + "logits/rejected": -2.0255632400512695, + "logps/chosen": -166.4789276123047, + "logps/rejected": -183.97271728515625, + "loss": 0.6867, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.414784699678421, + "rewards/margins": 0.18602606654167175, + "rewards/rejected": -0.600810706615448, + "step": 417 + }, + { + "epoch": 0.55, + "learning_rate": 4.74481667058301e-05, + "logits/chosen": -1.8942878246307373, + "logits/rejected": -1.87151038646698, + "logps/chosen": -173.86004638671875, + "logps/rejected": -163.0937957763672, + "loss": 0.8824, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.772480309009552, + "rewards/margins": -0.22370155155658722, + "rewards/rejected": -0.5487788319587708, + "step": 418 + }, + { + "epoch": 0.55, + "learning_rate": 4.743237316586564e-05, + "logits/chosen": -1.9378021955490112, + "logits/rejected": -1.9703481197357178, + "logps/chosen": -180.11892700195312, + "logps/rejected": -195.14578247070312, + "loss": 0.7388, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.3675965666770935, + "rewards/margins": -0.02655930444598198, + "rewards/rejected": -0.3410373032093048, + "step": 419 + }, + { + "epoch": 0.55, + "learning_rate": 4.741653354788295e-05, + "logits/chosen": -2.0154881477355957, + "logits/rejected": -2.0120866298675537, + "logps/chosen": -174.45498657226562, + "logps/rejected": -182.1272430419922, + "loss": 0.8029, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.7631839513778687, + "rewards/margins": -0.11559872329235077, + "rewards/rejected": -0.6475852727890015, + "step": 420 + }, + { + "epoch": 0.55, + "learning_rate": 4.7400647884417956e-05, + "logits/chosen": -1.8835885524749756, + "logits/rejected": -2.027597665786743, + "logps/chosen": -169.2918701171875, + "logps/rejected": -167.33514404296875, + "loss": 0.6719, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.33407047390937805, + "rewards/margins": 0.15689513087272644, + "rewards/rejected": -0.49096566438674927, + "step": 421 + }, + { + "epoch": 0.55, + "learning_rate": 4.7384716208101166e-05, + "logits/chosen": -2.028184413909912, + "logits/rejected": -2.0173041820526123, + "logps/chosen": -168.9990692138672, + "logps/rejected": -160.54428100585938, + "loss": 0.7799, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.5599774122238159, + "rewards/margins": -0.1413966715335846, + "rewards/rejected": -0.4185807704925537, + "step": 422 + }, + { + "epoch": 0.55, + "learning_rate": 4.736873855165762e-05, + "logits/chosen": -1.9846090078353882, + "logits/rejected": -1.9857451915740967, + "logps/chosen": -193.67715454101562, + "logps/rejected": -175.9185333251953, + "loss": 0.6656, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5016548037528992, + "rewards/margins": 0.22657424211502075, + "rewards/rejected": -0.7282290458679199, + "step": 423 + }, + { + "epoch": 0.55, + "learning_rate": 4.735271494790678e-05, + "logits/chosen": -1.92975914478302, + "logits/rejected": -1.9406872987747192, + "logps/chosen": -181.31509399414062, + "logps/rejected": -167.101806640625, + "loss": 0.7386, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.18490439653396606, + "rewards/margins": 0.0683104544878006, + "rewards/rejected": -0.25321486592292786, + "step": 424 + }, + { + "epoch": 0.56, + "learning_rate": 4.733664542976253e-05, + "logits/chosen": -1.9516609907150269, + "logits/rejected": -2.005047559738159, + "logps/chosen": -167.55972290039062, + "logps/rejected": -168.99807739257812, + "loss": 0.9659, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.3683350086212158, + "rewards/margins": -0.24237681925296783, + "rewards/rejected": -0.12595820426940918, + "step": 425 + }, + { + "epoch": 0.56, + "learning_rate": 4.732053003023301e-05, + "logits/chosen": -1.934274435043335, + "logits/rejected": -1.9798094034194946, + "logps/chosen": -153.78518676757812, + "logps/rejected": -166.25979614257812, + "loss": 0.7058, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5433434247970581, + "rewards/margins": 0.13497428596019745, + "rewards/rejected": -0.6783177256584167, + "step": 426 + }, + { + "epoch": 0.56, + "learning_rate": 4.730436878242064e-05, + "logits/chosen": -1.9575129747390747, + "logits/rejected": -1.9926663637161255, + "logps/chosen": -153.06951904296875, + "logps/rejected": -174.07875061035156, + "loss": 0.8359, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4944266080856323, + "rewards/margins": -0.00036665797233581543, + "rewards/rejected": -0.4940599203109741, + "step": 427 + }, + { + "epoch": 0.56, + "learning_rate": 4.7288161719522016e-05, + "logits/chosen": -1.9566092491149902, + "logits/rejected": -1.9228875637054443, + "logps/chosen": -162.54771423339844, + "logps/rejected": -167.0357666015625, + "loss": 0.8701, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5988922715187073, + "rewards/margins": -0.1607256382703781, + "rewards/rejected": -0.43816661834716797, + "step": 428 + }, + { + "epoch": 0.56, + "learning_rate": 4.727190887482783e-05, + "logits/chosen": -2.212228775024414, + "logits/rejected": -2.238290309906006, + "logps/chosen": -185.6492462158203, + "logps/rejected": -201.63949584960938, + "loss": 0.7027, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.45072147250175476, + "rewards/margins": 0.03795819729566574, + "rewards/rejected": -0.4886796772480011, + "step": 429 + }, + { + "epoch": 0.56, + "learning_rate": 4.725561028172282e-05, + "logits/chosen": -2.08243989944458, + "logits/rejected": -2.100586414337158, + "logps/chosen": -169.82723999023438, + "logps/rejected": -169.41476440429688, + "loss": 0.8011, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5032753348350525, + "rewards/margins": -0.07662791758775711, + "rewards/rejected": -0.4266473352909088, + "step": 430 + }, + { + "epoch": 0.56, + "learning_rate": 4.7239265973685696e-05, + "logits/chosen": -1.7974039316177368, + "logits/rejected": -1.802499771118164, + "logps/chosen": -166.3297119140625, + "logps/rejected": -184.35450744628906, + "loss": 0.6115, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3483116924762726, + "rewards/margins": 0.2958502173423767, + "rewards/rejected": -0.6441619396209717, + "step": 431 + }, + { + "epoch": 0.57, + "learning_rate": 4.722287598428907e-05, + "logits/chosen": -1.9482653141021729, + "logits/rejected": -1.9997018575668335, + "logps/chosen": -202.8570098876953, + "logps/rejected": -219.10565185546875, + "loss": 0.6303, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0477115735411644, + "rewards/margins": 0.21878241002559662, + "rewards/rejected": -0.26649394631385803, + "step": 432 + }, + { + "epoch": 0.57, + "learning_rate": 4.720644034719938e-05, + "logits/chosen": -1.8863980770111084, + "logits/rejected": -1.8580697774887085, + "logps/chosen": -178.37112426757812, + "logps/rejected": -196.09762573242188, + "loss": 0.7045, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.359438419342041, + "rewards/margins": 0.0665070116519928, + "rewards/rejected": -0.4259454607963562, + "step": 433 + }, + { + "epoch": 0.57, + "learning_rate": 4.7189959096176825e-05, + "logits/chosen": -1.962789535522461, + "logits/rejected": -2.0059375762939453, + "logps/chosen": -168.6863250732422, + "logps/rejected": -207.6638946533203, + "loss": 0.7517, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.33223748207092285, + "rewards/margins": 0.007875222712755203, + "rewards/rejected": -0.3401126563549042, + "step": 434 + }, + { + "epoch": 0.57, + "learning_rate": 4.7173432265075334e-05, + "logits/chosen": -2.13173770904541, + "logits/rejected": -2.1997811794281006, + "logps/chosen": -171.5570526123047, + "logps/rejected": -173.00726318359375, + "loss": 0.7831, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.5620677471160889, + "rewards/margins": -0.08540257066488266, + "rewards/rejected": -0.47666510939598083, + "step": 435 + }, + { + "epoch": 0.57, + "learning_rate": 4.7156859887842416e-05, + "logits/chosen": -1.9717164039611816, + "logits/rejected": -1.9659010171890259, + "logps/chosen": -163.4027099609375, + "logps/rejected": -171.51205444335938, + "loss": 0.8767, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.36586710810661316, + "rewards/margins": -0.22375579178333282, + "rewards/rejected": -0.14211128652095795, + "step": 436 + }, + { + "epoch": 0.57, + "learning_rate": 4.714024199851915e-05, + "logits/chosen": -1.9461572170257568, + "logits/rejected": -1.9711329936981201, + "logps/chosen": -173.81141662597656, + "logps/rejected": -172.41860961914062, + "loss": 0.7655, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5734329223632812, + "rewards/margins": 0.0679081380367279, + "rewards/rejected": -0.6413410305976868, + "step": 437 + }, + { + "epoch": 0.57, + "learning_rate": 4.712357863124013e-05, + "logits/chosen": -2.0299744606018066, + "logits/rejected": -2.055668830871582, + "logps/chosen": -168.48684692382812, + "logps/rejected": -182.92257690429688, + "loss": 0.69, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.44026386737823486, + "rewards/margins": 0.11328400671482086, + "rewards/rejected": -0.5535478591918945, + "step": 438 + }, + { + "epoch": 0.57, + "learning_rate": 4.710686982023332e-05, + "logits/chosen": -2.0356907844543457, + "logits/rejected": -1.9616978168487549, + "logps/chosen": -147.8501739501953, + "logps/rejected": -154.8659210205078, + "loss": 0.7309, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06124575436115265, + "rewards/margins": 0.03877441585063934, + "rewards/rejected": -0.10002017021179199, + "step": 439 + }, + { + "epoch": 0.58, + "learning_rate": 4.709011559982006e-05, + "logits/chosen": -2.043642997741699, + "logits/rejected": -1.9762914180755615, + "logps/chosen": -193.67340087890625, + "logps/rejected": -186.66543579101562, + "loss": 0.7127, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.34980466961860657, + "rewards/margins": 0.07452677190303802, + "rewards/rejected": -0.4243314862251282, + "step": 440 + }, + { + "epoch": 0.58, + "learning_rate": 4.707331600441495e-05, + "logits/chosen": -2.072479724884033, + "logits/rejected": -2.091381549835205, + "logps/chosen": -191.45059204101562, + "logps/rejected": -176.9473876953125, + "loss": 0.6801, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.029101327061653137, + "rewards/margins": 0.26256757974624634, + "rewards/rejected": -0.2916688919067383, + "step": 441 + }, + { + "epoch": 0.58, + "learning_rate": 4.705647106852581e-05, + "logits/chosen": -1.9268254041671753, + "logits/rejected": -1.8882079124450684, + "logps/chosen": -171.5392303466797, + "logps/rejected": -182.5844268798828, + "loss": 0.7921, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.2353130429983139, + "rewards/margins": -0.0611705407500267, + "rewards/rejected": -0.1741425096988678, + "step": 442 + }, + { + "epoch": 0.58, + "learning_rate": 4.7039580826753564e-05, + "logits/chosen": -2.029810667037964, + "logits/rejected": -2.026019334793091, + "logps/chosen": -169.64918518066406, + "logps/rejected": -186.5985107421875, + "loss": 0.6615, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.34466996788978577, + "rewards/margins": 0.14720463752746582, + "rewards/rejected": -0.491874635219574, + "step": 443 + }, + { + "epoch": 0.58, + "learning_rate": 4.7022645313792235e-05, + "logits/chosen": -1.5730178356170654, + "logits/rejected": -1.5958049297332764, + "logps/chosen": -160.8270263671875, + "logps/rejected": -176.29554748535156, + "loss": 0.7613, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.34809601306915283, + "rewards/margins": 0.23688597977161407, + "rewards/rejected": -0.5849819779396057, + "step": 444 + }, + { + "epoch": 0.58, + "learning_rate": 4.700566456442882e-05, + "logits/chosen": -2.009403705596924, + "logits/rejected": -1.9752486944198608, + "logps/chosen": -178.82701110839844, + "logps/rejected": -180.32125854492188, + "loss": 1.0244, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6049767136573792, + "rewards/margins": -0.4276657700538635, + "rewards/rejected": -0.1773110032081604, + "step": 445 + }, + { + "epoch": 0.58, + "learning_rate": 4.6988638613543216e-05, + "logits/chosen": -1.7354819774627686, + "logits/rejected": -1.731933355331421, + "logps/chosen": -167.52633666992188, + "logps/rejected": -181.66583251953125, + "loss": 0.789, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6287019848823547, + "rewards/margins": -0.054200708866119385, + "rewards/rejected": -0.5745012760162354, + "step": 446 + }, + { + "epoch": 0.58, + "learning_rate": 4.6971567496108206e-05, + "logits/chosen": -1.9981721639633179, + "logits/rejected": -2.026167154312134, + "logps/chosen": -212.54891967773438, + "logps/rejected": -215.69253540039062, + "loss": 0.7395, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4362500011920929, + "rewards/margins": -0.020926453173160553, + "rewards/rejected": -0.4153235852718353, + "step": 447 + }, + { + "epoch": 0.59, + "learning_rate": 4.695445124718931e-05, + "logits/chosen": -2.239379405975342, + "logits/rejected": -2.186093807220459, + "logps/chosen": -189.42547607421875, + "logps/rejected": -181.0236358642578, + "loss": 0.6656, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3167141079902649, + "rewards/margins": 0.10905791819095612, + "rewards/rejected": -0.4257720112800598, + "step": 448 + }, + { + "epoch": 0.59, + "learning_rate": 4.693728990194479e-05, + "logits/chosen": -2.132059097290039, + "logits/rejected": -2.1224942207336426, + "logps/chosen": -211.412841796875, + "logps/rejected": -197.23439025878906, + "loss": 0.6771, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.29473942518234253, + "rewards/margins": 0.20832209289073944, + "rewards/rejected": -0.5030615329742432, + "step": 449 + }, + { + "epoch": 0.59, + "learning_rate": 4.692008349562551e-05, + "logits/chosen": -2.2113986015319824, + "logits/rejected": -2.1679513454437256, + "logps/chosen": -174.953369140625, + "logps/rejected": -188.64576721191406, + "loss": 0.9728, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.5910927057266235, + "rewards/margins": -0.3634761869907379, + "rewards/rejected": -0.22761650383472443, + "step": 450 + }, + { + "epoch": 0.59, + "learning_rate": 4.690283206357491e-05, + "logits/chosen": -1.984092116355896, + "logits/rejected": -2.0135576725006104, + "logps/chosen": -187.25738525390625, + "logps/rejected": -189.52325439453125, + "loss": 0.672, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.4424164593219757, + "rewards/margins": 0.20770896971225739, + "rewards/rejected": -0.6501253843307495, + "step": 451 + }, + { + "epoch": 0.59, + "learning_rate": 4.6885535641228904e-05, + "logits/chosen": -2.0326945781707764, + "logits/rejected": -2.0409088134765625, + "logps/chosen": -189.7724609375, + "logps/rejected": -196.32164001464844, + "loss": 0.846, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.2984338402748108, + "rewards/margins": -0.14787545800209045, + "rewards/rejected": -0.15055838227272034, + "step": 452 + }, + { + "epoch": 0.59, + "learning_rate": 4.6868194264115833e-05, + "logits/chosen": -1.8356232643127441, + "logits/rejected": -1.860498309135437, + "logps/chosen": -190.26791381835938, + "logps/rejected": -192.42124938964844, + "loss": 0.6865, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5506845712661743, + "rewards/margins": 0.12794733047485352, + "rewards/rejected": -0.6786318421363831, + "step": 453 + }, + { + "epoch": 0.59, + "learning_rate": 4.685080796785637e-05, + "logits/chosen": -2.0949289798736572, + "logits/rejected": -2.105635643005371, + "logps/chosen": -188.15576171875, + "logps/rejected": -177.27642822265625, + "loss": 0.7395, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6156455874443054, + "rewards/margins": 0.00026201456785202026, + "rewards/rejected": -0.615907609462738, + "step": 454 + }, + { + "epoch": 0.6, + "learning_rate": 4.683337678816345e-05, + "logits/chosen": -2.062208414077759, + "logits/rejected": -1.9715068340301514, + "logps/chosen": -247.74729919433594, + "logps/rejected": -225.91094970703125, + "loss": 0.8454, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.7724697589874268, + "rewards/margins": -0.1988295614719391, + "rewards/rejected": -0.5736401081085205, + "step": 455 + }, + { + "epoch": 0.6, + "learning_rate": 4.6815900760842236e-05, + "logits/chosen": -1.999656081199646, + "logits/rejected": -2.013665199279785, + "logps/chosen": -189.76425170898438, + "logps/rejected": -200.77801513671875, + "loss": 0.7929, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.1113797426223755, + "rewards/margins": -0.07439464330673218, + "rewards/rejected": -1.0369850397109985, + "step": 456 + }, + { + "epoch": 0.6, + "learning_rate": 4.679837992178996e-05, + "logits/chosen": -1.962624192237854, + "logits/rejected": -1.8894569873809814, + "logps/chosen": -163.32264709472656, + "logps/rejected": -176.6225128173828, + "loss": 0.6683, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5831518769264221, + "rewards/margins": 0.18081454932689667, + "rewards/rejected": -0.76396644115448, + "step": 457 + }, + { + "epoch": 0.6, + "learning_rate": 4.678081430699594e-05, + "logits/chosen": -1.9277567863464355, + "logits/rejected": -1.9991313219070435, + "logps/chosen": -177.146484375, + "logps/rejected": -180.6768798828125, + "loss": 0.549, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5843685865402222, + "rewards/margins": 0.5196143388748169, + "rewards/rejected": -1.103982925415039, + "step": 458 + }, + { + "epoch": 0.6, + "learning_rate": 4.676320395254146e-05, + "logits/chosen": -1.8018032312393188, + "logits/rejected": -1.7631927728652954, + "logps/chosen": -193.01077270507812, + "logps/rejected": -193.25103759765625, + "loss": 0.6489, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5062118172645569, + "rewards/margins": 0.23078583180904388, + "rewards/rejected": -0.736997663974762, + "step": 459 + }, + { + "epoch": 0.6, + "learning_rate": 4.674554889459968e-05, + "logits/chosen": -1.7966090440750122, + "logits/rejected": -1.7850843667984009, + "logps/chosen": -182.4764404296875, + "logps/rejected": -184.0175018310547, + "loss": 0.7334, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4785413146018982, + "rewards/margins": -0.006460566073656082, + "rewards/rejected": -0.4720807671546936, + "step": 460 + }, + { + "epoch": 0.6, + "learning_rate": 4.672784916943562e-05, + "logits/chosen": -1.6125917434692383, + "logits/rejected": -1.6394853591918945, + "logps/chosen": -182.64862060546875, + "logps/rejected": -196.8616943359375, + "loss": 0.5382, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8303923010826111, + "rewards/margins": 0.4725598096847534, + "rewards/rejected": -1.3029520511627197, + "step": 461 + }, + { + "epoch": 0.6, + "learning_rate": 4.6710104813406034e-05, + "logits/chosen": -1.7587897777557373, + "logits/rejected": -1.7165967226028442, + "logps/chosen": -178.50250244140625, + "logps/rejected": -159.33375549316406, + "loss": 0.9045, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.5282204151153564, + "rewards/margins": -0.22660110890865326, + "rewards/rejected": -0.30161935091018677, + "step": 462 + }, + { + "epoch": 0.61, + "learning_rate": 4.669231586295934e-05, + "logits/chosen": -1.8907124996185303, + "logits/rejected": -1.9228183031082153, + "logps/chosen": -169.16119384765625, + "logps/rejected": -180.715087890625, + "loss": 0.7763, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.8981459736824036, + "rewards/margins": -0.11685739457607269, + "rewards/rejected": -0.7812885046005249, + "step": 463 + }, + { + "epoch": 0.61, + "learning_rate": 4.667448235463557e-05, + "logits/chosen": -1.2660267353057861, + "logits/rejected": -1.2475149631500244, + "logps/chosen": -183.68353271484375, + "logps/rejected": -182.84422302246094, + "loss": 0.8929, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.7595192790031433, + "rewards/margins": -0.14992079138755798, + "rewards/rejected": -0.6095985174179077, + "step": 464 + }, + { + "epoch": 0.61, + "learning_rate": 4.665660432506629e-05, + "logits/chosen": -1.8095303773880005, + "logits/rejected": -1.8506840467453003, + "logps/chosen": -213.27145385742188, + "logps/rejected": -220.65640258789062, + "loss": 0.7946, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6156535744667053, + "rewards/margins": -0.057905957102775574, + "rewards/rejected": -0.5577476024627686, + "step": 465 + }, + { + "epoch": 0.61, + "learning_rate": 4.6638681810974496e-05, + "logits/chosen": -1.758918285369873, + "logits/rejected": -1.7417278289794922, + "logps/chosen": -180.2194366455078, + "logps/rejected": -199.62014770507812, + "loss": 0.5957, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.17319843173027039, + "rewards/margins": 0.4147520363330841, + "rewards/rejected": -0.5879504680633545, + "step": 466 + }, + { + "epoch": 0.61, + "learning_rate": 4.6620714849174576e-05, + "logits/chosen": -1.5012279748916626, + "logits/rejected": -1.495218276977539, + "logps/chosen": -227.26577758789062, + "logps/rejected": -223.47470092773438, + "loss": 0.7009, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7020196318626404, + "rewards/margins": 0.10670151561498642, + "rewards/rejected": -0.808721125125885, + "step": 467 + }, + { + "epoch": 0.61, + "learning_rate": 4.660270347657219e-05, + "logits/chosen": -1.4245662689208984, + "logits/rejected": -1.4722357988357544, + "logps/chosen": -219.63504028320312, + "logps/rejected": -246.0736846923828, + "loss": 0.6091, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7042097449302673, + "rewards/margins": 0.5171206593513489, + "rewards/rejected": -1.2213302850723267, + "step": 468 + }, + { + "epoch": 0.61, + "learning_rate": 4.658464773016428e-05, + "logits/chosen": -1.7068259716033936, + "logits/rejected": -1.6351027488708496, + "logps/chosen": -199.54136657714844, + "logps/rejected": -181.50997924804688, + "loss": 0.8131, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2041518688201904, + "rewards/margins": -0.023167330771684647, + "rewards/rejected": -1.180984616279602, + "step": 469 + }, + { + "epoch": 0.62, + "learning_rate": 4.6566547647038864e-05, + "logits/chosen": -1.7098909616470337, + "logits/rejected": -1.80801522731781, + "logps/chosen": -167.95101928710938, + "logps/rejected": -180.0511016845703, + "loss": 0.5463, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2591591477394104, + "rewards/margins": 0.41101884841918945, + "rewards/rejected": -0.6701779961585999, + "step": 470 + }, + { + "epoch": 0.62, + "learning_rate": 4.6548403264375074e-05, + "logits/chosen": -2.014415979385376, + "logits/rejected": -2.0018627643585205, + "logps/chosen": -181.22947692871094, + "logps/rejected": -189.5872344970703, + "loss": 0.8377, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9818540811538696, + "rewards/margins": -0.06248188391327858, + "rewards/rejected": -0.9193722009658813, + "step": 471 + }, + { + "epoch": 0.62, + "learning_rate": 4.6530214619443037e-05, + "logits/chosen": -1.902940034866333, + "logits/rejected": -1.940006971359253, + "logps/chosen": -156.01939392089844, + "logps/rejected": -156.20623779296875, + "loss": 0.8502, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6196620464324951, + "rewards/margins": -0.18280625343322754, + "rewards/rejected": -0.4368557929992676, + "step": 472 + }, + { + "epoch": 0.62, + "learning_rate": 4.6511981749603775e-05, + "logits/chosen": -1.8872562646865845, + "logits/rejected": -1.9487504959106445, + "logps/chosen": -178.33872985839844, + "logps/rejected": -182.2080535888672, + "loss": 0.7332, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6403207778930664, + "rewards/margins": 0.12355762720108032, + "rewards/rejected": -0.7638784050941467, + "step": 473 + }, + { + "epoch": 0.62, + "learning_rate": 4.6493704692309175e-05, + "logits/chosen": -1.8873028755187988, + "logits/rejected": -1.8430054187774658, + "logps/chosen": -248.9535675048828, + "logps/rejected": -238.04327392578125, + "loss": 1.062, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.2121449708938599, + "rewards/margins": -0.3689318299293518, + "rewards/rejected": -0.8432131409645081, + "step": 474 + }, + { + "epoch": 0.62, + "learning_rate": 4.647538348510189e-05, + "logits/chosen": -1.8361527919769287, + "logits/rejected": -1.856339454650879, + "logps/chosen": -171.12091064453125, + "logps/rejected": -179.4962158203125, + "loss": 0.6846, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7720082998275757, + "rewards/margins": 0.10657864063978195, + "rewards/rejected": -0.8785868883132935, + "step": 475 + }, + { + "epoch": 0.62, + "learning_rate": 4.645701816561523e-05, + "logits/chosen": -1.6982722282409668, + "logits/rejected": -1.7370768785476685, + "logps/chosen": -232.54293823242188, + "logps/rejected": -213.66964721679688, + "loss": 0.7178, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8362730741500854, + "rewards/margins": 0.07781472057104111, + "rewards/rejected": -0.9140878319740295, + "step": 476 + }, + { + "epoch": 0.62, + "learning_rate": 4.643860877157314e-05, + "logits/chosen": -1.7802523374557495, + "logits/rejected": -1.7304799556732178, + "logps/chosen": -168.3419189453125, + "logps/rejected": -205.67333984375, + "loss": 0.8153, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.4274996221065521, + "rewards/margins": -0.046173423528671265, + "rewards/rejected": -0.38132619857788086, + "step": 477 + }, + { + "epoch": 0.63, + "learning_rate": 4.642015534079012e-05, + "logits/chosen": -1.9037768840789795, + "logits/rejected": -1.8988232612609863, + "logps/chosen": -173.9936981201172, + "logps/rejected": -197.27523803710938, + "loss": 0.6135, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.32596248388290405, + "rewards/margins": 0.23917633295059204, + "rewards/rejected": -0.5651388168334961, + "step": 478 + }, + { + "epoch": 0.63, + "learning_rate": 4.640165791117106e-05, + "logits/chosen": -1.9618606567382812, + "logits/rejected": -1.9455369710922241, + "logps/chosen": -190.39830017089844, + "logps/rejected": -175.3238067626953, + "loss": 0.9091, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.0257893800735474, + "rewards/margins": -0.2109348475933075, + "rewards/rejected": -0.8148545622825623, + "step": 479 + }, + { + "epoch": 0.63, + "learning_rate": 4.63831165207113e-05, + "logits/chosen": -1.864621877670288, + "logits/rejected": -1.889084815979004, + "logps/chosen": -205.9281005859375, + "logps/rejected": -230.0751953125, + "loss": 0.7761, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6850899457931519, + "rewards/margins": -0.05720193684101105, + "rewards/rejected": -0.6278879046440125, + "step": 480 + }, + { + "epoch": 0.63, + "learning_rate": 4.6364531207496426e-05, + "logits/chosen": -1.737329363822937, + "logits/rejected": -1.745915412902832, + "logps/chosen": -171.1361846923828, + "logps/rejected": -175.88906860351562, + "loss": 0.6837, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.743391215801239, + "rewards/margins": 0.0780097097158432, + "rewards/rejected": -0.8214008808135986, + "step": 481 + }, + { + "epoch": 0.63, + "learning_rate": 4.634590200970227e-05, + "logits/chosen": -1.8280831575393677, + "logits/rejected": -1.8323631286621094, + "logps/chosen": -188.1634521484375, + "logps/rejected": -213.10360717773438, + "loss": 0.7336, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7827669382095337, + "rewards/margins": 0.3351660966873169, + "rewards/rejected": -1.1179330348968506, + "step": 482 + }, + { + "epoch": 0.63, + "learning_rate": 4.632722896559481e-05, + "logits/chosen": -1.9295848608016968, + "logits/rejected": -1.9276199340820312, + "logps/chosen": -167.31385803222656, + "logps/rejected": -186.1995391845703, + "loss": 0.6013, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.16915923357009888, + "rewards/margins": 0.3335033357143402, + "rewards/rejected": -0.5026625394821167, + "step": 483 + }, + { + "epoch": 0.63, + "learning_rate": 4.630851211353007e-05, + "logits/chosen": -1.6712524890899658, + "logits/rejected": -1.788968801498413, + "logps/chosen": -164.82725524902344, + "logps/rejected": -182.7891387939453, + "loss": 0.7642, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.609403669834137, + "rewards/margins": 0.03477644547820091, + "rewards/rejected": -0.644180178642273, + "step": 484 + }, + { + "epoch": 0.63, + "learning_rate": 4.628975149195407e-05, + "logits/chosen": -1.2296477556228638, + "logits/rejected": -1.2632193565368652, + "logps/chosen": -204.8614044189453, + "logps/rejected": -224.30543518066406, + "loss": 0.7234, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7619196176528931, + "rewards/margins": 0.12807466089725494, + "rewards/rejected": -0.8899943232536316, + "step": 485 + }, + { + "epoch": 0.64, + "learning_rate": 4.6270947139402744e-05, + "logits/chosen": -2.047361373901367, + "logits/rejected": -2.1051080226898193, + "logps/chosen": -169.24703979492188, + "logps/rejected": -184.35586547851562, + "loss": 0.66, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6050511002540588, + "rewards/margins": 0.13156384229660034, + "rewards/rejected": -0.7366149425506592, + "step": 486 + }, + { + "epoch": 0.64, + "learning_rate": 4.6252099094501834e-05, + "logits/chosen": -1.9163178205490112, + "logits/rejected": -1.8832037448883057, + "logps/chosen": -202.50064086914062, + "logps/rejected": -206.1787109375, + "loss": 0.8078, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.8862631320953369, + "rewards/margins": -0.06633087992668152, + "rewards/rejected": -0.819932222366333, + "step": 487 + }, + { + "epoch": 0.64, + "learning_rate": 4.623320739596685e-05, + "logits/chosen": -1.943336009979248, + "logits/rejected": -1.9594800472259521, + "logps/chosen": -184.20272827148438, + "logps/rejected": -185.5780029296875, + "loss": 0.948, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.8648966550827026, + "rewards/margins": -0.32853201031684875, + "rewards/rejected": -0.5363646745681763, + "step": 488 + }, + { + "epoch": 0.64, + "learning_rate": 4.621427208260296e-05, + "logits/chosen": -2.0543949604034424, + "logits/rejected": -2.09141206741333, + "logps/chosen": -186.11021423339844, + "logps/rejected": -197.07164001464844, + "loss": 0.6593, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6588226556777954, + "rewards/margins": 0.23558923602104187, + "rewards/rejected": -0.8944119811058044, + "step": 489 + }, + { + "epoch": 0.64, + "learning_rate": 4.6195293193304915e-05, + "logits/chosen": -2.2013731002807617, + "logits/rejected": -2.209264039993286, + "logps/chosen": -192.5195770263672, + "logps/rejected": -188.9171600341797, + "loss": 0.8303, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.8534584641456604, + "rewards/margins": -0.10596348345279694, + "rewards/rejected": -0.7474948763847351, + "step": 490 + }, + { + "epoch": 0.64, + "learning_rate": 4.6176270767056976e-05, + "logits/chosen": -1.8635625839233398, + "logits/rejected": -1.8899545669555664, + "logps/chosen": -193.61715698242188, + "logps/rejected": -196.3071746826172, + "loss": 0.5859, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.620331346988678, + "rewards/margins": 0.35552215576171875, + "rewards/rejected": -0.9758535027503967, + "step": 491 + }, + { + "epoch": 0.64, + "learning_rate": 4.615720484293286e-05, + "logits/chosen": -2.0970966815948486, + "logits/rejected": -2.0922045707702637, + "logps/chosen": -171.4237060546875, + "logps/rejected": -173.91969299316406, + "loss": 0.7777, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7394740581512451, + "rewards/margins": 0.09215141832828522, + "rewards/rejected": -0.8316254615783691, + "step": 492 + }, + { + "epoch": 0.65, + "learning_rate": 4.613809546009558e-05, + "logits/chosen": -1.923639178276062, + "logits/rejected": -1.9087320566177368, + "logps/chosen": -210.64447021484375, + "logps/rejected": -202.98309326171875, + "loss": 0.7005, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7617734670639038, + "rewards/margins": 0.28139957785606384, + "rewards/rejected": -1.04317307472229, + "step": 493 + }, + { + "epoch": 0.65, + "learning_rate": 4.611894265779748e-05, + "logits/chosen": -1.7692898511886597, + "logits/rejected": -1.8441616296768188, + "logps/chosen": -181.05316162109375, + "logps/rejected": -190.46311950683594, + "loss": 0.8492, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.9923998117446899, + "rewards/margins": -0.16852112114429474, + "rewards/rejected": -0.8238787651062012, + "step": 494 + }, + { + "epoch": 0.65, + "learning_rate": 4.609974647538003e-05, + "logits/chosen": -2.242365837097168, + "logits/rejected": -2.2200753688812256, + "logps/chosen": -192.88491821289062, + "logps/rejected": -209.79190063476562, + "loss": 0.7824, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7258377075195312, + "rewards/margins": 0.06173846498131752, + "rewards/rejected": -0.7875760793685913, + "step": 495 + }, + { + "epoch": 0.65, + "learning_rate": 4.608050695227385e-05, + "logits/chosen": -2.0390326976776123, + "logits/rejected": -2.0602505207061768, + "logps/chosen": -159.614013671875, + "logps/rejected": -159.90530395507812, + "loss": 0.6495, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6496396660804749, + "rewards/margins": 0.15987172722816467, + "rewards/rejected": -0.8095113635063171, + "step": 496 + }, + { + "epoch": 0.65, + "learning_rate": 4.606122412799857e-05, + "logits/chosen": -1.8621050119400024, + "logits/rejected": -1.843872308731079, + "logps/chosen": -191.3387451171875, + "logps/rejected": -212.04867553710938, + "loss": 0.838, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.9131224751472473, + "rewards/margins": -0.06589814275503159, + "rewards/rejected": -0.847224235534668, + "step": 497 + }, + { + "epoch": 0.65, + "learning_rate": 4.6041898042162764e-05, + "logits/chosen": -1.9165095090866089, + "logits/rejected": -1.9768743515014648, + "logps/chosen": -179.1850128173828, + "logps/rejected": -197.76953125, + "loss": 0.7127, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8291003704071045, + "rewards/margins": 0.08120033144950867, + "rewards/rejected": -0.9103007316589355, + "step": 498 + }, + { + "epoch": 0.65, + "learning_rate": 4.602252873446386e-05, + "logits/chosen": -1.71052086353302, + "logits/rejected": -1.7267752885818481, + "logps/chosen": -233.17083740234375, + "logps/rejected": -238.11651611328125, + "loss": 0.7183, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6033438444137573, + "rewards/margins": 0.14365322887897491, + "rewards/rejected": -0.7469971179962158, + "step": 499 + }, + { + "epoch": 0.65, + "learning_rate": 4.60031162446881e-05, + "logits/chosen": -1.685623049736023, + "logits/rejected": -1.759178876876831, + "logps/chosen": -179.87600708007812, + "logps/rejected": -183.2005615234375, + "loss": 0.7049, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.7123013734817505, + "rewards/margins": 0.11958488076925278, + "rewards/rejected": -0.8318862915039062, + "step": 500 + }, + { + "epoch": 0.66, + "learning_rate": 4.5983660612710365e-05, + "logits/chosen": -1.9058446884155273, + "logits/rejected": -1.9013440608978271, + "logps/chosen": -177.750244140625, + "logps/rejected": -163.71591186523438, + "loss": 0.7503, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6146600246429443, + "rewards/margins": -0.01889324188232422, + "rewards/rejected": -0.5957667827606201, + "step": 501 + }, + { + "epoch": 0.66, + "learning_rate": 4.596416187849423e-05, + "logits/chosen": -1.6376805305480957, + "logits/rejected": -1.5382472276687622, + "logps/chosen": -177.68373107910156, + "logps/rejected": -205.24169921875, + "loss": 0.579, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.39451998472213745, + "rewards/margins": 0.3564395606517792, + "rewards/rejected": -0.7509595155715942, + "step": 502 + }, + { + "epoch": 0.66, + "learning_rate": 4.5944620082091745e-05, + "logits/chosen": -2.126429319381714, + "logits/rejected": -2.135838747024536, + "logps/chosen": -168.3842010498047, + "logps/rejected": -193.17636108398438, + "loss": 0.8234, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.9303070902824402, + "rewards/margins": -0.04648715257644653, + "rewards/rejected": -0.8838199973106384, + "step": 503 + }, + { + "epoch": 0.66, + "learning_rate": 4.5925035263643444e-05, + "logits/chosen": -2.267376661300659, + "logits/rejected": -2.2018914222717285, + "logps/chosen": -199.2592315673828, + "logps/rejected": -171.32907104492188, + "loss": 1.1319, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.2791969776153564, + "rewards/margins": -0.523222804069519, + "rewards/rejected": -0.7559741735458374, + "step": 504 + }, + { + "epoch": 0.66, + "learning_rate": 4.5905407463378225e-05, + "logits/chosen": -2.0708484649658203, + "logits/rejected": -2.0920658111572266, + "logps/chosen": -160.09619140625, + "logps/rejected": -166.4447021484375, + "loss": 0.7491, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7854949831962585, + "rewards/margins": 0.020966414362192154, + "rewards/rejected": -0.8064614534378052, + "step": 505 + }, + { + "epoch": 0.66, + "learning_rate": 4.588573672161326e-05, + "logits/chosen": -2.0177011489868164, + "logits/rejected": -2.026048183441162, + "logps/chosen": -272.7811584472656, + "logps/rejected": -275.3113708496094, + "loss": 0.755, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7942684888839722, + "rewards/margins": 0.19223184883594513, + "rewards/rejected": -0.9865003228187561, + "step": 506 + }, + { + "epoch": 0.66, + "learning_rate": 4.586602307875396e-05, + "logits/chosen": -2.0870862007141113, + "logits/rejected": -2.1000866889953613, + "logps/chosen": -159.46205139160156, + "logps/rejected": -163.21469116210938, + "loss": 0.5693, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6831841468811035, + "rewards/margins": 0.39189672470092773, + "rewards/rejected": -1.0750807523727417, + "step": 507 + }, + { + "epoch": 0.66, + "learning_rate": 4.5846266575293816e-05, + "logits/chosen": -2.2326223850250244, + "logits/rejected": -2.258408308029175, + "logps/chosen": -194.6920928955078, + "logps/rejected": -210.00762939453125, + "loss": 0.6405, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.3496246635913849, + "rewards/margins": 0.22416295111179352, + "rewards/rejected": -0.5737876296043396, + "step": 508 + }, + { + "epoch": 0.67, + "learning_rate": 4.582646725181441e-05, + "logits/chosen": -1.880753993988037, + "logits/rejected": -1.873094081878662, + "logps/chosen": -185.65306091308594, + "logps/rejected": -175.70822143554688, + "loss": 0.8836, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.8803216218948364, + "rewards/margins": -0.1467835009098053, + "rewards/rejected": -0.7335382103919983, + "step": 509 + }, + { + "epoch": 0.67, + "learning_rate": 4.580662514898522e-05, + "logits/chosen": -2.0115244388580322, + "logits/rejected": -2.0410544872283936, + "logps/chosen": -144.72772216796875, + "logps/rejected": -144.91920471191406, + "loss": 0.8422, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.628396213054657, + "rewards/margins": -0.12304525077342987, + "rewards/rejected": -0.5053509473800659, + "step": 510 + }, + { + "epoch": 0.67, + "learning_rate": 4.5786740307563636e-05, + "logits/chosen": -2.0044987201690674, + "logits/rejected": -2.0016119480133057, + "logps/chosen": -173.86984252929688, + "logps/rejected": -171.8080596923828, + "loss": 0.8741, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.8329235315322876, + "rewards/margins": -0.2628735303878784, + "rewards/rejected": -0.5700500011444092, + "step": 511 + }, + { + "epoch": 0.67, + "learning_rate": 4.576681276839483e-05, + "logits/chosen": -1.8384199142456055, + "logits/rejected": -1.9472904205322266, + "logps/chosen": -151.073486328125, + "logps/rejected": -166.85824584960938, + "loss": 0.7757, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.168123483657837, + "rewards/margins": 0.10600915551185608, + "rewards/rejected": -1.2741327285766602, + "step": 512 + }, + { + "epoch": 0.67, + "learning_rate": 4.574684257241168e-05, + "logits/chosen": -1.633155107498169, + "logits/rejected": -1.6421935558319092, + "logps/chosen": -178.23052978515625, + "logps/rejected": -178.91262817382812, + "loss": 0.6821, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.3441859483718872, + "rewards/margins": 0.10676999390125275, + "rewards/rejected": -0.45095589756965637, + "step": 513 + }, + { + "epoch": 0.67, + "learning_rate": 4.572682976063468e-05, + "logits/chosen": -2.086414337158203, + "logits/rejected": -2.0576364994049072, + "logps/chosen": -210.91128540039062, + "logps/rejected": -207.3369140625, + "loss": 0.7466, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8871356248855591, + "rewards/margins": 0.06643228977918625, + "rewards/rejected": -0.9535678625106812, + "step": 514 + }, + { + "epoch": 0.67, + "learning_rate": 4.5706774374171854e-05, + "logits/chosen": -1.8427444696426392, + "logits/rejected": -1.861626148223877, + "logps/chosen": -179.20623779296875, + "logps/rejected": -185.3961181640625, + "loss": 0.7066, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.45703721046447754, + "rewards/margins": 0.051899224519729614, + "rewards/rejected": -0.5089364647865295, + "step": 515 + }, + { + "epoch": 0.68, + "learning_rate": 4.56866764542187e-05, + "logits/chosen": -1.7605255842208862, + "logits/rejected": -1.7601615190505981, + "logps/chosen": -196.36715698242188, + "logps/rejected": -227.3414306640625, + "loss": 0.6068, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5065858364105225, + "rewards/margins": 0.42499038577079773, + "rewards/rejected": -0.9315762519836426, + "step": 516 + }, + { + "epoch": 0.68, + "learning_rate": 4.566653604205805e-05, + "logits/chosen": -1.8466157913208008, + "logits/rejected": -1.7656481266021729, + "logps/chosen": -192.74110412597656, + "logps/rejected": -192.05816650390625, + "loss": 0.9557, + "rewards/accuracies": 0.1875, + "rewards/chosen": -1.4459354877471924, + "rewards/margins": -0.40716552734375, + "rewards/rejected": -1.0387699604034424, + "step": 517 + }, + { + "epoch": 0.68, + "learning_rate": 4.5646353179060057e-05, + "logits/chosen": -1.8340647220611572, + "logits/rejected": -1.8470059633255005, + "logps/chosen": -210.009765625, + "logps/rejected": -211.32252502441406, + "loss": 0.8106, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6514133810997009, + "rewards/margins": -0.08276738226413727, + "rewards/rejected": -0.5686460137367249, + "step": 518 + }, + { + "epoch": 0.68, + "learning_rate": 4.562612790668204e-05, + "logits/chosen": -1.9675233364105225, + "logits/rejected": -1.9512717723846436, + "logps/chosen": -145.25350952148438, + "logps/rejected": -151.3224334716797, + "loss": 0.7672, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6850120425224304, + "rewards/margins": 0.0520671084523201, + "rewards/rejected": -0.7370792031288147, + "step": 519 + }, + { + "epoch": 0.68, + "learning_rate": 4.560586026646845e-05, + "logits/chosen": -1.7509602308273315, + "logits/rejected": -1.701064944267273, + "logps/chosen": -228.2190704345703, + "logps/rejected": -214.31961059570312, + "loss": 0.9088, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.739928662776947, + "rewards/margins": -0.19188320636749268, + "rewards/rejected": -0.5480455160140991, + "step": 520 + }, + { + "epoch": 0.68, + "learning_rate": 4.558555030005075e-05, + "logits/chosen": -2.1057076454162598, + "logits/rejected": -2.0997025966644287, + "logps/chosen": -219.45846557617188, + "logps/rejected": -222.98912048339844, + "loss": 0.7314, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.62837815284729, + "rewards/margins": 0.05226774513721466, + "rewards/rejected": -0.6806458234786987, + "step": 521 + }, + { + "epoch": 0.68, + "learning_rate": 4.556519804914736e-05, + "logits/chosen": -2.0136232376098633, + "logits/rejected": -1.996194839477539, + "logps/chosen": -184.241455078125, + "logps/rejected": -173.1997528076172, + "loss": 0.6137, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2964561879634857, + "rewards/margins": 0.219641774892807, + "rewards/rejected": -0.5160979628562927, + "step": 522 + }, + { + "epoch": 0.68, + "learning_rate": 4.554480355556354e-05, + "logits/chosen": -1.9112343788146973, + "logits/rejected": -1.8661640882492065, + "logps/chosen": -168.06959533691406, + "logps/rejected": -173.24607849121094, + "loss": 0.7573, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5839947462081909, + "rewards/margins": 0.007685039192438126, + "rewards/rejected": -0.5916797518730164, + "step": 523 + }, + { + "epoch": 0.69, + "learning_rate": 4.552436686119134e-05, + "logits/chosen": -1.8316876888275146, + "logits/rejected": -1.8144513368606567, + "logps/chosen": -181.4498291015625, + "logps/rejected": -186.10784912109375, + "loss": 0.9599, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6419578194618225, + "rewards/margins": -0.39174380898475647, + "rewards/rejected": -0.25021398067474365, + "step": 524 + }, + { + "epoch": 0.69, + "learning_rate": 4.550388800800948e-05, + "logits/chosen": -1.8764700889587402, + "logits/rejected": -1.9319936037063599, + "logps/chosen": -168.9038543701172, + "logps/rejected": -168.67950439453125, + "loss": 0.6579, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.301873117685318, + "rewards/margins": 0.17850691080093384, + "rewards/rejected": -0.48037999868392944, + "step": 525 + }, + { + "epoch": 0.69, + "learning_rate": 4.548336703808328e-05, + "logits/chosen": -1.9540125131607056, + "logits/rejected": -1.9322861433029175, + "logps/chosen": -228.04966735839844, + "logps/rejected": -231.02769470214844, + "loss": 0.8968, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6345371007919312, + "rewards/margins": -0.17063456773757935, + "rewards/rejected": -0.4639025628566742, + "step": 526 + }, + { + "epoch": 0.69, + "learning_rate": 4.546280399356457e-05, + "logits/chosen": -1.6315593719482422, + "logits/rejected": -1.6160613298416138, + "logps/chosen": -225.32296752929688, + "logps/rejected": -212.6197052001953, + "loss": 0.627, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.24752049148082733, + "rewards/margins": 0.3519718647003174, + "rewards/rejected": -0.5994923710823059, + "step": 527 + }, + { + "epoch": 0.69, + "learning_rate": 4.54421989166916e-05, + "logits/chosen": -2.0561516284942627, + "logits/rejected": -2.1020302772521973, + "logps/chosen": -168.7066192626953, + "logps/rejected": -180.7903289794922, + "loss": 0.782, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.4703682065010071, + "rewards/margins": -0.10123680531978607, + "rewards/rejected": -0.3691314160823822, + "step": 528 + }, + { + "epoch": 0.69, + "learning_rate": 4.542155184978898e-05, + "logits/chosen": -1.8236006498336792, + "logits/rejected": -1.810032606124878, + "logps/chosen": -169.7281951904297, + "logps/rejected": -162.98208618164062, + "loss": 0.9228, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.45871978998184204, + "rewards/margins": -0.2870974838733673, + "rewards/rejected": -0.17162233591079712, + "step": 529 + }, + { + "epoch": 0.69, + "learning_rate": 4.540086283526754e-05, + "logits/chosen": -2.0382392406463623, + "logits/rejected": -2.0122694969177246, + "logps/chosen": -196.42291259765625, + "logps/rejected": -196.21530151367188, + "loss": 0.9005, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.638641357421875, + "rewards/margins": -0.2959403991699219, + "rewards/rejected": -0.34270092844963074, + "step": 530 + }, + { + "epoch": 0.69, + "learning_rate": 4.538013191562431e-05, + "logits/chosen": -1.4818511009216309, + "logits/rejected": -1.5436879396438599, + "logps/chosen": -173.07022094726562, + "logps/rejected": -172.4759063720703, + "loss": 0.7594, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5663694143295288, + "rewards/margins": -0.0718955397605896, + "rewards/rejected": -0.4944738447666168, + "step": 531 + }, + { + "epoch": 0.7, + "learning_rate": 4.5359359133442356e-05, + "logits/chosen": -1.788183331489563, + "logits/rejected": -1.7689244747161865, + "logps/chosen": -194.0176239013672, + "logps/rejected": -183.86965942382812, + "loss": 0.5613, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.342574805021286, + "rewards/margins": 0.44486334919929504, + "rewards/rejected": -0.787438154220581, + "step": 532 + }, + { + "epoch": 0.7, + "learning_rate": 4.533854453139077e-05, + "logits/chosen": -1.6594241857528687, + "logits/rejected": -1.6917797327041626, + "logps/chosen": -235.48097229003906, + "logps/rejected": -265.2259826660156, + "loss": 0.8443, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6860448718070984, + "rewards/margins": -0.2058294713497162, + "rewards/rejected": -0.4802154004573822, + "step": 533 + }, + { + "epoch": 0.7, + "learning_rate": 4.5317688152224515e-05, + "logits/chosen": -2.104198932647705, + "logits/rejected": -2.0917530059814453, + "logps/chosen": -193.2283935546875, + "logps/rejected": -199.61131286621094, + "loss": 0.9177, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.36964061856269836, + "rewards/margins": -0.1281491219997406, + "rewards/rejected": -0.24149154126644135, + "step": 534 + }, + { + "epoch": 0.7, + "learning_rate": 4.52967900387844e-05, + "logits/chosen": -2.016317367553711, + "logits/rejected": -2.0518314838409424, + "logps/chosen": -192.35496520996094, + "logps/rejected": -203.24896240234375, + "loss": 0.9074, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.2986190915107727, + "rewards/margins": -0.2612786591053009, + "rewards/rejected": -0.037340469658374786, + "step": 535 + }, + { + "epoch": 0.7, + "learning_rate": 4.5275850233996925e-05, + "logits/chosen": -1.9520158767700195, + "logits/rejected": -1.9376802444458008, + "logps/chosen": -189.20904541015625, + "logps/rejected": -222.3477783203125, + "loss": 0.6845, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.38919875025749207, + "rewards/margins": 0.12857961654663086, + "rewards/rejected": -0.5177783370018005, + "step": 536 + }, + { + "epoch": 0.7, + "learning_rate": 4.525486878087426e-05, + "logits/chosen": -1.7735748291015625, + "logits/rejected": -1.7847058773040771, + "logps/chosen": -177.4947967529297, + "logps/rejected": -180.04049682617188, + "loss": 0.6215, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2464422881603241, + "rewards/margins": 0.21997497975826263, + "rewards/rejected": -0.4664173126220703, + "step": 537 + }, + { + "epoch": 0.7, + "learning_rate": 4.523384572251409e-05, + "logits/chosen": -1.6225758790969849, + "logits/rejected": -1.6475239992141724, + "logps/chosen": -176.7261962890625, + "logps/rejected": -201.78408813476562, + "loss": 0.6428, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.31473392248153687, + "rewards/margins": 0.2050209641456604, + "rewards/rejected": -0.5197548866271973, + "step": 538 + }, + { + "epoch": 0.71, + "learning_rate": 4.52127811020996e-05, + "logits/chosen": -2.0809147357940674, + "logits/rejected": -2.0989580154418945, + "logps/chosen": -228.8104705810547, + "logps/rejected": -215.7156524658203, + "loss": 0.7835, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.21490764617919922, + "rewards/margins": -0.11036863178014755, + "rewards/rejected": -0.10453899949789047, + "step": 539 + }, + { + "epoch": 0.71, + "learning_rate": 4.5191674962899314e-05, + "logits/chosen": -1.7017827033996582, + "logits/rejected": -1.7276175022125244, + "logps/chosen": -155.532470703125, + "logps/rejected": -170.7515411376953, + "loss": 0.788, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.5771217346191406, + "rewards/margins": -0.06404206156730652, + "rewards/rejected": -0.5130796432495117, + "step": 540 + }, + { + "epoch": 0.71, + "learning_rate": 4.5170527348267054e-05, + "logits/chosen": -1.8137165307998657, + "logits/rejected": -1.7630757093429565, + "logps/chosen": -177.9446563720703, + "logps/rejected": -174.14016723632812, + "loss": 0.7775, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.48409003019332886, + "rewards/margins": -0.04493946209549904, + "rewards/rejected": -0.4391506016254425, + "step": 541 + }, + { + "epoch": 0.71, + "learning_rate": 4.5149338301641845e-05, + "logits/chosen": -2.1948161125183105, + "logits/rejected": -2.1407151222229004, + "logps/chosen": -170.88812255859375, + "logps/rejected": -178.73684692382812, + "loss": 0.7424, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1686539500951767, + "rewards/margins": -0.012474283576011658, + "rewards/rejected": -0.15617968142032623, + "step": 542 + }, + { + "epoch": 0.71, + "learning_rate": 4.512810786654779e-05, + "logits/chosen": -2.117692708969116, + "logits/rejected": -2.156689405441284, + "logps/chosen": -214.4130859375, + "logps/rejected": -214.46578979492188, + "loss": 0.6464, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06381916999816895, + "rewards/margins": 0.22082670032978058, + "rewards/rejected": -0.28464585542678833, + "step": 543 + }, + { + "epoch": 0.71, + "learning_rate": 4.510683608659403e-05, + "logits/chosen": -2.0320470333099365, + "logits/rejected": -2.0040557384490967, + "logps/chosen": -163.62440490722656, + "logps/rejected": -147.44810485839844, + "loss": 0.985, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.31194961071014404, + "rewards/margins": -0.37895485758781433, + "rewards/rejected": 0.06700524687767029, + "step": 544 + }, + { + "epoch": 0.71, + "learning_rate": 4.508552300547463e-05, + "logits/chosen": -1.8441392183303833, + "logits/rejected": -1.8550631999969482, + "logps/chosen": -165.3875274658203, + "logps/rejected": -163.02366638183594, + "loss": 0.9708, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.3251355290412903, + "rewards/margins": -0.3274462819099426, + "rewards/rejected": 0.002310771495103836, + "step": 545 + }, + { + "epoch": 0.71, + "learning_rate": 4.506416866696848e-05, + "logits/chosen": -1.8278872966766357, + "logits/rejected": -1.807031273841858, + "logps/chosen": -184.7231903076172, + "logps/rejected": -194.12863159179688, + "loss": 0.7609, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.35950741171836853, + "rewards/margins": 0.040937766432762146, + "rewards/rejected": -0.40044522285461426, + "step": 546 + }, + { + "epoch": 0.72, + "learning_rate": 4.504277311493922e-05, + "logits/chosen": -1.9982786178588867, + "logits/rejected": -1.9985647201538086, + "logps/chosen": -171.0308380126953, + "logps/rejected": -187.55636596679688, + "loss": 0.6264, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04028481990098953, + "rewards/margins": 0.33397042751312256, + "rewards/rejected": -0.3742552697658539, + "step": 547 + }, + { + "epoch": 0.72, + "learning_rate": 4.502133639333516e-05, + "logits/chosen": -1.8734229803085327, + "logits/rejected": -1.8940666913986206, + "logps/chosen": -170.61341857910156, + "logps/rejected": -159.20034790039062, + "loss": 0.7322, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.08178206533193588, + "rewards/margins": 0.0979895144701004, + "rewards/rejected": -0.016207464039325714, + "step": 548 + }, + { + "epoch": 0.72, + "learning_rate": 4.499985854618915e-05, + "logits/chosen": -1.7860755920410156, + "logits/rejected": -1.8111528158187866, + "logps/chosen": -166.02748107910156, + "logps/rejected": -184.15676879882812, + "loss": 0.7579, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.3080641031265259, + "rewards/margins": -0.010175898671150208, + "rewards/rejected": -0.29788821935653687, + "step": 549 + }, + { + "epoch": 0.72, + "learning_rate": 4.497833961761855e-05, + "logits/chosen": -1.3680813312530518, + "logits/rejected": -1.399601697921753, + "logps/chosen": -177.28103637695312, + "logps/rejected": -221.0879364013672, + "loss": 0.745, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.43804460763931274, + "rewards/margins": 0.094999298453331, + "rewards/rejected": -0.5330439209938049, + "step": 550 + }, + { + "epoch": 0.72, + "learning_rate": 4.495677965182506e-05, + "logits/chosen": -1.704332947731018, + "logits/rejected": -1.8049595355987549, + "logps/chosen": -214.8744354248047, + "logps/rejected": -240.92410278320312, + "loss": 0.6746, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.26206129789352417, + "rewards/margins": 0.24123568832874298, + "rewards/rejected": -0.503296971321106, + "step": 551 + }, + { + "epoch": 0.72, + "learning_rate": 4.4935178693094714e-05, + "logits/chosen": -1.9950153827667236, + "logits/rejected": -1.9464647769927979, + "logps/chosen": -191.90756225585938, + "logps/rejected": -199.6075439453125, + "loss": 0.6123, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.23637795448303223, + "rewards/margins": 0.35672998428344727, + "rewards/rejected": -0.12035202980041504, + "step": 552 + }, + { + "epoch": 0.72, + "learning_rate": 4.491353678579774e-05, + "logits/chosen": -2.1501073837280273, + "logits/rejected": -2.076387405395508, + "logps/chosen": -207.20103454589844, + "logps/rejected": -181.49301147460938, + "loss": 0.6066, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04991075396537781, + "rewards/margins": 0.3844224512577057, + "rewards/rejected": -0.3345116972923279, + "step": 553 + }, + { + "epoch": 0.73, + "learning_rate": 4.489185397438845e-05, + "logits/chosen": -2.0605039596557617, + "logits/rejected": -2.0319156646728516, + "logps/chosen": -227.3732147216797, + "logps/rejected": -207.87069702148438, + "loss": 0.92, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.13487425446510315, + "rewards/margins": -0.36219164729118347, + "rewards/rejected": 0.22731736302375793, + "step": 554 + }, + { + "epoch": 0.73, + "learning_rate": 4.4870130303405214e-05, + "logits/chosen": -1.849971890449524, + "logits/rejected": -1.7690582275390625, + "logps/chosen": -181.0364532470703, + "logps/rejected": -200.14926147460938, + "loss": 0.9661, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.27974945306777954, + "rewards/margins": -0.3051350712776184, + "rewards/rejected": 0.025385625660419464, + "step": 555 + }, + { + "epoch": 0.73, + "learning_rate": 4.484836581747032e-05, + "logits/chosen": -1.952078938484192, + "logits/rejected": -1.961308240890503, + "logps/chosen": -178.1840057373047, + "logps/rejected": -183.68377685546875, + "loss": 0.5584, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3255058526992798, + "rewards/margins": 0.4501607418060303, + "rewards/rejected": -0.7756666541099548, + "step": 556 + }, + { + "epoch": 0.73, + "learning_rate": 4.4826560561289865e-05, + "logits/chosen": -1.9457815885543823, + "logits/rejected": -2.0380780696868896, + "logps/chosen": -178.47607421875, + "logps/rejected": -187.38902282714844, + "loss": 0.87, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.361419141292572, + "rewards/margins": -0.22761335968971252, + "rewards/rejected": -0.1338057518005371, + "step": 557 + }, + { + "epoch": 0.73, + "learning_rate": 4.4804714579653736e-05, + "logits/chosen": -1.8216781616210938, + "logits/rejected": -1.8323644399642944, + "logps/chosen": -235.6400146484375, + "logps/rejected": -214.7845001220703, + "loss": 0.8859, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.32507357001304626, + "rewards/margins": -0.19276276230812073, + "rewards/rejected": -0.13231079280376434, + "step": 558 + }, + { + "epoch": 0.73, + "learning_rate": 4.4782827917435454e-05, + "logits/chosen": -2.179039716720581, + "logits/rejected": -2.2042417526245117, + "logps/chosen": -138.87144470214844, + "logps/rejected": -152.27728271484375, + "loss": 0.859, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.2654874324798584, + "rewards/margins": -0.15553446114063263, + "rewards/rejected": -0.10995297133922577, + "step": 559 + }, + { + "epoch": 0.73, + "learning_rate": 4.4760900619592085e-05, + "logits/chosen": -1.9957417249679565, + "logits/rejected": -2.006272315979004, + "logps/chosen": -156.08970642089844, + "logps/rejected": -156.20684814453125, + "loss": 0.5498, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06699158251285553, + "rewards/margins": 0.520176887512207, + "rewards/rejected": -0.4531853199005127, + "step": 560 + }, + { + "epoch": 0.73, + "learning_rate": 4.4738932731164194e-05, + "logits/chosen": -2.0068199634552, + "logits/rejected": -2.0332655906677246, + "logps/chosen": -192.71728515625, + "logps/rejected": -208.94129943847656, + "loss": 0.8786, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.30353277921676636, + "rewards/margins": -0.13430212438106537, + "rewards/rejected": -0.16923066973686218, + "step": 561 + }, + { + "epoch": 0.74, + "learning_rate": 4.47169242972757e-05, + "logits/chosen": -2.0624818801879883, + "logits/rejected": -2.0608577728271484, + "logps/chosen": -186.4285430908203, + "logps/rejected": -195.51882934570312, + "loss": 0.7084, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.07775077223777771, + "rewards/margins": 0.15880751609802246, + "rewards/rejected": -0.08105673640966415, + "step": 562 + }, + { + "epoch": 0.74, + "learning_rate": 4.469487536313381e-05, + "logits/chosen": -1.780775547027588, + "logits/rejected": -1.7067487239837646, + "logps/chosen": -182.54295349121094, + "logps/rejected": -186.3512420654297, + "loss": 0.8105, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5589167475700378, + "rewards/margins": -0.0893404483795166, + "rewards/rejected": -0.46957623958587646, + "step": 563 + }, + { + "epoch": 0.74, + "learning_rate": 4.467278597402894e-05, + "logits/chosen": -1.8799240589141846, + "logits/rejected": -1.8977303504943848, + "logps/chosen": -151.5462646484375, + "logps/rejected": -153.37344360351562, + "loss": 0.6187, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.06963834166526794, + "rewards/margins": 0.30848821997642517, + "rewards/rejected": -0.23884987831115723, + "step": 564 + }, + { + "epoch": 0.74, + "learning_rate": 4.465065617533457e-05, + "logits/chosen": -1.7631909847259521, + "logits/rejected": -1.7585983276367188, + "logps/chosen": -192.2979278564453, + "logps/rejected": -191.92445373535156, + "loss": 0.664, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0888519287109375, + "rewards/margins": 0.18764151632785797, + "rewards/rejected": -0.27649345993995667, + "step": 565 + }, + { + "epoch": 0.74, + "learning_rate": 4.462848601250722e-05, + "logits/chosen": -2.05842924118042, + "logits/rejected": -2.0034210681915283, + "logps/chosen": -167.9142608642578, + "logps/rejected": -176.2246551513672, + "loss": 0.754, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.05985128879547119, + "rewards/margins": -0.02553650364279747, + "rewards/rejected": -0.034314800053834915, + "step": 566 + }, + { + "epoch": 0.74, + "learning_rate": 4.4606275531086295e-05, + "logits/chosen": -1.7910778522491455, + "logits/rejected": -1.746850848197937, + "logps/chosen": -146.9508056640625, + "logps/rejected": -154.16268920898438, + "loss": 0.7626, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.19962230324745178, + "rewards/margins": -0.01553274691104889, + "rewards/rejected": -0.1840895563364029, + "step": 567 + }, + { + "epoch": 0.74, + "learning_rate": 4.4584024776694035e-05, + "logits/chosen": -1.7370885610580444, + "logits/rejected": -1.7392570972442627, + "logps/chosen": -195.53094482421875, + "logps/rejected": -185.76638793945312, + "loss": 0.9247, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.5089238286018372, + "rewards/margins": -0.37316685914993286, + "rewards/rejected": -0.13575701415538788, + "step": 568 + }, + { + "epoch": 0.74, + "learning_rate": 4.45617337950354e-05, + "logits/chosen": -2.0297796726226807, + "logits/rejected": -1.9959697723388672, + "logps/chosen": -187.27316284179688, + "logps/rejected": -171.93833923339844, + "loss": 0.853, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.13575240969657898, + "rewards/margins": -0.1941533386707306, + "rewards/rejected": 0.058400966227054596, + "step": 569 + }, + { + "epoch": 0.75, + "learning_rate": 4.453940263189797e-05, + "logits/chosen": -1.8683103322982788, + "logits/rejected": -1.8316171169281006, + "logps/chosen": -245.50885009765625, + "logps/rejected": -218.13079833984375, + "loss": 0.9437, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6386896967887878, + "rewards/margins": -0.3385563790798187, + "rewards/rejected": -0.3001333475112915, + "step": 570 + }, + { + "epoch": 0.75, + "learning_rate": 4.4517031333151874e-05, + "logits/chosen": -1.982710361480713, + "logits/rejected": -2.028301239013672, + "logps/chosen": -147.2786865234375, + "logps/rejected": -162.232177734375, + "loss": 0.762, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.11944058537483215, + "rewards/margins": 0.18246760964393616, + "rewards/rejected": -0.06302699446678162, + "step": 571 + }, + { + "epoch": 0.75, + "learning_rate": 4.449461994474968e-05, + "logits/chosen": -1.6838575601577759, + "logits/rejected": -1.7166067361831665, + "logps/chosen": -197.30078125, + "logps/rejected": -184.3292999267578, + "loss": 0.8271, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.18462339043617249, + "rewards/margins": -0.059001460671424866, + "rewards/rejected": -0.12562192976474762, + "step": 572 + }, + { + "epoch": 0.75, + "learning_rate": 4.44721685127263e-05, + "logits/chosen": -2.028676986694336, + "logits/rejected": -2.0241925716400146, + "logps/chosen": -171.38328552246094, + "logps/rejected": -168.54164123535156, + "loss": 0.9617, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.37711527943611145, + "rewards/margins": -0.406730592250824, + "rewards/rejected": 0.029615353792905807, + "step": 573 + }, + { + "epoch": 0.75, + "learning_rate": 4.4449677083198896e-05, + "logits/chosen": -1.7875943183898926, + "logits/rejected": -1.790823221206665, + "logps/chosen": -166.38113403320312, + "logps/rejected": -168.552490234375, + "loss": 0.8054, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.35224342346191406, + "rewards/margins": -0.07848221063613892, + "rewards/rejected": -0.27376121282577515, + "step": 574 + }, + { + "epoch": 0.75, + "learning_rate": 4.4427145702366804e-05, + "logits/chosen": -1.856335163116455, + "logits/rejected": -1.8741549253463745, + "logps/chosen": -148.2678680419922, + "logps/rejected": -154.46356201171875, + "loss": 0.8199, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.3622299134731293, + "rewards/margins": -0.16889148950576782, + "rewards/rejected": -0.19333842396736145, + "step": 575 + }, + { + "epoch": 0.75, + "learning_rate": 4.440457441651139e-05, + "logits/chosen": -2.041019916534424, + "logits/rejected": -2.060852289199829, + "logps/chosen": -163.51490783691406, + "logps/rejected": -167.4606170654297, + "loss": 0.7434, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2925058901309967, + "rewards/margins": 0.018959401175379753, + "rewards/rejected": -0.3114652931690216, + "step": 576 + }, + { + "epoch": 0.76, + "learning_rate": 4.4381963271996044e-05, + "logits/chosen": -1.9944902658462524, + "logits/rejected": -1.9588004350662231, + "logps/chosen": -197.7550506591797, + "logps/rejected": -220.91131591796875, + "loss": 0.7051, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7190204858779907, + "rewards/margins": 0.08902256935834885, + "rewards/rejected": -0.8080430626869202, + "step": 577 + }, + { + "epoch": 0.76, + "learning_rate": 4.435931231526597e-05, + "logits/chosen": -1.665032148361206, + "logits/rejected": -1.7525084018707275, + "logps/chosen": -179.66204833984375, + "logps/rejected": -176.45870971679688, + "loss": 0.6586, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.15862412750720978, + "rewards/margins": 0.2471276819705963, + "rewards/rejected": -0.4057517945766449, + "step": 578 + }, + { + "epoch": 0.76, + "learning_rate": 4.433662159284818e-05, + "logits/chosen": -2.038362741470337, + "logits/rejected": -2.01652193069458, + "logps/chosen": -164.3741455078125, + "logps/rejected": -179.5104522705078, + "loss": 0.7518, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.31034964323043823, + "rewards/margins": 0.014281976036727428, + "rewards/rejected": -0.32463157176971436, + "step": 579 + }, + { + "epoch": 0.76, + "learning_rate": 4.4313891151351375e-05, + "logits/chosen": -1.9882714748382568, + "logits/rejected": -1.9608687162399292, + "logps/chosen": -172.2445068359375, + "logps/rejected": -166.55502319335938, + "loss": 0.8068, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.2949073314666748, + "rewards/margins": 0.04608011618256569, + "rewards/rejected": -0.3409874737262726, + "step": 580 + }, + { + "epoch": 0.76, + "learning_rate": 4.429112103746582e-05, + "logits/chosen": -1.999483585357666, + "logits/rejected": -1.8845561742782593, + "logps/chosen": -175.16586303710938, + "logps/rejected": -193.78138732910156, + "loss": 0.7416, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.3049665093421936, + "rewards/margins": 0.03948044776916504, + "rewards/rejected": -0.34444695711135864, + "step": 581 + }, + { + "epoch": 0.76, + "learning_rate": 4.4268311297963295e-05, + "logits/chosen": -2.089709758758545, + "logits/rejected": -2.10359263420105, + "logps/chosen": -188.5309600830078, + "logps/rejected": -185.20364379882812, + "loss": 0.6649, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.11079156398773193, + "rewards/margins": 0.3179706037044525, + "rewards/rejected": -0.42876213788986206, + "step": 582 + }, + { + "epoch": 0.76, + "learning_rate": 4.4245461979696937e-05, + "logits/chosen": -1.8897924423217773, + "logits/rejected": -1.8803207874298096, + "logps/chosen": -244.1274871826172, + "logps/rejected": -252.70843505859375, + "loss": 0.8402, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.36066266894340515, + "rewards/margins": -0.10755321383476257, + "rewards/rejected": -0.2531094551086426, + "step": 583 + }, + { + "epoch": 0.76, + "learning_rate": 4.422257312960123e-05, + "logits/chosen": -1.826185703277588, + "logits/rejected": -1.883529782295227, + "logps/chosen": -177.42892456054688, + "logps/rejected": -199.69570922851562, + "loss": 0.7324, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.18842053413391113, + "rewards/margins": 0.08203045278787613, + "rewards/rejected": -0.27045097947120667, + "step": 584 + }, + { + "epoch": 0.77, + "learning_rate": 4.419964479469182e-05, + "logits/chosen": -1.8737729787826538, + "logits/rejected": -1.8830443620681763, + "logps/chosen": -179.8202362060547, + "logps/rejected": -186.00640869140625, + "loss": 0.717, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.18414545059204102, + "rewards/margins": 0.08150999248027802, + "rewards/rejected": -0.2656554579734802, + "step": 585 + }, + { + "epoch": 0.77, + "learning_rate": 4.417667702206548e-05, + "logits/chosen": -1.9959778785705566, + "logits/rejected": -2.040126085281372, + "logps/chosen": -163.7072296142578, + "logps/rejected": -166.8711700439453, + "loss": 0.6836, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08035236597061157, + "rewards/margins": 0.12447042763233185, + "rewards/rejected": -0.20482276380062103, + "step": 586 + }, + { + "epoch": 0.77, + "learning_rate": 4.415366985889998e-05, + "logits/chosen": -1.8016334772109985, + "logits/rejected": -1.7292026281356812, + "logps/chosen": -221.72146606445312, + "logps/rejected": -246.97320556640625, + "loss": 0.5375, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.22102700173854828, + "rewards/margins": 0.4568701386451721, + "rewards/rejected": -0.677897036075592, + "step": 587 + }, + { + "epoch": 0.77, + "learning_rate": 4.413062335245402e-05, + "logits/chosen": -2.1090190410614014, + "logits/rejected": -2.0904922485351562, + "logps/chosen": -165.54083251953125, + "logps/rejected": -178.31280517578125, + "loss": 0.9611, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.23936405777931213, + "rewards/margins": -0.30644306540489197, + "rewards/rejected": 0.06707899272441864, + "step": 588 + }, + { + "epoch": 0.77, + "learning_rate": 4.410753755006708e-05, + "logits/chosen": -2.0900731086730957, + "logits/rejected": -2.109846591949463, + "logps/chosen": -163.52517700195312, + "logps/rejected": -176.35760498046875, + "loss": 0.7631, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0015172064304351807, + "rewards/margins": -0.01845034398138523, + "rewards/rejected": 0.016933124512434006, + "step": 589 + }, + { + "epoch": 0.77, + "learning_rate": 4.408441249915938e-05, + "logits/chosen": -1.9141035079956055, + "logits/rejected": -1.933986783027649, + "logps/chosen": -172.36045837402344, + "logps/rejected": -175.99472045898438, + "loss": 0.9, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.4615350365638733, + "rewards/margins": -0.29253697395324707, + "rewards/rejected": -0.16899806261062622, + "step": 590 + }, + { + "epoch": 0.77, + "learning_rate": 4.4061248247231776e-05, + "logits/chosen": -1.7625254392623901, + "logits/rejected": -1.7860521078109741, + "logps/chosen": -214.19223022460938, + "logps/rejected": -205.5415802001953, + "loss": 0.8853, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.8153303861618042, + "rewards/margins": -0.16456466913223267, + "rewards/rejected": -0.6507657170295715, + "step": 591 + }, + { + "epoch": 0.77, + "learning_rate": 4.4038044841865614e-05, + "logits/chosen": -1.9525036811828613, + "logits/rejected": -1.9083642959594727, + "logps/chosen": -158.90896606445312, + "logps/rejected": -151.97276306152344, + "loss": 0.7655, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2276325672864914, + "rewards/margins": -0.026436805725097656, + "rewards/rejected": -0.20119577646255493, + "step": 592 + }, + { + "epoch": 0.78, + "learning_rate": 4.401480233072268e-05, + "logits/chosen": -1.9390695095062256, + "logits/rejected": -1.9349026679992676, + "logps/chosen": -169.4829864501953, + "logps/rejected": -175.8646240234375, + "loss": 0.8046, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.3574194014072418, + "rewards/margins": -0.10936145484447479, + "rewards/rejected": -0.24805793166160583, + "step": 593 + }, + { + "epoch": 0.78, + "learning_rate": 4.399152076154509e-05, + "logits/chosen": -1.7492634057998657, + "logits/rejected": -1.7610502243041992, + "logps/chosen": -187.03453063964844, + "logps/rejected": -181.9283905029297, + "loss": 0.7496, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.39506590366363525, + "rewards/margins": 0.02907838299870491, + "rewards/rejected": -0.42414435744285583, + "step": 594 + }, + { + "epoch": 0.78, + "learning_rate": 4.396820018215518e-05, + "logits/chosen": -1.5396924018859863, + "logits/rejected": -1.6151387691497803, + "logps/chosen": -155.45480346679688, + "logps/rejected": -160.28761291503906, + "loss": 0.7919, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.2293637990951538, + "rewards/margins": -0.06823254376649857, + "rewards/rejected": -0.16113126277923584, + "step": 595 + }, + { + "epoch": 0.78, + "learning_rate": 4.394484064045542e-05, + "logits/chosen": -1.7704952955245972, + "logits/rejected": -1.838837742805481, + "logps/chosen": -169.5724334716797, + "logps/rejected": -215.3038330078125, + "loss": 0.8807, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5652868151664734, + "rewards/margins": -0.16716551780700684, + "rewards/rejected": -0.39812129735946655, + "step": 596 + }, + { + "epoch": 0.78, + "learning_rate": 4.392144218442831e-05, + "logits/chosen": -1.8503105640411377, + "logits/rejected": -1.917284369468689, + "logps/chosen": -191.72821044921875, + "logps/rejected": -207.21307373046875, + "loss": 0.6902, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.15306434035301208, + "rewards/margins": 0.08791357278823853, + "rewards/rejected": -0.2409779131412506, + "step": 597 + }, + { + "epoch": 0.78, + "learning_rate": 4.3898004862136286e-05, + "logits/chosen": -1.8588240146636963, + "logits/rejected": -1.8746473789215088, + "logps/chosen": -155.37948608398438, + "logps/rejected": -164.0855712890625, + "loss": 0.6794, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.22585612535476685, + "rewards/margins": 0.20000189542770386, + "rewards/rejected": -0.4258580207824707, + "step": 598 + }, + { + "epoch": 0.78, + "learning_rate": 4.3874528721721624e-05, + "logits/chosen": -2.0860049724578857, + "logits/rejected": -2.040462017059326, + "logps/chosen": -178.65829467773438, + "logps/rejected": -164.56826782226562, + "loss": 0.8497, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2987426221370697, + "rewards/margins": -0.12979570031166077, + "rewards/rejected": -0.16894695162773132, + "step": 599 + }, + { + "epoch": 0.79, + "learning_rate": 4.385101381140633e-05, + "logits/chosen": -2.0084433555603027, + "logits/rejected": -1.9932807683944702, + "logps/chosen": -177.64755249023438, + "logps/rejected": -183.24024963378906, + "loss": 0.6995, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.26401758193969727, + "rewards/margins": 0.12354859709739685, + "rewards/rejected": -0.38756614923477173, + "step": 600 + }, + { + "epoch": 0.79, + "learning_rate": 4.382746017949203e-05, + "logits/chosen": -1.8708142042160034, + "logits/rejected": -1.7702758312225342, + "logps/chosen": -176.54803466796875, + "logps/rejected": -182.53933715820312, + "loss": 0.9847, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.42444175481796265, + "rewards/margins": -0.36502790451049805, + "rewards/rejected": -0.059413861483335495, + "step": 601 + }, + { + "epoch": 0.79, + "learning_rate": 4.380386787435992e-05, + "logits/chosen": -1.9202303886413574, + "logits/rejected": -1.940227746963501, + "logps/chosen": -156.80599975585938, + "logps/rejected": -165.71456909179688, + "loss": 0.8598, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.22691097855567932, + "rewards/margins": -0.20334003865718842, + "rewards/rejected": -0.02357092872262001, + "step": 602 + }, + { + "epoch": 0.79, + "learning_rate": 4.378023694447061e-05, + "logits/chosen": -1.751800537109375, + "logits/rejected": -1.6968414783477783, + "logps/chosen": -203.2870635986328, + "logps/rejected": -175.4205322265625, + "loss": 0.8759, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5231369733810425, + "rewards/margins": -0.2074095904827118, + "rewards/rejected": -0.3157273828983307, + "step": 603 + }, + { + "epoch": 0.79, + "learning_rate": 4.375656743836407e-05, + "logits/chosen": -1.9159809350967407, + "logits/rejected": -1.8906610012054443, + "logps/chosen": -158.56918334960938, + "logps/rejected": -162.91845703125, + "loss": 0.7958, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.34893998503685, + "rewards/margins": -0.01667727530002594, + "rewards/rejected": -0.33226269483566284, + "step": 604 + }, + { + "epoch": 0.79, + "learning_rate": 4.373285940465948e-05, + "logits/chosen": -2.0548174381256104, + "logits/rejected": -2.0111958980560303, + "logps/chosen": -172.6857147216797, + "logps/rejected": -161.885498046875, + "loss": 0.6875, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.16591539978981018, + "rewards/margins": 0.14734701812267303, + "rewards/rejected": -0.313262403011322, + "step": 605 + }, + { + "epoch": 0.79, + "learning_rate": 4.370911289205518e-05, + "logits/chosen": -1.9247201681137085, + "logits/rejected": -1.8969950675964355, + "logps/chosen": -193.29541015625, + "logps/rejected": -193.72703552246094, + "loss": 0.8154, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.11465515196323395, + "rewards/margins": -0.17800137400627136, + "rewards/rejected": 0.06334619224071503, + "step": 606 + }, + { + "epoch": 0.79, + "learning_rate": 4.368532794932854e-05, + "logits/chosen": -1.6993311643600464, + "logits/rejected": -1.7183904647827148, + "logps/chosen": -196.13595581054688, + "logps/rejected": -198.9134521484375, + "loss": 0.9953, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.3177400529384613, + "rewards/margins": -0.28991618752479553, + "rewards/rejected": -0.02782391384243965, + "step": 607 + }, + { + "epoch": 0.8, + "learning_rate": 4.366150462533588e-05, + "logits/chosen": -1.9511172771453857, + "logits/rejected": -2.002882957458496, + "logps/chosen": -178.0076141357422, + "logps/rejected": -182.05166625976562, + "loss": 0.7215, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.021460914984345436, + "rewards/margins": 0.14429938793182373, + "rewards/rejected": -0.16576027870178223, + "step": 608 + }, + { + "epoch": 0.8, + "learning_rate": 4.363764296901234e-05, + "logits/chosen": -1.8208976984024048, + "logits/rejected": -1.8266651630401611, + "logps/chosen": -164.09571838378906, + "logps/rejected": -178.66693115234375, + "loss": 0.7568, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.1179531067609787, + "rewards/margins": 0.09427131712436676, + "rewards/rejected": -0.21222442388534546, + "step": 609 + }, + { + "epoch": 0.8, + "learning_rate": 4.361374302937182e-05, + "logits/chosen": -1.5063440799713135, + "logits/rejected": -1.5503039360046387, + "logps/chosen": -195.34811401367188, + "logps/rejected": -200.38067626953125, + "loss": 0.7159, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.013212010264396667, + "rewards/margins": 0.05199579522013664, + "rewards/rejected": -0.06520780920982361, + "step": 610 + }, + { + "epoch": 0.8, + "learning_rate": 4.358980485550683e-05, + "logits/chosen": -1.8006740808486938, + "logits/rejected": -1.7630650997161865, + "logps/chosen": -198.05638122558594, + "logps/rejected": -167.6128387451172, + "loss": 0.8621, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.465340793132782, + "rewards/margins": -0.2472882717847824, + "rewards/rejected": -0.21805252134799957, + "step": 611 + }, + { + "epoch": 0.8, + "learning_rate": 4.356582849658845e-05, + "logits/chosen": -1.8933278322219849, + "logits/rejected": -1.9565207958221436, + "logps/chosen": -165.55247497558594, + "logps/rejected": -172.00611877441406, + "loss": 0.6704, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.09733805060386658, + "rewards/margins": 0.23297539353370667, + "rewards/rejected": -0.13563737273216248, + "step": 612 + }, + { + "epoch": 0.8, + "learning_rate": 4.354181400186617e-05, + "logits/chosen": -1.3361432552337646, + "logits/rejected": -1.4160830974578857, + "logps/chosen": -204.05889892578125, + "logps/rejected": -203.48159790039062, + "loss": 0.8597, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.468783974647522, + "rewards/margins": -0.07349361479282379, + "rewards/rejected": -0.3952903151512146, + "step": 613 + }, + { + "epoch": 0.8, + "learning_rate": 4.351776142066782e-05, + "logits/chosen": -1.8746682405471802, + "logits/rejected": -1.9063349962234497, + "logps/chosen": -161.05291748046875, + "logps/rejected": -155.7138671875, + "loss": 0.7912, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.021917428821325302, + "rewards/margins": -0.07229090481996536, + "rewards/rejected": 0.09420835971832275, + "step": 614 + }, + { + "epoch": 0.8, + "learning_rate": 4.349367080239946e-05, + "logits/chosen": -1.9660152196884155, + "logits/rejected": -1.9447054862976074, + "logps/chosen": -171.2956085205078, + "logps/rejected": -167.69021606445312, + "loss": 0.747, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.011758615262806416, + "rewards/margins": -0.006392620503902435, + "rewards/rejected": -0.005365990102291107, + "step": 615 + }, + { + "epoch": 0.81, + "learning_rate": 4.34695421965453e-05, + "logits/chosen": -1.7947853803634644, + "logits/rejected": -1.8116130828857422, + "logps/chosen": -191.9875946044922, + "logps/rejected": -188.09500122070312, + "loss": 0.7336, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2941385507583618, + "rewards/margins": 0.003938054665923119, + "rewards/rejected": 0.29020047187805176, + "step": 616 + }, + { + "epoch": 0.81, + "learning_rate": 4.344537565266755e-05, + "logits/chosen": -1.918999433517456, + "logits/rejected": -1.9537014961242676, + "logps/chosen": -172.28302001953125, + "logps/rejected": -184.16195678710938, + "loss": 0.777, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.13267874717712402, + "rewards/margins": -0.03271746635437012, + "rewards/rejected": 0.16539622843265533, + "step": 617 + }, + { + "epoch": 0.81, + "learning_rate": 4.342117122040637e-05, + "logits/chosen": -1.900996208190918, + "logits/rejected": -1.8993828296661377, + "logps/chosen": -209.77993774414062, + "logps/rejected": -206.2876739501953, + "loss": 0.9541, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.1300465315580368, + "rewards/margins": -0.33919087052345276, + "rewards/rejected": 0.46923738718032837, + "step": 618 + }, + { + "epoch": 0.81, + "learning_rate": 4.339692894947974e-05, + "logits/chosen": -1.8545596599578857, + "logits/rejected": -1.860669493675232, + "logps/chosen": -186.5775909423828, + "logps/rejected": -209.63888549804688, + "loss": 0.7799, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.011012416332960129, + "rewards/margins": 0.014494312927126884, + "rewards/rejected": -0.003481905907392502, + "step": 619 + }, + { + "epoch": 0.81, + "learning_rate": 4.3372648889683364e-05, + "logits/chosen": -1.8433650732040405, + "logits/rejected": -1.902207612991333, + "logps/chosen": -193.0688934326172, + "logps/rejected": -177.6405792236328, + "loss": 0.7566, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0212489552795887, + "rewards/margins": 0.15483959019184113, + "rewards/rejected": -0.13359062373638153, + "step": 620 + }, + { + "epoch": 0.81, + "learning_rate": 4.334833109089057e-05, + "logits/chosen": -1.6036548614501953, + "logits/rejected": -1.5984629392623901, + "logps/chosen": -166.178955078125, + "logps/rejected": -174.63314819335938, + "loss": 0.7545, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.20678240060806274, + "rewards/margins": 0.031082022935152054, + "rewards/rejected": -0.2378644049167633, + "step": 621 + }, + { + "epoch": 0.81, + "learning_rate": 4.33239756030522e-05, + "logits/chosen": -1.8155311346054077, + "logits/rejected": -1.759658694267273, + "logps/chosen": -196.43370056152344, + "logps/rejected": -193.10064697265625, + "loss": 0.8896, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.2915378212928772, + "rewards/margins": -0.29409241676330566, + "rewards/rejected": 0.0025546252727508545, + "step": 622 + }, + { + "epoch": 0.82, + "learning_rate": 4.329958247619651e-05, + "logits/chosen": -1.74562668800354, + "logits/rejected": -1.807843565940857, + "logps/chosen": -167.89886474609375, + "logps/rejected": -186.01974487304688, + "loss": 0.5757, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.22084513306617737, + "rewards/margins": 0.36972615122795105, + "rewards/rejected": -0.5905711650848389, + "step": 623 + }, + { + "epoch": 0.82, + "learning_rate": 4.3275151760429075e-05, + "logits/chosen": -1.8474458456039429, + "logits/rejected": -1.8740853071212769, + "logps/chosen": -143.50112915039062, + "logps/rejected": -154.0746307373047, + "loss": 0.6583, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.10751471668481827, + "rewards/margins": 0.15516290068626404, + "rewards/rejected": -0.047648198902606964, + "step": 624 + }, + { + "epoch": 0.82, + "learning_rate": 4.325068350593268e-05, + "logits/chosen": -1.7046520709991455, + "logits/rejected": -1.8137892484664917, + "logps/chosen": -179.74691772460938, + "logps/rejected": -195.8003387451172, + "loss": 0.7036, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.2236245721578598, + "rewards/margins": 0.15745726227760315, + "rewards/rejected": -0.38108178973197937, + "step": 625 + }, + { + "epoch": 0.82, + "learning_rate": 4.322617776296723e-05, + "logits/chosen": -1.7875136137008667, + "logits/rejected": -1.7463059425354004, + "logps/chosen": -189.130126953125, + "logps/rejected": -180.15403747558594, + "loss": 0.9103, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.34625834226608276, + "rewards/margins": -0.27761733531951904, + "rewards/rejected": -0.06864099949598312, + "step": 626 + }, + { + "epoch": 0.82, + "learning_rate": 4.320163458186961e-05, + "logits/chosen": -1.7227963209152222, + "logits/rejected": -1.6379646062850952, + "logps/chosen": -207.108154296875, + "logps/rejected": -187.44049072265625, + "loss": 0.7069, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.06443925201892853, + "rewards/margins": 0.11024240404367447, + "rewards/rejected": -0.1746816635131836, + "step": 627 + }, + { + "epoch": 0.82, + "learning_rate": 4.317705401305362e-05, + "logits/chosen": -1.6465739011764526, + "logits/rejected": -1.676656723022461, + "logps/chosen": -156.50994873046875, + "logps/rejected": -169.0661163330078, + "loss": 0.854, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.04478641599416733, + "rewards/margins": -0.16434717178344727, + "rewards/rejected": 0.2091335952281952, + "step": 628 + }, + { + "epoch": 0.82, + "learning_rate": 4.315243610700986e-05, + "logits/chosen": -1.869480848312378, + "logits/rejected": -1.8965606689453125, + "logps/chosen": -179.2183380126953, + "logps/rejected": -193.56800842285156, + "loss": 0.6107, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.12309933453798294, + "rewards/margins": 0.24142897129058838, + "rewards/rejected": -0.11832961440086365, + "step": 629 + }, + { + "epoch": 0.82, + "learning_rate": 4.312778091430563e-05, + "logits/chosen": -1.572332739830017, + "logits/rejected": -1.5489404201507568, + "logps/chosen": -185.81053161621094, + "logps/rejected": -180.34814453125, + "loss": 0.725, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.4762308597564697, + "rewards/margins": 0.07182511687278748, + "rewards/rejected": -0.5480560660362244, + "step": 630 + }, + { + "epoch": 0.83, + "learning_rate": 4.310308848558479e-05, + "logits/chosen": -1.7954251766204834, + "logits/rejected": -1.80027437210083, + "logps/chosen": -217.88711547851562, + "logps/rejected": -232.96463012695312, + "loss": 0.8256, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.10674677789211273, + "rewards/margins": -0.12963929772377014, + "rewards/rejected": 0.022892538458108902, + "step": 631 + }, + { + "epoch": 0.83, + "learning_rate": 4.3078358871567706e-05, + "logits/chosen": -1.759313702583313, + "logits/rejected": -1.8224774599075317, + "logps/chosen": -179.50665283203125, + "logps/rejected": -172.76654052734375, + "loss": 0.7918, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.04008585214614868, + "rewards/margins": 0.13793742656707764, + "rewards/rejected": -0.17802327871322632, + "step": 632 + }, + { + "epoch": 0.83, + "learning_rate": 4.305359212305115e-05, + "logits/chosen": -1.9337170124053955, + "logits/rejected": -1.9300730228424072, + "logps/chosen": -174.96153259277344, + "logps/rejected": -175.67324829101562, + "loss": 0.66, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.008273787796497345, + "rewards/margins": 0.14017948508262634, + "rewards/rejected": -0.1319057047367096, + "step": 633 + }, + { + "epoch": 0.83, + "learning_rate": 4.302878829090813e-05, + "logits/chosen": -1.779855728149414, + "logits/rejected": -1.770744800567627, + "logps/chosen": -196.88430786132812, + "logps/rejected": -185.92510986328125, + "loss": 0.6659, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.15623866021633148, + "rewards/margins": 0.16380999982357025, + "rewards/rejected": -0.32004866003990173, + "step": 634 + }, + { + "epoch": 0.83, + "learning_rate": 4.300394742608784e-05, + "logits/chosen": -1.756955623626709, + "logits/rejected": -1.8318352699279785, + "logps/chosen": -154.08811950683594, + "logps/rejected": -156.06085205078125, + "loss": 0.8138, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.16264663636684418, + "rewards/margins": 0.004404932260513306, + "rewards/rejected": -0.16705156862735748, + "step": 635 + }, + { + "epoch": 0.83, + "learning_rate": 4.2979069579615564e-05, + "logits/chosen": -1.8656206130981445, + "logits/rejected": -1.8203411102294922, + "logps/chosen": -188.10708618164062, + "logps/rejected": -183.7784423828125, + "loss": 0.9251, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.29391956329345703, + "rewards/margins": -0.2983461618423462, + "rewards/rejected": 0.004426578059792519, + "step": 636 + }, + { + "epoch": 0.83, + "learning_rate": 4.2954154802592514e-05, + "logits/chosen": -1.604241132736206, + "logits/rejected": -1.6417977809906006, + "logps/chosen": -159.251708984375, + "logps/rejected": -163.13052368164062, + "loss": 0.777, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3667724132537842, + "rewards/margins": -0.06517796963453293, + "rewards/rejected": -0.30159446597099304, + "step": 637 + }, + { + "epoch": 0.83, + "learning_rate": 4.292920314619578e-05, + "logits/chosen": -1.7538769245147705, + "logits/rejected": -1.7120615243911743, + "logps/chosen": -224.12762451171875, + "logps/rejected": -205.00430297851562, + "loss": 0.8706, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.10906004160642624, + "rewards/margins": -0.1597534716129303, + "rewards/rejected": 0.050693415105342865, + "step": 638 + }, + { + "epoch": 0.84, + "learning_rate": 4.290421466167822e-05, + "logits/chosen": -1.509183406829834, + "logits/rejected": -1.4262561798095703, + "logps/chosen": -180.26675415039062, + "logps/rejected": -181.98175048828125, + "loss": 0.9278, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.41150203347206116, + "rewards/margins": -0.36716675758361816, + "rewards/rejected": -0.044335294514894485, + "step": 639 + }, + { + "epoch": 0.84, + "learning_rate": 4.2879189400368314e-05, + "logits/chosen": -1.4397776126861572, + "logits/rejected": -1.3926661014556885, + "logps/chosen": -165.068603515625, + "logps/rejected": -178.24217224121094, + "loss": 0.8944, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2168307602405548, + "rewards/margins": -0.030826739966869354, + "rewards/rejected": 0.24765750765800476, + "step": 640 + }, + { + "epoch": 0.84, + "learning_rate": 4.2854127413670096e-05, + "logits/chosen": -1.8310662508010864, + "logits/rejected": -1.8567099571228027, + "logps/chosen": -173.8634490966797, + "logps/rejected": -170.68670654296875, + "loss": 0.8247, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.36099639534950256, + "rewards/margins": -0.10963311791419983, + "rewards/rejected": -0.25136324763298035, + "step": 641 + }, + { + "epoch": 0.84, + "learning_rate": 4.282902875306304e-05, + "logits/chosen": -1.6725221872329712, + "logits/rejected": -1.6123416423797607, + "logps/chosen": -210.78814697265625, + "logps/rejected": -199.3596649169922, + "loss": 0.7146, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2739197611808777, + "rewards/margins": 0.1191968321800232, + "rewards/rejected": -0.39311662316322327, + "step": 642 + }, + { + "epoch": 0.84, + "learning_rate": 4.280389347010194e-05, + "logits/chosen": -1.8086258172988892, + "logits/rejected": -1.818231225013733, + "logps/chosen": -175.2198944091797, + "logps/rejected": -176.08119201660156, + "loss": 0.8965, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.3234391212463379, + "rewards/margins": -0.2487168163061142, + "rewards/rejected": -0.07472231984138489, + "step": 643 + }, + { + "epoch": 0.84, + "learning_rate": 4.277872161641682e-05, + "logits/chosen": -1.8151034116744995, + "logits/rejected": -1.8139681816101074, + "logps/chosen": -160.6265869140625, + "logps/rejected": -169.05947875976562, + "loss": 0.8433, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.014796596020460129, + "rewards/margins": -0.17133474349975586, + "rewards/rejected": 0.18613135814666748, + "step": 644 + }, + { + "epoch": 0.84, + "learning_rate": 4.275351324371283e-05, + "logits/chosen": -1.8731780052185059, + "logits/rejected": -1.903558373451233, + "logps/chosen": -172.57882690429688, + "logps/rejected": -180.54104614257812, + "loss": 0.8243, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.4790874421596527, + "rewards/margins": -0.1846233606338501, + "rewards/rejected": -0.2944641411304474, + "step": 645 + }, + { + "epoch": 0.85, + "learning_rate": 4.2728268403770145e-05, + "logits/chosen": -1.8170170783996582, + "logits/rejected": -1.8452297449111938, + "logps/chosen": -172.42489624023438, + "logps/rejected": -174.13865661621094, + "loss": 0.9024, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.345039427280426, + "rewards/margins": -0.303177148103714, + "rewards/rejected": -0.041862305253744125, + "step": 646 + }, + { + "epoch": 0.85, + "learning_rate": 4.270298714844381e-05, + "logits/chosen": -1.7615243196487427, + "logits/rejected": -1.8334003686904907, + "logps/chosen": -166.7871551513672, + "logps/rejected": -179.83578491210938, + "loss": 0.7427, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.3320530951023102, + "rewards/margins": -0.020228669047355652, + "rewards/rejected": -0.3118244409561157, + "step": 647 + }, + { + "epoch": 0.85, + "learning_rate": 4.267766952966369e-05, + "logits/chosen": -1.7627480030059814, + "logits/rejected": -1.724000334739685, + "logps/chosen": -168.9439697265625, + "logps/rejected": -163.2694854736328, + "loss": 0.8367, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.42008066177368164, + "rewards/margins": -0.16962656378746033, + "rewards/rejected": -0.2504541277885437, + "step": 648 + }, + { + "epoch": 0.85, + "learning_rate": 4.2652315599434354e-05, + "logits/chosen": -1.666603922843933, + "logits/rejected": -1.6485559940338135, + "logps/chosen": -159.96652221679688, + "logps/rejected": -157.31324768066406, + "loss": 0.7431, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.16877800226211548, + "rewards/margins": 0.033473365008831024, + "rewards/rejected": -0.2022513449192047, + "step": 649 + }, + { + "epoch": 0.85, + "learning_rate": 4.262692540983496e-05, + "logits/chosen": -1.6999112367630005, + "logits/rejected": -1.7737563848495483, + "logps/chosen": -156.45681762695312, + "logps/rejected": -185.85430908203125, + "loss": 0.7336, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.33198750019073486, + "rewards/margins": -0.019247818738222122, + "rewards/rejected": -0.31273967027664185, + "step": 650 + }, + { + "epoch": 0.85, + "learning_rate": 4.2601499013019126e-05, + "logits/chosen": -1.680021047592163, + "logits/rejected": -1.7226678133010864, + "logps/chosen": -156.96827697753906, + "logps/rejected": -159.31240844726562, + "loss": 0.7094, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.1396864801645279, + "rewards/margins": 0.11252421140670776, + "rewards/rejected": -0.25221067667007446, + "step": 651 + }, + { + "epoch": 0.85, + "learning_rate": 4.257603646121484e-05, + "logits/chosen": -1.6294937133789062, + "logits/rejected": -1.6531920433044434, + "logps/chosen": -194.5813446044922, + "logps/rejected": -193.0689697265625, + "loss": 0.8568, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.348048597574234, + "rewards/margins": -0.0678197517991066, + "rewards/rejected": -0.280228853225708, + "step": 652 + }, + { + "epoch": 0.85, + "learning_rate": 4.2550537806724384e-05, + "logits/chosen": -1.653900146484375, + "logits/rejected": -1.725424885749817, + "logps/chosen": -188.66009521484375, + "logps/rejected": -189.3316650390625, + "loss": 0.6937, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.10200534015893936, + "rewards/margins": 0.19126522541046143, + "rewards/rejected": -0.2932705581188202, + "step": 653 + }, + { + "epoch": 0.86, + "learning_rate": 4.2525003101924164e-05, + "logits/chosen": -1.7711960077285767, + "logits/rejected": -1.7922366857528687, + "logps/chosen": -188.667724609375, + "logps/rejected": -199.7714080810547, + "loss": 0.8373, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.46268340945243835, + "rewards/margins": -0.047878868877887726, + "rewards/rejected": -0.4148045480251312, + "step": 654 + }, + { + "epoch": 0.86, + "learning_rate": 4.249943239926467e-05, + "logits/chosen": -1.870734691619873, + "logits/rejected": -1.888472080230713, + "logps/chosen": -173.50595092773438, + "logps/rejected": -187.27886962890625, + "loss": 0.7626, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.11934824287891388, + "rewards/margins": 0.06529253721237183, + "rewards/rejected": -0.1846407949924469, + "step": 655 + }, + { + "epoch": 0.86, + "learning_rate": 4.247382575127031e-05, + "logits/chosen": -1.6958601474761963, + "logits/rejected": -1.657966136932373, + "logps/chosen": -176.43829345703125, + "logps/rejected": -214.62869262695312, + "loss": 0.6291, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.09161286056041718, + "rewards/margins": 0.4105958044528961, + "rewards/rejected": -0.5022085905075073, + "step": 656 + }, + { + "epoch": 0.86, + "learning_rate": 4.2448183210539334e-05, + "logits/chosen": -1.628333568572998, + "logits/rejected": -1.6449060440063477, + "logps/chosen": -148.4341583251953, + "logps/rejected": -144.66708374023438, + "loss": 0.6733, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.00030353665351867676, + "rewards/margins": 0.1996443122625351, + "rewards/rejected": -0.19994783401489258, + "step": 657 + }, + { + "epoch": 0.86, + "learning_rate": 4.2422504829743724e-05, + "logits/chosen": -1.765242099761963, + "logits/rejected": -1.7532581090927124, + "logps/chosen": -164.06983947753906, + "logps/rejected": -183.4119110107422, + "loss": 0.4436, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08075151592493057, + "rewards/margins": 0.8164087533950806, + "rewards/rejected": -0.7356572151184082, + "step": 658 + }, + { + "epoch": 0.86, + "learning_rate": 4.239679066162907e-05, + "logits/chosen": -1.910383701324463, + "logits/rejected": -1.8477028608322144, + "logps/chosen": -168.18246459960938, + "logps/rejected": -175.83685302734375, + "loss": 0.727, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.29001450538635254, + "rewards/margins": 0.1458943635225296, + "rewards/rejected": -0.43590888381004333, + "step": 659 + }, + { + "epoch": 0.86, + "learning_rate": 4.237104075901449e-05, + "logits/chosen": -1.5762726068496704, + "logits/rejected": -1.6210079193115234, + "logps/chosen": -202.59307861328125, + "logps/rejected": -213.10899353027344, + "loss": 0.7975, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4130185544490814, + "rewards/margins": -0.1117025762796402, + "rewards/rejected": -0.30131596326828003, + "step": 660 + }, + { + "epoch": 0.87, + "learning_rate": 4.234525517479248e-05, + "logits/chosen": -1.8088970184326172, + "logits/rejected": -1.833585262298584, + "logps/chosen": -168.6361541748047, + "logps/rejected": -173.4584503173828, + "loss": 0.7173, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.29294320940971375, + "rewards/margins": 0.052368972450494766, + "rewards/rejected": -0.3453121483325958, + "step": 661 + }, + { + "epoch": 0.87, + "learning_rate": 4.2319433961928844e-05, + "logits/chosen": -1.7081291675567627, + "logits/rejected": -1.7482770681381226, + "logps/chosen": -165.43646240234375, + "logps/rejected": -193.29000854492188, + "loss": 0.8157, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.42327651381492615, + "rewards/margins": -0.0674619972705841, + "rewards/rejected": -0.35581451654434204, + "step": 662 + }, + { + "epoch": 0.87, + "learning_rate": 4.229357717346257e-05, + "logits/chosen": -1.9136518239974976, + "logits/rejected": -1.855428695678711, + "logps/chosen": -190.4178466796875, + "logps/rejected": -188.52420043945312, + "loss": 0.7248, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.10876584053039551, + "rewards/margins": 0.00017318874597549438, + "rewards/rejected": -0.1089390367269516, + "step": 663 + }, + { + "epoch": 0.87, + "learning_rate": 4.226768486250572e-05, + "logits/chosen": -1.8013920783996582, + "logits/rejected": -1.8036880493164062, + "logps/chosen": -194.70623779296875, + "logps/rejected": -216.53485107421875, + "loss": 0.6308, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5092557668685913, + "rewards/margins": 0.299633264541626, + "rewards/rejected": -0.8088890314102173, + "step": 664 + }, + { + "epoch": 0.87, + "learning_rate": 4.224175708224332e-05, + "logits/chosen": -1.7224409580230713, + "logits/rejected": -1.718071460723877, + "logps/chosen": -187.01220703125, + "logps/rejected": -182.26296997070312, + "loss": 0.8315, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.3375519812107086, + "rewards/margins": -0.13301782310009003, + "rewards/rejected": -0.2045341432094574, + "step": 665 + }, + { + "epoch": 0.87, + "learning_rate": 4.221579388593326e-05, + "logits/chosen": -1.7278739213943481, + "logits/rejected": -1.7006864547729492, + "logps/chosen": -178.83615112304688, + "logps/rejected": -185.47445678710938, + "loss": 0.7341, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.32711875438690186, + "rewards/margins": 0.09781965613365173, + "rewards/rejected": -0.4249383807182312, + "step": 666 + }, + { + "epoch": 0.87, + "learning_rate": 4.218979532690616e-05, + "logits/chosen": -2.025843620300293, + "logits/rejected": -2.0172643661499023, + "logps/chosen": -165.31512451171875, + "logps/rejected": -160.6278839111328, + "loss": 0.7005, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4214673638343811, + "rewards/margins": 0.03614773973822594, + "rewards/rejected": -0.45761507749557495, + "step": 667 + }, + { + "epoch": 0.87, + "learning_rate": 4.216376145856529e-05, + "logits/chosen": -1.802656888961792, + "logits/rejected": -1.771694302558899, + "logps/chosen": -196.45603942871094, + "logps/rejected": -202.7981414794922, + "loss": 0.5865, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3090572953224182, + "rewards/margins": 0.4084371328353882, + "rewards/rejected": -0.7174944877624512, + "step": 668 + }, + { + "epoch": 0.88, + "learning_rate": 4.213769233438646e-05, + "logits/chosen": -1.7967143058776855, + "logits/rejected": -1.772080898284912, + "logps/chosen": -237.03370666503906, + "logps/rejected": -236.46771240234375, + "loss": 0.8748, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5695147514343262, + "rewards/margins": -0.10085253417491913, + "rewards/rejected": -0.46866220235824585, + "step": 669 + }, + { + "epoch": 0.88, + "learning_rate": 4.211158800791788e-05, + "logits/chosen": -1.9533357620239258, + "logits/rejected": -1.9477338790893555, + "logps/chosen": -195.1766357421875, + "logps/rejected": -168.0863037109375, + "loss": 0.947, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6404910087585449, + "rewards/margins": -0.25117361545562744, + "rewards/rejected": -0.3893173635005951, + "step": 670 + }, + { + "epoch": 0.88, + "learning_rate": 4.208544853278008e-05, + "logits/chosen": -1.7536804676055908, + "logits/rejected": -1.7879080772399902, + "logps/chosen": -167.13681030273438, + "logps/rejected": -163.1280517578125, + "loss": 0.9277, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6327708959579468, + "rewards/margins": -0.3013594448566437, + "rewards/rejected": -0.3314114212989807, + "step": 671 + }, + { + "epoch": 0.88, + "learning_rate": 4.205927396266577e-05, + "logits/chosen": -1.923248291015625, + "logits/rejected": -1.9297988414764404, + "logps/chosen": -206.3466796875, + "logps/rejected": -210.54095458984375, + "loss": 0.8353, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.27377229928970337, + "rewards/margins": 0.06531517207622528, + "rewards/rejected": -0.33908745646476746, + "step": 672 + }, + { + "epoch": 0.88, + "learning_rate": 4.203306435133978e-05, + "logits/chosen": -1.7622315883636475, + "logits/rejected": -1.673543930053711, + "logps/chosen": -205.08291625976562, + "logps/rejected": -207.41424560546875, + "loss": 0.8054, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.26307201385498047, + "rewards/margins": 0.005908198654651642, + "rewards/rejected": -0.2689802348613739, + "step": 673 + }, + { + "epoch": 0.88, + "learning_rate": 4.200681975263888e-05, + "logits/chosen": -2.055384635925293, + "logits/rejected": -2.045701503753662, + "logps/chosen": -180.04840087890625, + "logps/rejected": -169.86734008789062, + "loss": 0.8382, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.41850394010543823, + "rewards/margins": -0.2109215408563614, + "rewards/rejected": -0.20758239924907684, + "step": 674 + }, + { + "epoch": 0.88, + "learning_rate": 4.1980540220471744e-05, + "logits/chosen": -1.940061092376709, + "logits/rejected": -1.9028154611587524, + "logps/chosen": -156.75413513183594, + "logps/rejected": -165.96414184570312, + "loss": 1.0616, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.19156816601753235, + "rewards/margins": -0.4865376949310303, + "rewards/rejected": 0.29496949911117554, + "step": 675 + }, + { + "epoch": 0.88, + "learning_rate": 4.195422580881878e-05, + "logits/chosen": -1.651497721672058, + "logits/rejected": -1.6224371194839478, + "logps/chosen": -193.23190307617188, + "logps/rejected": -176.51651000976562, + "loss": 0.7193, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.344645231962204, + "rewards/margins": 0.07098521292209625, + "rewards/rejected": -0.41563040018081665, + "step": 676 + }, + { + "epoch": 0.89, + "learning_rate": 4.192787657173204e-05, + "logits/chosen": -1.779762625694275, + "logits/rejected": -1.8038190603256226, + "logps/chosen": -172.65908813476562, + "logps/rejected": -175.31455993652344, + "loss": 0.775, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.49525824189186096, + "rewards/margins": -0.10039810091257095, + "rewards/rejected": -0.394860178232193, + "step": 677 + }, + { + "epoch": 0.89, + "learning_rate": 4.1901492563335115e-05, + "logits/chosen": -2.0335657596588135, + "logits/rejected": -1.9739696979522705, + "logps/chosen": -209.03280639648438, + "logps/rejected": -197.73880004882812, + "loss": 0.9794, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.6130102872848511, + "rewards/margins": -0.4307286739349365, + "rewards/rejected": -0.18228159844875336, + "step": 678 + }, + { + "epoch": 0.89, + "learning_rate": 4.187507383782303e-05, + "logits/chosen": -1.7845653295516968, + "logits/rejected": -1.8473167419433594, + "logps/chosen": -175.36248779296875, + "logps/rejected": -192.36488342285156, + "loss": 0.6858, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.3342759311199188, + "rewards/margins": 0.16146346926689148, + "rewards/rejected": -0.4957394003868103, + "step": 679 + }, + { + "epoch": 0.89, + "learning_rate": 4.1848620449462115e-05, + "logits/chosen": -1.681877613067627, + "logits/rejected": -1.7026312351226807, + "logps/chosen": -193.92356872558594, + "logps/rejected": -171.92117309570312, + "loss": 0.9088, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.47164231538772583, + "rewards/margins": -0.34437641501426697, + "rewards/rejected": -0.12726587057113647, + "step": 680 + }, + { + "epoch": 0.89, + "learning_rate": 4.1822132452589885e-05, + "logits/chosen": -2.0411343574523926, + "logits/rejected": -2.014064311981201, + "logps/chosen": -177.00779724121094, + "logps/rejected": -179.8595428466797, + "loss": 0.559, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.27876853942871094, + "rewards/margins": 0.3328251242637634, + "rewards/rejected": -0.6115936636924744, + "step": 681 + }, + { + "epoch": 0.89, + "learning_rate": 4.1795609901614966e-05, + "logits/chosen": -1.8736965656280518, + "logits/rejected": -1.8809561729431152, + "logps/chosen": -176.62069702148438, + "logps/rejected": -183.5467529296875, + "loss": 0.7303, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2544437348842621, + "rewards/margins": 0.08209258317947388, + "rewards/rejected": -0.33653631806373596, + "step": 682 + }, + { + "epoch": 0.89, + "learning_rate": 4.176905285101695e-05, + "logits/chosen": -1.7462670803070068, + "logits/rejected": -1.761965036392212, + "logps/chosen": -151.63314819335938, + "logps/rejected": -151.56427001953125, + "loss": 0.6546, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3507009446620941, + "rewards/margins": 0.17545562982559204, + "rewards/rejected": -0.5261565446853638, + "step": 683 + }, + { + "epoch": 0.9, + "learning_rate": 4.17424613553463e-05, + "logits/chosen": -1.8288514614105225, + "logits/rejected": -1.859897255897522, + "logps/chosen": -171.83522033691406, + "logps/rejected": -191.14358520507812, + "loss": 0.7507, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.28291165828704834, + "rewards/margins": -0.012934118509292603, + "rewards/rejected": -0.26997753977775574, + "step": 684 + }, + { + "epoch": 0.9, + "learning_rate": 4.171583546922423e-05, + "logits/chosen": -1.8343758583068848, + "logits/rejected": -1.8267977237701416, + "logps/chosen": -135.20872497558594, + "logps/rejected": -134.46913146972656, + "loss": 0.7083, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.1251545250415802, + "rewards/margins": 0.06863637268543243, + "rewards/rejected": -0.19379091262817383, + "step": 685 + }, + { + "epoch": 0.9, + "learning_rate": 4.1689175247342584e-05, + "logits/chosen": -1.9471396207809448, + "logits/rejected": -1.9692058563232422, + "logps/chosen": -192.59835815429688, + "logps/rejected": -198.68960571289062, + "loss": 0.6293, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2941395044326782, + "rewards/margins": 0.24021106958389282, + "rewards/rejected": -0.5343505144119263, + "step": 686 + }, + { + "epoch": 0.9, + "learning_rate": 4.1662480744463744e-05, + "logits/chosen": -1.9480713605880737, + "logits/rejected": -1.9755477905273438, + "logps/chosen": -179.84698486328125, + "logps/rejected": -167.24124145507812, + "loss": 0.7688, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.4756694436073303, + "rewards/margins": 0.07222910225391388, + "rewards/rejected": -0.547898530960083, + "step": 687 + }, + { + "epoch": 0.9, + "learning_rate": 4.163575201542052e-05, + "logits/chosen": -1.9736183881759644, + "logits/rejected": -1.9988312721252441, + "logps/chosen": -155.44598388671875, + "logps/rejected": -169.1442108154297, + "loss": 0.6985, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.0029303748160600662, + "rewards/margins": 0.2341531217098236, + "rewards/rejected": -0.23708350956439972, + "step": 688 + }, + { + "epoch": 0.9, + "learning_rate": 4.1608989115116e-05, + "logits/chosen": -1.8959014415740967, + "logits/rejected": -1.9003026485443115, + "logps/chosen": -191.73049926757812, + "logps/rejected": -210.97207641601562, + "loss": 0.8625, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.14525511860847473, + "rewards/margins": -0.15813013911247253, + "rewards/rejected": 0.012875035405158997, + "step": 689 + }, + { + "epoch": 0.9, + "learning_rate": 4.158219209852349e-05, + "logits/chosen": -1.875536561012268, + "logits/rejected": -1.8144606351852417, + "logps/chosen": -183.92649841308594, + "logps/rejected": -184.21307373046875, + "loss": 0.8123, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.3249794840812683, + "rewards/margins": -0.07503964751958847, + "rewards/rejected": -0.24993987381458282, + "step": 690 + }, + { + "epoch": 0.9, + "learning_rate": 4.155536102068636e-05, + "logits/chosen": -1.874202013015747, + "logits/rejected": -1.906548261642456, + "logps/chosen": -194.843505859375, + "logps/rejected": -206.90777587890625, + "loss": 0.8466, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.4885198473930359, + "rewards/margins": -0.20483222603797913, + "rewards/rejected": -0.28368765115737915, + "step": 691 + }, + { + "epoch": 0.91, + "learning_rate": 4.152849593671793e-05, + "logits/chosen": -1.8392844200134277, + "logits/rejected": -1.8655755519866943, + "logps/chosen": -232.13516235351562, + "logps/rejected": -248.52139282226562, + "loss": 0.8003, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.466138631105423, + "rewards/margins": -0.003687852993607521, + "rewards/rejected": -0.46245077252388, + "step": 692 + }, + { + "epoch": 0.91, + "learning_rate": 4.1501596901801384e-05, + "logits/chosen": -1.7795881032943726, + "logits/rejected": -1.749732255935669, + "logps/chosen": -207.29209899902344, + "logps/rejected": -188.52491760253906, + "loss": 0.7602, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2848435938358307, + "rewards/margins": 0.07350137829780579, + "rewards/rejected": -0.3583449721336365, + "step": 693 + }, + { + "epoch": 0.91, + "learning_rate": 4.147466397118968e-05, + "logits/chosen": -1.8992621898651123, + "logits/rejected": -1.870530128479004, + "logps/chosen": -291.69342041015625, + "logps/rejected": -304.0249938964844, + "loss": 0.7225, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.4635903239250183, + "rewards/margins": 0.08878035098314285, + "rewards/rejected": -0.5523706674575806, + "step": 694 + }, + { + "epoch": 0.91, + "learning_rate": 4.144769720020533e-05, + "logits/chosen": -1.7536263465881348, + "logits/rejected": -1.779775619506836, + "logps/chosen": -216.75164794921875, + "logps/rejected": -257.4428405761719, + "loss": 0.8723, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.7266858220100403, + "rewards/margins": -0.09885367751121521, + "rewards/rejected": -0.6278320550918579, + "step": 695 + }, + { + "epoch": 0.91, + "learning_rate": 4.142069664424041e-05, + "logits/chosen": -1.9797451496124268, + "logits/rejected": -1.9267971515655518, + "logps/chosen": -165.46441650390625, + "logps/rejected": -167.7301483154297, + "loss": 0.7904, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5045456290245056, + "rewards/margins": -0.02093798667192459, + "rewards/rejected": -0.4836076498031616, + "step": 696 + }, + { + "epoch": 0.91, + "learning_rate": 4.139366235875637e-05, + "logits/chosen": -1.8825287818908691, + "logits/rejected": -1.8622865676879883, + "logps/chosen": -185.31285095214844, + "logps/rejected": -207.89364624023438, + "loss": 0.8387, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.23477278649806976, + "rewards/margins": -0.08795370161533356, + "rewards/rejected": -0.146819069981575, + "step": 697 + }, + { + "epoch": 0.91, + "learning_rate": 4.136659439928397e-05, + "logits/chosen": -1.8915377855300903, + "logits/rejected": -1.8720455169677734, + "logps/chosen": -194.33871459960938, + "logps/rejected": -203.15911865234375, + "loss": 0.5884, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.17537376284599304, + "rewards/margins": 0.34904515743255615, + "rewards/rejected": -0.17367137968540192, + "step": 698 + }, + { + "epoch": 0.91, + "learning_rate": 4.13394928214231e-05, + "logits/chosen": -1.3153133392333984, + "logits/rejected": -1.2558776140213013, + "logps/chosen": -226.58245849609375, + "logps/rejected": -229.76478576660156, + "loss": 0.7587, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.569631040096283, + "rewards/margins": -0.02661065012216568, + "rewards/rejected": -0.5430203676223755, + "step": 699 + }, + { + "epoch": 0.92, + "learning_rate": 4.1312357680842735e-05, + "logits/chosen": -1.6899161338806152, + "logits/rejected": -1.8224129676818848, + "logps/chosen": -157.56524658203125, + "logps/rejected": -175.26402282714844, + "loss": 0.8479, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6318594813346863, + "rewards/margins": -0.13617253303527832, + "rewards/rejected": -0.49568694829940796, + "step": 700 + }, + { + "epoch": 0.92, + "learning_rate": 4.128518903328078e-05, + "logits/chosen": -1.964836597442627, + "logits/rejected": -1.9238197803497314, + "logps/chosen": -153.36062622070312, + "logps/rejected": -157.65313720703125, + "loss": 0.7669, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5114039778709412, + "rewards/margins": 0.02798408642411232, + "rewards/rejected": -0.5393880605697632, + "step": 701 + }, + { + "epoch": 0.92, + "learning_rate": 4.125798693454396e-05, + "logits/chosen": -2.0570058822631836, + "logits/rejected": -2.0080535411834717, + "logps/chosen": -194.94296264648438, + "logps/rejected": -177.8400421142578, + "loss": 0.752, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.22861789166927338, + "rewards/margins": 0.06845703721046448, + "rewards/rejected": -0.29707497358322144, + "step": 702 + }, + { + "epoch": 0.92, + "learning_rate": 4.123075144050772e-05, + "logits/chosen": -1.79075026512146, + "logits/rejected": -1.7762547731399536, + "logps/chosen": -182.18724060058594, + "logps/rejected": -171.6795654296875, + "loss": 0.8897, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5534724593162537, + "rewards/margins": -0.30023112893104553, + "rewards/rejected": -0.25324133038520813, + "step": 703 + }, + { + "epoch": 0.92, + "learning_rate": 4.120348260711611e-05, + "logits/chosen": -1.4944194555282593, + "logits/rejected": -1.5302600860595703, + "logps/chosen": -200.65188598632812, + "logps/rejected": -205.47659301757812, + "loss": 0.6494, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6325050592422485, + "rewards/margins": 0.22558629512786865, + "rewards/rejected": -0.8580912351608276, + "step": 704 + }, + { + "epoch": 0.92, + "learning_rate": 4.117618049038165e-05, + "logits/chosen": -1.6539419889450073, + "logits/rejected": -1.763761043548584, + "logps/chosen": -204.7454376220703, + "logps/rejected": -203.48162841796875, + "loss": 0.8093, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.12654227018356323, + "rewards/margins": -0.014543063938617706, + "rewards/rejected": -0.11199923604726791, + "step": 705 + }, + { + "epoch": 0.92, + "learning_rate": 4.1148845146385214e-05, + "logits/chosen": -1.9443776607513428, + "logits/rejected": -1.9469598531723022, + "logps/chosen": -160.744140625, + "logps/rejected": -189.53492736816406, + "loss": 0.9369, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.3124936819076538, + "rewards/margins": -0.24969440698623657, + "rewards/rejected": -0.06279925256967545, + "step": 706 + }, + { + "epoch": 0.93, + "learning_rate": 4.112147663127596e-05, + "logits/chosen": -1.9382710456848145, + "logits/rejected": -1.9214251041412354, + "logps/chosen": -222.22467041015625, + "logps/rejected": -222.050537109375, + "loss": 0.8019, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.11056547611951828, + "rewards/margins": -0.16465231776237488, + "rewards/rejected": 0.054086834192276, + "step": 707 + }, + { + "epoch": 0.93, + "learning_rate": 4.109407500127116e-05, + "logits/chosen": -1.9728318452835083, + "logits/rejected": -2.0126798152923584, + "logps/chosen": -215.3148651123047, + "logps/rejected": -226.9068603515625, + "loss": 0.6567, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.04354026913642883, + "rewards/margins": 0.456782728433609, + "rewards/rejected": -0.4132424294948578, + "step": 708 + }, + { + "epoch": 0.93, + "learning_rate": 4.106664031265611e-05, + "logits/chosen": -2.0477206707000732, + "logits/rejected": -2.063199043273926, + "logps/chosen": -175.30618286132812, + "logps/rejected": -174.9276885986328, + "loss": 0.6284, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07596893608570099, + "rewards/margins": 0.2849644720554352, + "rewards/rejected": -0.36093342304229736, + "step": 709 + }, + { + "epoch": 0.93, + "learning_rate": 4.103917262178402e-05, + "logits/chosen": -1.9526329040527344, + "logits/rejected": -2.0106678009033203, + "logps/chosen": -168.55447387695312, + "logps/rejected": -186.49818420410156, + "loss": 0.6474, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09987005591392517, + "rewards/margins": 0.15396371483802795, + "rewards/rejected": -0.2538337707519531, + "step": 710 + }, + { + "epoch": 0.93, + "learning_rate": 4.1011671985075865e-05, + "logits/chosen": -1.7663332223892212, + "logits/rejected": -1.7783488035202026, + "logps/chosen": -147.06881713867188, + "logps/rejected": -149.45726013183594, + "loss": 0.6022, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.16052348911762238, + "rewards/margins": 0.24407917261123657, + "rewards/rejected": -0.08355572074651718, + "step": 711 + }, + { + "epoch": 0.93, + "learning_rate": 4.098413845902033e-05, + "logits/chosen": -1.698045253753662, + "logits/rejected": -1.800790786743164, + "logps/chosen": -167.90267944335938, + "logps/rejected": -177.54620361328125, + "loss": 0.8036, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.11254360526800156, + "rewards/margins": -0.024379856884479523, + "rewards/rejected": -0.08816378563642502, + "step": 712 + }, + { + "epoch": 0.93, + "learning_rate": 4.095657210017364e-05, + "logits/chosen": -1.8730559349060059, + "logits/rejected": -1.8630263805389404, + "logps/chosen": -178.40362548828125, + "logps/rejected": -168.40597534179688, + "loss": 0.7513, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.1604912132024765, + "rewards/margins": -0.03123726323246956, + "rewards/rejected": 0.19172844290733337, + "step": 713 + }, + { + "epoch": 0.93, + "learning_rate": 4.092897296515944e-05, + "logits/chosen": -1.9024522304534912, + "logits/rejected": -1.9056739807128906, + "logps/chosen": -167.58963012695312, + "logps/rejected": -177.44297790527344, + "loss": 0.9506, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.1954386681318283, + "rewards/margins": -0.3931421637535095, + "rewards/rejected": 0.1977035105228424, + "step": 714 + }, + { + "epoch": 0.94, + "learning_rate": 4.090134111066874e-05, + "logits/chosen": -1.9650360345840454, + "logits/rejected": -2.005605459213257, + "logps/chosen": -168.1615447998047, + "logps/rejected": -198.7694549560547, + "loss": 0.7222, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.17665693163871765, + "rewards/margins": 0.07247452437877655, + "rewards/rejected": 0.1041824072599411, + "step": 715 + }, + { + "epoch": 0.94, + "learning_rate": 4.0873676593459725e-05, + "logits/chosen": -1.9938017129898071, + "logits/rejected": -1.9984208345413208, + "logps/chosen": -162.42727661132812, + "logps/rejected": -154.55877685546875, + "loss": 0.9037, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.017479307949543, + "rewards/margins": -0.1982007771730423, + "rewards/rejected": 0.18072140216827393, + "step": 716 + }, + { + "epoch": 0.94, + "learning_rate": 4.08459794703577e-05, + "logits/chosen": -1.8340137004852295, + "logits/rejected": -1.7937979698181152, + "logps/chosen": -163.39852905273438, + "logps/rejected": -163.37271118164062, + "loss": 0.8219, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0151270292699337, + "rewards/margins": -0.09924013912677765, + "rewards/rejected": 0.11436714231967926, + "step": 717 + }, + { + "epoch": 0.94, + "learning_rate": 4.081824979825492e-05, + "logits/chosen": -1.834256649017334, + "logits/rejected": -1.7396763563156128, + "logps/chosen": -178.6987762451172, + "logps/rejected": -186.36557006835938, + "loss": 0.7235, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.19348809123039246, + "rewards/margins": 0.030947115272283554, + "rewards/rejected": -0.2244352102279663, + "step": 718 + }, + { + "epoch": 0.94, + "learning_rate": 4.07904876341105e-05, + "logits/chosen": -1.9482046365737915, + "logits/rejected": -1.9566956758499146, + "logps/chosen": -165.24465942382812, + "logps/rejected": -194.7000274658203, + "loss": 0.6016, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.015304666012525558, + "rewards/margins": 0.2807159423828125, + "rewards/rejected": -0.26541128754615784, + "step": 719 + }, + { + "epoch": 0.94, + "learning_rate": 4.076269303495033e-05, + "logits/chosen": -1.6069618463516235, + "logits/rejected": -1.5738195180892944, + "logps/chosen": -218.8490447998047, + "logps/rejected": -214.1678466796875, + "loss": 0.693, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.12460929155349731, + "rewards/margins": 0.16634413599967957, + "rewards/rejected": -0.04173481464385986, + "step": 720 + }, + { + "epoch": 0.94, + "learning_rate": 4.073486605786689e-05, + "logits/chosen": -1.8867998123168945, + "logits/rejected": -1.9408642053604126, + "logps/chosen": -162.95245361328125, + "logps/rejected": -180.93423461914062, + "loss": 0.7724, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.036885879933834076, + "rewards/margins": -0.006463900208473206, + "rewards/rejected": -0.030421972274780273, + "step": 721 + }, + { + "epoch": 0.94, + "learning_rate": 4.0707006760019175e-05, + "logits/chosen": -1.9228891134262085, + "logits/rejected": -1.9769740104675293, + "logps/chosen": -168.56643676757812, + "logps/rejected": -176.51063537597656, + "loss": 0.6297, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.20986348390579224, + "rewards/margins": 0.29382628202438354, + "rewards/rejected": -0.08396277576684952, + "step": 722 + }, + { + "epoch": 0.95, + "learning_rate": 4.067911519863257e-05, + "logits/chosen": -1.6868350505828857, + "logits/rejected": -1.666567087173462, + "logps/chosen": -173.6453399658203, + "logps/rejected": -166.50987243652344, + "loss": 0.7459, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.21088244020938873, + "rewards/margins": 0.03544869273900986, + "rewards/rejected": -0.246331125497818, + "step": 723 + }, + { + "epoch": 0.95, + "learning_rate": 4.065119143099874e-05, + "logits/chosen": -1.78118896484375, + "logits/rejected": -1.769471526145935, + "logps/chosen": -200.60067749023438, + "logps/rejected": -214.09454345703125, + "loss": 0.7429, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5265517830848694, + "rewards/margins": 0.10783174633979797, + "rewards/rejected": -0.6343836188316345, + "step": 724 + }, + { + "epoch": 0.95, + "learning_rate": 4.062323551447549e-05, + "logits/chosen": -1.8144245147705078, + "logits/rejected": -1.7792181968688965, + "logps/chosen": -141.9656524658203, + "logps/rejected": -135.1618194580078, + "loss": 0.748, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.021636370569467545, + "rewards/margins": -0.00877220556139946, + "rewards/rejected": -0.012864157557487488, + "step": 725 + }, + { + "epoch": 0.95, + "learning_rate": 4.059524750648668e-05, + "logits/chosen": -1.7768784761428833, + "logits/rejected": -1.7305599451065063, + "logps/chosen": -186.06993103027344, + "logps/rejected": -194.9784698486328, + "loss": 0.7282, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.07539110630750656, + "rewards/margins": 0.016810424625873566, + "rewards/rejected": -0.09220151603221893, + "step": 726 + }, + { + "epoch": 0.95, + "learning_rate": 4.056722746452207e-05, + "logits/chosen": -1.8889234066009521, + "logits/rejected": -1.9104032516479492, + "logps/chosen": -265.5204772949219, + "logps/rejected": -279.9514465332031, + "loss": 0.6895, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.13372832536697388, + "rewards/margins": 0.1461489200592041, + "rewards/rejected": -0.279877245426178, + "step": 727 + }, + { + "epoch": 0.95, + "learning_rate": 4.053917544613723e-05, + "logits/chosen": -1.8855706453323364, + "logits/rejected": -1.9057148694992065, + "logps/chosen": -186.10189819335938, + "logps/rejected": -186.93589782714844, + "loss": 0.9102, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.23901258409023285, + "rewards/margins": -0.2926080822944641, + "rewards/rejected": 0.05359550192952156, + "step": 728 + }, + { + "epoch": 0.95, + "learning_rate": 4.051109150895343e-05, + "logits/chosen": -1.8091373443603516, + "logits/rejected": -1.8701030015945435, + "logps/chosen": -217.9191131591797, + "logps/rejected": -178.51612854003906, + "loss": 0.8551, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.050105515867471695, + "rewards/margins": -0.13900446891784668, + "rewards/rejected": 0.18911001086235046, + "step": 729 + }, + { + "epoch": 0.96, + "learning_rate": 4.0482975710657455e-05, + "logits/chosen": -1.8592071533203125, + "logits/rejected": -1.8811057806015015, + "logps/chosen": -168.57730102539062, + "logps/rejected": -204.54586791992188, + "loss": 0.7531, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.08510392904281616, + "rewards/margins": 0.1694384664297104, + "rewards/rejected": -0.25454244017601013, + "step": 730 + }, + { + "epoch": 0.96, + "learning_rate": 4.045482810900159e-05, + "logits/chosen": -2.088895797729492, + "logits/rejected": -2.0842466354370117, + "logps/chosen": -177.7628631591797, + "logps/rejected": -171.59335327148438, + "loss": 0.5767, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.16781626641750336, + "rewards/margins": 0.34129244089126587, + "rewards/rejected": -0.17347615957260132, + "step": 731 + }, + { + "epoch": 0.96, + "learning_rate": 4.042664876180341e-05, + "logits/chosen": -2.0007247924804688, + "logits/rejected": -1.9096792936325073, + "logps/chosen": -180.6066131591797, + "logps/rejected": -170.75169372558594, + "loss": 0.6665, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.059895459562540054, + "rewards/margins": 0.11071177572011948, + "rewards/rejected": -0.17060723900794983, + "step": 732 + }, + { + "epoch": 0.96, + "learning_rate": 4.0398437726945716e-05, + "logits/chosen": -1.975681185722351, + "logits/rejected": -1.9806488752365112, + "logps/chosen": -190.25149536132812, + "logps/rejected": -190.06532287597656, + "loss": 0.5606, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.14021314680576324, + "rewards/margins": 0.528197169303894, + "rewards/rejected": -0.3879840672016144, + "step": 733 + }, + { + "epoch": 0.96, + "learning_rate": 4.037019506237638e-05, + "logits/chosen": -1.9460779428482056, + "logits/rejected": -1.9403655529022217, + "logps/chosen": -203.6526336669922, + "logps/rejected": -222.22845458984375, + "loss": 0.7525, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.027491139248013496, + "rewards/margins": -0.014092395082116127, + "rewards/rejected": -0.013398736715316772, + "step": 734 + }, + { + "epoch": 0.96, + "learning_rate": 4.034192082610828e-05, + "logits/chosen": -1.9467103481292725, + "logits/rejected": -1.9498211145401, + "logps/chosen": -178.94808959960938, + "logps/rejected": -193.01995849609375, + "loss": 0.6873, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.14050908386707306, + "rewards/margins": 0.16484788060188293, + "rewards/rejected": -0.024338817223906517, + "step": 735 + }, + { + "epoch": 0.96, + "learning_rate": 4.031361507621911e-05, + "logits/chosen": -1.9716556072235107, + "logits/rejected": -1.9920234680175781, + "logps/chosen": -169.69122314453125, + "logps/rejected": -174.0656280517578, + "loss": 0.601, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.26089152693748474, + "rewards/margins": 0.31827694177627563, + "rewards/rejected": -0.057385385036468506, + "step": 736 + }, + { + "epoch": 0.96, + "learning_rate": 4.02852778708513e-05, + "logits/chosen": -1.8285408020019531, + "logits/rejected": -1.825709581375122, + "logps/chosen": -201.5704345703125, + "logps/rejected": -208.6463165283203, + "loss": 0.7082, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.05932292714715004, + "rewards/margins": 0.07191256433725357, + "rewards/rejected": -0.1312354952096939, + "step": 737 + }, + { + "epoch": 0.97, + "learning_rate": 4.0256909268211914e-05, + "logits/chosen": -1.866570234298706, + "logits/rejected": -1.9105130434036255, + "logps/chosen": -155.8884735107422, + "logps/rejected": -167.31063842773438, + "loss": 0.7262, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.06431854516267776, + "rewards/margins": 0.020435571670532227, + "rewards/rejected": 0.04388298839330673, + "step": 738 + }, + { + "epoch": 0.97, + "learning_rate": 4.0228509326572496e-05, + "logits/chosen": -1.8914620876312256, + "logits/rejected": -1.835383415222168, + "logps/chosen": -193.6123809814453, + "logps/rejected": -189.48651123046875, + "loss": 1.0269, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.6893608570098877, + "rewards/margins": -0.49458661675453186, + "rewards/rejected": -0.19477425515651703, + "step": 739 + }, + { + "epoch": 0.97, + "learning_rate": 4.0200078104268944e-05, + "logits/chosen": -2.1169674396514893, + "logits/rejected": -2.1131343841552734, + "logps/chosen": -197.66700744628906, + "logps/rejected": -203.12513732910156, + "loss": 0.6898, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.24454453587532043, + "rewards/margins": 0.23662899434566498, + "rewards/rejected": 0.007915541529655457, + "step": 740 + }, + { + "epoch": 0.97, + "learning_rate": 4.017161565970144e-05, + "logits/chosen": -1.946454405784607, + "logits/rejected": -1.9714114665985107, + "logps/chosen": -195.95687866210938, + "logps/rejected": -196.1630401611328, + "loss": 0.8647, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.28991246223449707, + "rewards/margins": -0.02519702911376953, + "rewards/rejected": -0.26471543312072754, + "step": 741 + }, + { + "epoch": 0.97, + "learning_rate": 4.014312205133428e-05, + "logits/chosen": -1.8742949962615967, + "logits/rejected": -1.8860889673233032, + "logps/chosen": -170.3537139892578, + "logps/rejected": -176.24937438964844, + "loss": 0.5796, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.2740973234176636, + "rewards/margins": 0.3400841951370239, + "rewards/rejected": -0.06598688662052155, + "step": 742 + }, + { + "epoch": 0.97, + "learning_rate": 4.011459733769579e-05, + "logits/chosen": -1.8638414144515991, + "logits/rejected": -1.8556957244873047, + "logps/chosen": -219.03761291503906, + "logps/rejected": -227.22244262695312, + "loss": 0.8823, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.4476678967475891, + "rewards/margins": -0.2778720259666443, + "rewards/rejected": -0.16979584097862244, + "step": 743 + }, + { + "epoch": 0.97, + "learning_rate": 4.0086041577378166e-05, + "logits/chosen": -1.8305257558822632, + "logits/rejected": -1.7608011960983276, + "logps/chosen": -187.88043212890625, + "logps/rejected": -169.92657470703125, + "loss": 0.7201, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.090822733938694, + "rewards/margins": 0.07794052362442017, + "rewards/rejected": -0.16876326501369476, + "step": 744 + }, + { + "epoch": 0.97, + "learning_rate": 4.005745482903739e-05, + "logits/chosen": -1.8173670768737793, + "logits/rejected": -1.7855581045150757, + "logps/chosen": -179.5590362548828, + "logps/rejected": -166.4422607421875, + "loss": 0.6341, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.17833015322685242, + "rewards/margins": 0.2266974300146103, + "rewards/rejected": -0.04836726933717728, + "step": 745 + }, + { + "epoch": 0.98, + "learning_rate": 4.002883715139309e-05, + "logits/chosen": -1.7349971532821655, + "logits/rejected": -1.7977635860443115, + "logps/chosen": -170.27752685546875, + "logps/rejected": -167.193603515625, + "loss": 0.7021, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.12941241264343262, + "rewards/margins": 0.05854359269142151, + "rewards/rejected": -0.1879560351371765, + "step": 746 + }, + { + "epoch": 0.98, + "learning_rate": 4.000018860322845e-05, + "logits/chosen": -1.7580136060714722, + "logits/rejected": -1.7533882856369019, + "logps/chosen": -178.30084228515625, + "logps/rejected": -163.27293395996094, + "loss": 0.6128, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03221731632947922, + "rewards/margins": 0.30581218004226685, + "rewards/rejected": -0.2735949158668518, + "step": 747 + }, + { + "epoch": 0.98, + "learning_rate": 3.9971509243390025e-05, + "logits/chosen": -1.8093492984771729, + "logits/rejected": -1.8199714422225952, + "logps/chosen": -144.11441040039062, + "logps/rejected": -162.16128540039062, + "loss": 0.723, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.020870715379714966, + "rewards/margins": 0.024553870782256126, + "rewards/rejected": -0.0036831647157669067, + "step": 748 + }, + { + "epoch": 0.98, + "learning_rate": 3.99427991307877e-05, + "logits/chosen": -1.867729902267456, + "logits/rejected": -1.8255057334899902, + "logps/chosen": -183.21380615234375, + "logps/rejected": -181.64158630371094, + "loss": 0.7334, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.21690025925636292, + "rewards/margins": 0.19758300483226776, + "rewards/rejected": 0.01931723952293396, + "step": 749 + }, + { + "epoch": 0.98, + "learning_rate": 3.9914058324394486e-05, + "logits/chosen": -1.9604843854904175, + "logits/rejected": -2.0102388858795166, + "logps/chosen": -140.79994201660156, + "logps/rejected": -147.5875244140625, + "loss": 0.7198, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.15232627093791962, + "rewards/margins": 0.18342937529087067, + "rewards/rejected": -0.03110312670469284, + "step": 750 + }, + { + "epoch": 0.98, + "learning_rate": 3.9885286883246476e-05, + "logits/chosen": -1.7222356796264648, + "logits/rejected": -1.70112144947052, + "logps/chosen": -147.16017150878906, + "logps/rejected": -154.9020233154297, + "loss": 0.92, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.529461145401001, + "rewards/margins": -0.19158104062080383, + "rewards/rejected": 0.7210422158241272, + "step": 751 + }, + { + "epoch": 0.98, + "learning_rate": 3.985648486644267e-05, + "logits/chosen": -1.743016004562378, + "logits/rejected": -1.6408265829086304, + "logps/chosen": -236.60455322265625, + "logps/rejected": -226.41018676757812, + "loss": 0.7036, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.3873446583747864, + "rewards/margins": 0.05065695941448212, + "rewards/rejected": -0.4380015730857849, + "step": 752 + }, + { + "epoch": 0.99, + "learning_rate": 3.982765233314489e-05, + "logits/chosen": -1.9756314754486084, + "logits/rejected": -1.9660886526107788, + "logps/chosen": -179.02066040039062, + "logps/rejected": -178.9298858642578, + "loss": 0.832, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.08823183178901672, + "rewards/margins": -0.16290561854839325, + "rewards/rejected": 0.25113746523857117, + "step": 753 + }, + { + "epoch": 0.99, + "learning_rate": 3.979878934257762e-05, + "logits/chosen": -1.6876684427261353, + "logits/rejected": -1.738308310508728, + "logps/chosen": -284.55389404296875, + "logps/rejected": -267.27178955078125, + "loss": 0.8761, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.01247774064540863, + "rewards/margins": -0.1615845113992691, + "rewards/rejected": 0.17406225204467773, + "step": 754 + }, + { + "epoch": 0.99, + "learning_rate": 3.976989595402793e-05, + "logits/chosen": -1.7371678352355957, + "logits/rejected": -1.767361044883728, + "logps/chosen": -166.92442321777344, + "logps/rejected": -175.44528198242188, + "loss": 0.7914, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.061636172235012054, + "rewards/margins": -0.010894455015659332, + "rewards/rejected": -0.050741732120513916, + "step": 755 + }, + { + "epoch": 0.99, + "learning_rate": 3.974097222684532e-05, + "logits/chosen": -1.9828624725341797, + "logits/rejected": -1.9450846910476685, + "logps/chosen": -182.8007049560547, + "logps/rejected": -179.33963012695312, + "loss": 0.8109, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.20281130075454712, + "rewards/margins": -0.025454670190811157, + "rewards/rejected": 0.22826597094535828, + "step": 756 + }, + { + "epoch": 0.99, + "learning_rate": 3.9712018220441596e-05, + "logits/chosen": -2.006014585494995, + "logits/rejected": -2.0298256874084473, + "logps/chosen": -163.69482421875, + "logps/rejected": -163.8912353515625, + "loss": 0.7536, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.13053151965141296, + "rewards/margins": -0.029842479154467583, + "rewards/rejected": -0.10068905353546143, + "step": 757 + }, + { + "epoch": 0.99, + "learning_rate": 3.9683033994290767e-05, + "logits/chosen": -1.5359764099121094, + "logits/rejected": -1.5906102657318115, + "logps/chosen": -176.468505859375, + "logps/rejected": -200.26760864257812, + "loss": 0.7742, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.018672414124011993, + "rewards/margins": 0.06273224204778671, + "rewards/rejected": -0.08140464872121811, + "step": 758 + }, + { + "epoch": 0.99, + "learning_rate": 3.965401960792894e-05, + "logits/chosen": -1.7277990579605103, + "logits/rejected": -1.7565523386001587, + "logps/chosen": -180.8856658935547, + "logps/rejected": -178.6083526611328, + "loss": 0.742, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.007134014740586281, + "rewards/margins": 0.06274452805519104, + "rewards/rejected": -0.06987853348255157, + "step": 759 + }, + { + "epoch": 0.99, + "learning_rate": 3.962497512095412e-05, + "logits/chosen": -1.7388745546340942, + "logits/rejected": -1.780763864517212, + "logps/chosen": -193.02093505859375, + "logps/rejected": -216.36962890625, + "loss": 0.8278, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.12493382394313812, + "rewards/margins": -0.1711753010749817, + "rewards/rejected": 0.296109139919281, + "step": 760 + }, + { + "epoch": 1.0, + "learning_rate": 3.95959005930262e-05, + "logits/chosen": -1.7948150634765625, + "logits/rejected": -1.85101318359375, + "logps/chosen": -229.8053741455078, + "logps/rejected": -240.33815002441406, + "loss": 0.6705, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.19152618944644928, + "rewards/margins": 0.1523137390613556, + "rewards/rejected": 0.03921244665980339, + "step": 761 + }, + { + "epoch": 1.0, + "learning_rate": 3.9566796083866756e-05, + "logits/chosen": -1.626505732536316, + "logits/rejected": -1.6348035335540771, + "logps/chosen": -146.9053955078125, + "logps/rejected": -159.9919891357422, + "loss": 0.784, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.29515981674194336, + "rewards/margins": 0.03867659345269203, + "rewards/rejected": 0.2564832270145416, + "step": 762 + }, + { + "epoch": 1.0, + "learning_rate": 3.953766165325892e-05, + "logits/chosen": -1.582480549812317, + "logits/rejected": -1.5091769695281982, + "logps/chosen": -175.38235473632812, + "logps/rejected": -168.53660583496094, + "loss": 0.972, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.4482717514038086, + "rewards/margins": -0.27672773599624634, + "rewards/rejected": -0.17154404520988464, + "step": 763 + }, + { + "epoch": 1.0, + "learning_rate": 3.9508497361047334e-05, + "logits/chosen": -1.8544284105300903, + "logits/rejected": -1.8667871952056885, + "logps/chosen": -173.8372039794922, + "logps/rejected": -182.494873046875, + "loss": 0.6637, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.09536217153072357, + "rewards/margins": 0.11863154172897339, + "rewards/rejected": -0.023269355297088623, + "step": 764 + }, + { + "epoch": 1.0, + "learning_rate": 3.9479303267137944e-05, + "logits/chosen": -1.843505859375, + "logits/rejected": -1.9012435674667358, + "logps/chosen": -136.9263458251953, + "logps/rejected": -189.14739990234375, + "loss": 0.167, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.681319236755371, + "rewards/margins": 4.289565086364746, + "rewards/rejected": -2.608245849609375, + "step": 765 + }, + { + "epoch": 1.0, + "learning_rate": 3.9450079431497936e-05, + "logits/chosen": -1.6857999563217163, + "logits/rejected": -1.6979467868804932, + "logps/chosen": -242.67828369140625, + "logps/rejected": -301.3565673828125, + "loss": 0.0933, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9228906631469727, + "rewards/margins": 6.939729690551758, + "rewards/rejected": -4.016839027404785, + "step": 766 + }, + { + "epoch": 1.0, + "learning_rate": 3.9420825914155554e-05, + "logits/chosen": -1.873047113418579, + "logits/rejected": -1.9470162391662598, + "logps/chosen": -154.30426025390625, + "logps/rejected": -221.78143310546875, + "loss": 0.0623, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0256850719451904, + "rewards/margins": 6.026838779449463, + "rewards/rejected": -3.0011534690856934, + "step": 767 + }, + { + "epoch": 1.01, + "learning_rate": 3.939154277520006e-05, + "logits/chosen": -1.6709171533584595, + "logits/rejected": -1.6833773851394653, + "logps/chosen": -145.9373321533203, + "logps/rejected": -248.9676055908203, + "loss": 0.0135, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.266024351119995, + "rewards/margins": 8.912620544433594, + "rewards/rejected": -5.6465959548950195, + "step": 768 + }, + { + "epoch": 1.01, + "learning_rate": 3.9362230074781506e-05, + "logits/chosen": -1.9885060787200928, + "logits/rejected": -1.9463064670562744, + "logps/chosen": -152.9366912841797, + "logps/rejected": -194.1561279296875, + "loss": 0.062, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.965550422668457, + "rewards/margins": 6.187473773956299, + "rewards/rejected": -3.2219231128692627, + "step": 769 + }, + { + "epoch": 1.01, + "learning_rate": 3.9332887873110695e-05, + "logits/chosen": -1.4813066720962524, + "logits/rejected": -1.4207929372787476, + "logps/chosen": -132.97509765625, + "logps/rejected": -212.00186157226562, + "loss": 0.0931, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.51338791847229, + "rewards/margins": 6.154384613037109, + "rewards/rejected": -3.6409966945648193, + "step": 770 + }, + { + "epoch": 1.01, + "learning_rate": 3.9303516230459035e-05, + "logits/chosen": -1.8454346656799316, + "logits/rejected": -1.9101213216781616, + "logps/chosen": -173.029296875, + "logps/rejected": -259.5926513671875, + "loss": 0.0209, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.007847785949707, + "rewards/margins": 7.848160266876221, + "rewards/rejected": -4.840312480926514, + "step": 771 + }, + { + "epoch": 1.01, + "learning_rate": 3.92741152071584e-05, + "logits/chosen": -1.8387994766235352, + "logits/rejected": -1.8383971452713013, + "logps/chosen": -146.1198272705078, + "logps/rejected": -220.85960388183594, + "loss": 0.0495, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8462674617767334, + "rewards/margins": 7.253849983215332, + "rewards/rejected": -4.4075822830200195, + "step": 772 + }, + { + "epoch": 1.01, + "learning_rate": 3.924468486360101e-05, + "logits/chosen": -1.62917160987854, + "logits/rejected": -1.645263433456421, + "logps/chosen": -129.12579345703125, + "logps/rejected": -185.53977966308594, + "loss": 0.2783, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.6398133039474487, + "rewards/margins": 5.0858635902404785, + "rewards/rejected": -3.4460501670837402, + "step": 773 + }, + { + "epoch": 1.01, + "learning_rate": 3.921522526023931e-05, + "logits/chosen": -1.7776762247085571, + "logits/rejected": -1.79482901096344, + "logps/chosen": -149.4893798828125, + "logps/rejected": -202.3997802734375, + "loss": 0.126, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.4249045848846436, + "rewards/margins": 5.789612770080566, + "rewards/rejected": -3.364708423614502, + "step": 774 + }, + { + "epoch": 1.01, + "learning_rate": 3.918573645758586e-05, + "logits/chosen": -1.8283805847167969, + "logits/rejected": -1.739791750907898, + "logps/chosen": -144.54391479492188, + "logps/rejected": -221.6058807373047, + "loss": 0.0359, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.915061593055725, + "rewards/margins": 5.494343280792236, + "rewards/rejected": -3.579281806945801, + "step": 775 + }, + { + "epoch": 1.02, + "learning_rate": 3.915621851621318e-05, + "logits/chosen": -1.9749910831451416, + "logits/rejected": -2.0017504692077637, + "logps/chosen": -134.46275329589844, + "logps/rejected": -218.1602783203125, + "loss": 0.048, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.595043420791626, + "rewards/margins": 7.404112815856934, + "rewards/rejected": -4.809070110321045, + "step": 776 + }, + { + "epoch": 1.02, + "learning_rate": 3.9126671496753666e-05, + "logits/chosen": -1.9023005962371826, + "logits/rejected": -1.9459162950515747, + "logps/chosen": -148.63253784179688, + "logps/rejected": -228.6715545654297, + "loss": 0.0137, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.283623218536377, + "rewards/margins": 6.3677473068237305, + "rewards/rejected": -4.084123611450195, + "step": 777 + }, + { + "epoch": 1.02, + "learning_rate": 3.909709545989942e-05, + "logits/chosen": -1.82866370677948, + "logits/rejected": -1.8816454410552979, + "logps/chosen": -143.5281982421875, + "logps/rejected": -203.86846923828125, + "loss": 0.0826, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.4189865589141846, + "rewards/margins": 5.4766387939453125, + "rewards/rejected": -3.057651996612549, + "step": 778 + }, + { + "epoch": 1.02, + "learning_rate": 3.9067490466402156e-05, + "logits/chosen": -1.8777246475219727, + "logits/rejected": -1.951180338859558, + "logps/chosen": -141.29640197753906, + "logps/rejected": -220.64962768554688, + "loss": 0.0561, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0638809204101562, + "rewards/margins": 7.617124080657959, + "rewards/rejected": -4.553243160247803, + "step": 779 + }, + { + "epoch": 1.02, + "learning_rate": 3.903785657707307e-05, + "logits/chosen": -1.7680386304855347, + "logits/rejected": -1.7862606048583984, + "logps/chosen": -151.75965881347656, + "logps/rejected": -215.16712951660156, + "loss": 0.1213, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.1962571144104004, + "rewards/margins": 5.733034133911133, + "rewards/rejected": -4.536776542663574, + "step": 780 + }, + { + "epoch": 1.02, + "learning_rate": 3.9008193852782733e-05, + "logits/chosen": -1.6814169883728027, + "logits/rejected": -1.6905479431152344, + "logps/chosen": -151.2788848876953, + "logps/rejected": -212.93544006347656, + "loss": 0.0446, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4553009271621704, + "rewards/margins": 5.7619452476501465, + "rewards/rejected": -4.306644916534424, + "step": 781 + }, + { + "epoch": 1.02, + "learning_rate": 3.897850235446089e-05, + "logits/chosen": -2.067516565322876, + "logits/rejected": -2.084655284881592, + "logps/chosen": -184.94361877441406, + "logps/rejected": -231.28411865234375, + "loss": 0.1911, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.0862700939178467, + "rewards/margins": 5.273751258850098, + "rewards/rejected": -3.187481164932251, + "step": 782 + }, + { + "epoch": 1.02, + "learning_rate": 3.894878214309645e-05, + "logits/chosen": -1.7666475772857666, + "logits/rejected": -1.8602631092071533, + "logps/chosen": -147.2201385498047, + "logps/rejected": -206.15673828125, + "loss": 0.1023, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.5482131242752075, + "rewards/margins": 4.794281482696533, + "rewards/rejected": -3.246067762374878, + "step": 783 + }, + { + "epoch": 1.03, + "learning_rate": 3.8919033279737274e-05, + "logits/chosen": -1.707251787185669, + "logits/rejected": -1.7892178297042847, + "logps/chosen": -196.2216339111328, + "logps/rejected": -238.6273193359375, + "loss": 0.1521, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.7280199527740479, + "rewards/margins": 3.8712680339813232, + "rewards/rejected": -3.1432480812072754, + "step": 784 + }, + { + "epoch": 1.03, + "learning_rate": 3.888925582549006e-05, + "logits/chosen": -1.66917884349823, + "logits/rejected": -1.6865543127059937, + "logps/chosen": -141.897705078125, + "logps/rejected": -234.94375610351562, + "loss": 0.0569, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.608001708984375, + "rewards/margins": 7.768197059631348, + "rewards/rejected": -5.160194396972656, + "step": 785 + }, + { + "epoch": 1.03, + "learning_rate": 3.885944984152027e-05, + "logits/chosen": -1.6489955186843872, + "logits/rejected": -1.6364490985870361, + "logps/chosen": -177.00204467773438, + "logps/rejected": -221.19728088378906, + "loss": 0.098, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.609315276145935, + "rewards/margins": 6.382396697998047, + "rewards/rejected": -4.7730817794799805, + "step": 786 + }, + { + "epoch": 1.03, + "learning_rate": 3.882961538905194e-05, + "logits/chosen": -1.6952452659606934, + "logits/rejected": -1.7020906209945679, + "logps/chosen": -175.68885803222656, + "logps/rejected": -243.1931915283203, + "loss": 0.0879, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6182076930999756, + "rewards/margins": 7.586241722106934, + "rewards/rejected": -4.968033790588379, + "step": 787 + }, + { + "epoch": 1.03, + "learning_rate": 3.879975252936761e-05, + "logits/chosen": -1.8468316793441772, + "logits/rejected": -1.9614177942276, + "logps/chosen": -147.63035583496094, + "logps/rejected": -252.27645874023438, + "loss": 0.0086, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5491623878479004, + "rewards/margins": 8.667781829833984, + "rewards/rejected": -6.118618965148926, + "step": 788 + }, + { + "epoch": 1.03, + "learning_rate": 3.876986132380814e-05, + "logits/chosen": -1.6783697605133057, + "logits/rejected": -1.7172523736953735, + "logps/chosen": -186.01351928710938, + "logps/rejected": -243.29734802246094, + "loss": 0.0138, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7059178352355957, + "rewards/margins": 6.882784843444824, + "rewards/rejected": -5.17686653137207, + "step": 789 + }, + { + "epoch": 1.03, + "learning_rate": 3.8739941833772643e-05, + "logits/chosen": -2.0805482864379883, + "logits/rejected": -2.104659080505371, + "logps/chosen": -164.75535583496094, + "logps/rejected": -213.48208618164062, + "loss": 0.1263, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.5398253202438354, + "rewards/margins": 5.201066017150879, + "rewards/rejected": -3.6612401008605957, + "step": 790 + }, + { + "epoch": 1.04, + "learning_rate": 3.870999412071829e-05, + "logits/chosen": -1.733805537223816, + "logits/rejected": -1.6911817789077759, + "logps/chosen": -163.6641082763672, + "logps/rejected": -234.33090209960938, + "loss": 0.0406, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.18049693107605, + "rewards/margins": 8.514333724975586, + "rewards/rejected": -6.333837509155273, + "step": 791 + }, + { + "epoch": 1.04, + "learning_rate": 3.8680018246160295e-05, + "logits/chosen": -1.4504364728927612, + "logits/rejected": -1.5445796251296997, + "logps/chosen": -152.53546142578125, + "logps/rejected": -256.28460693359375, + "loss": 0.0555, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.3757332563400269, + "rewards/margins": 6.887189865112305, + "rewards/rejected": -5.51145601272583, + "step": 792 + }, + { + "epoch": 1.04, + "learning_rate": 3.865001427167164e-05, + "logits/chosen": -1.8626388311386108, + "logits/rejected": -1.9130809307098389, + "logps/chosen": -146.39051818847656, + "logps/rejected": -227.55630493164062, + "loss": 0.1054, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.8630967140197754, + "rewards/margins": 7.0272722244262695, + "rewards/rejected": -5.164175033569336, + "step": 793 + }, + { + "epoch": 1.04, + "learning_rate": 3.861998225888307e-05, + "logits/chosen": -1.7840837240219116, + "logits/rejected": -1.786619782447815, + "logps/chosen": -172.25775146484375, + "logps/rejected": -235.2631378173828, + "loss": 0.0602, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.5000873804092407, + "rewards/margins": 7.424620628356934, + "rewards/rejected": -5.924533367156982, + "step": 794 + }, + { + "epoch": 1.04, + "learning_rate": 3.8589922269482924e-05, + "logits/chosen": -1.9556963443756104, + "logits/rejected": -1.9700720310211182, + "logps/chosen": -145.4001922607422, + "logps/rejected": -225.95314025878906, + "loss": 0.0214, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9621433019638062, + "rewards/margins": 6.593455791473389, + "rewards/rejected": -4.631312370300293, + "step": 795 + }, + { + "epoch": 1.04, + "learning_rate": 3.855983436521699e-05, + "logits/chosen": -1.7366329431533813, + "logits/rejected": -1.733896017074585, + "logps/chosen": -178.80482482910156, + "logps/rejected": -243.79937744140625, + "loss": 0.0354, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5504860877990723, + "rewards/margins": 7.0473833084106445, + "rewards/rejected": -5.496897220611572, + "step": 796 + }, + { + "epoch": 1.04, + "learning_rate": 3.8529718607888394e-05, + "logits/chosen": -1.7065753936767578, + "logits/rejected": -1.742546796798706, + "logps/chosen": -148.2886199951172, + "logps/rejected": -208.7879638671875, + "loss": 0.0934, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.5418556928634644, + "rewards/margins": 6.03062629699707, + "rewards/rejected": -4.488770961761475, + "step": 797 + }, + { + "epoch": 1.04, + "learning_rate": 3.8499575059357506e-05, + "logits/chosen": -1.7552120685577393, + "logits/rejected": -1.6821340322494507, + "logps/chosen": -164.05032348632812, + "logps/rejected": -235.27076721191406, + "loss": 0.016, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.730198621749878, + "rewards/margins": 8.732093811035156, + "rewards/rejected": -6.001894950866699, + "step": 798 + }, + { + "epoch": 1.05, + "learning_rate": 3.8469403781541745e-05, + "logits/chosen": -1.770749807357788, + "logits/rejected": -1.7531272172927856, + "logps/chosen": -174.0284881591797, + "logps/rejected": -260.97369384765625, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5956072807312012, + "rewards/margins": 7.5574846267700195, + "rewards/rejected": -5.961877822875977, + "step": 799 + }, + { + "epoch": 1.05, + "learning_rate": 3.843920483641551e-05, + "logits/chosen": -2.1300318241119385, + "logits/rejected": -2.06135630607605, + "logps/chosen": -174.64796447753906, + "logps/rejected": -267.5289306640625, + "loss": 0.0777, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.753110647201538, + "rewards/margins": 8.441932678222656, + "rewards/rejected": -6.6888227462768555, + "step": 800 + }, + { + "epoch": 1.05, + "learning_rate": 3.840897828601002e-05, + "logits/chosen": -1.5714927911758423, + "logits/rejected": -1.5915296077728271, + "logps/chosen": -158.0545196533203, + "logps/rejected": -225.82086181640625, + "loss": 0.0912, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.809893250465393, + "rewards/margins": 7.726199626922607, + "rewards/rejected": -5.916306018829346, + "step": 801 + }, + { + "epoch": 1.05, + "learning_rate": 3.83787241924132e-05, + "logits/chosen": -2.058527946472168, + "logits/rejected": -2.106814384460449, + "logps/chosen": -155.36590576171875, + "logps/rejected": -222.94711303710938, + "loss": 0.0525, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4193453788757324, + "rewards/margins": 7.101151943206787, + "rewards/rejected": -5.681806564331055, + "step": 802 + }, + { + "epoch": 1.05, + "learning_rate": 3.8348442617769564e-05, + "logits/chosen": -1.699159860610962, + "logits/rejected": -1.7645118236541748, + "logps/chosen": -156.55690002441406, + "logps/rejected": -204.28575134277344, + "loss": 0.1464, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.8710487484931946, + "rewards/margins": 5.2086567878723145, + "rewards/rejected": -4.337608337402344, + "step": 803 + }, + { + "epoch": 1.05, + "learning_rate": 3.831813362428005e-05, + "logits/chosen": -1.7965739965438843, + "logits/rejected": -1.7984563112258911, + "logps/chosen": -142.91116333007812, + "logps/rejected": -249.2438201904297, + "loss": 0.0275, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7751538157463074, + "rewards/margins": 7.4659013748168945, + "rewards/rejected": -6.690747261047363, + "step": 804 + }, + { + "epoch": 1.05, + "learning_rate": 3.8287797274201934e-05, + "logits/chosen": -1.651024580001831, + "logits/rejected": -1.6581289768218994, + "logps/chosen": -175.62232971191406, + "logps/rejected": -262.756103515625, + "loss": 0.0982, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.8959269523620605, + "rewards/margins": 6.970417499542236, + "rewards/rejected": -6.074490070343018, + "step": 805 + }, + { + "epoch": 1.05, + "learning_rate": 3.825743362984868e-05, + "logits/chosen": -1.6261019706726074, + "logits/rejected": -1.6354936361312866, + "logps/chosen": -200.4067840576172, + "logps/rejected": -259.9723815917969, + "loss": 0.0891, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.0647060871124268, + "rewards/margins": 6.824378967285156, + "rewards/rejected": -5.759673118591309, + "step": 806 + }, + { + "epoch": 1.06, + "learning_rate": 3.8227042753589824e-05, + "logits/chosen": -1.8112773895263672, + "logits/rejected": -1.778592824935913, + "logps/chosen": -170.0568389892578, + "logps/rejected": -230.7504425048828, + "loss": 0.1387, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.11378135532140732, + "rewards/margins": 5.9576334953308105, + "rewards/rejected": -6.071414947509766, + "step": 807 + }, + { + "epoch": 1.06, + "learning_rate": 3.819662470785082e-05, + "logits/chosen": -1.573864221572876, + "logits/rejected": -1.6022815704345703, + "logps/chosen": -153.41400146484375, + "logps/rejected": -201.80267333984375, + "loss": 0.015, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.118287205696106, + "rewards/margins": 6.153818607330322, + "rewards/rejected": -5.035531044006348, + "step": 808 + }, + { + "epoch": 1.06, + "learning_rate": 3.816617955511296e-05, + "logits/chosen": -1.71113920211792, + "logits/rejected": -1.685309648513794, + "logps/chosen": -161.2007598876953, + "logps/rejected": -260.3743896484375, + "loss": 0.0872, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.445091724395752, + "rewards/margins": 8.42944049835205, + "rewards/rejected": -6.984348773956299, + "step": 809 + }, + { + "epoch": 1.06, + "learning_rate": 3.8135707357913176e-05, + "logits/chosen": -1.684647560119629, + "logits/rejected": -1.7264988422393799, + "logps/chosen": -161.3009033203125, + "logps/rejected": -287.2543640136719, + "loss": 0.0166, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.751045823097229, + "rewards/margins": 10.769046783447266, + "rewards/rejected": -9.018001556396484, + "step": 810 + }, + { + "epoch": 1.06, + "learning_rate": 3.8105208178843984e-05, + "logits/chosen": -1.7524151802062988, + "logits/rejected": -1.7648451328277588, + "logps/chosen": -156.43142700195312, + "logps/rejected": -232.1067352294922, + "loss": 0.0499, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.329323649406433, + "rewards/margins": 8.172022819519043, + "rewards/rejected": -6.8426995277404785, + "step": 811 + }, + { + "epoch": 1.06, + "learning_rate": 3.8074682080553335e-05, + "logits/chosen": -1.649200677871704, + "logits/rejected": -1.6988123655319214, + "logps/chosen": -150.70803833007812, + "logps/rejected": -230.33238220214844, + "loss": 0.0699, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6994099020957947, + "rewards/margins": 7.033271789550781, + "rewards/rejected": -6.3338623046875, + "step": 812 + }, + { + "epoch": 1.06, + "learning_rate": 3.804412912574442e-05, + "logits/chosen": -1.7932782173156738, + "logits/rejected": -1.7817943096160889, + "logps/chosen": -167.56292724609375, + "logps/rejected": -223.35543823242188, + "loss": 0.0896, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.1120498180389404, + "rewards/margins": 6.955061435699463, + "rewards/rejected": -5.843011856079102, + "step": 813 + }, + { + "epoch": 1.07, + "learning_rate": 3.801354937717565e-05, + "logits/chosen": -1.7469156980514526, + "logits/rejected": -1.7623285055160522, + "logps/chosen": -246.330322265625, + "logps/rejected": -311.6409912109375, + "loss": 0.1693, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7100722789764404, + "rewards/margins": 7.019688606262207, + "rewards/rejected": -7.729760646820068, + "step": 814 + }, + { + "epoch": 1.07, + "learning_rate": 3.798294289766043e-05, + "logits/chosen": -1.4315271377563477, + "logits/rejected": -1.3901729583740234, + "logps/chosen": -174.4903564453125, + "logps/rejected": -273.9493713378906, + "loss": 0.0574, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.452012300491333, + "rewards/margins": 8.629990577697754, + "rewards/rejected": -7.177978515625, + "step": 815 + }, + { + "epoch": 1.07, + "learning_rate": 3.795230975006712e-05, + "logits/chosen": -1.807803988456726, + "logits/rejected": -1.8593116998672485, + "logps/chosen": -159.69613647460938, + "logps/rejected": -266.27716064453125, + "loss": 0.0968, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.7878644466400146, + "rewards/margins": 7.8176188468933105, + "rewards/rejected": -7.029754638671875, + "step": 816 + }, + { + "epoch": 1.07, + "learning_rate": 3.792164999731881e-05, + "logits/chosen": -1.9901769161224365, + "logits/rejected": -1.984059453010559, + "logps/chosen": -150.95579528808594, + "logps/rejected": -220.51315307617188, + "loss": 0.114, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.49705514311790466, + "rewards/margins": 6.722858428955078, + "rewards/rejected": -6.225803852081299, + "step": 817 + }, + { + "epoch": 1.07, + "learning_rate": 3.789096370239328e-05, + "logits/chosen": -1.8745883703231812, + "logits/rejected": -1.872071623802185, + "logps/chosen": -186.24365234375, + "logps/rejected": -262.4015197753906, + "loss": 0.0995, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6306481957435608, + "rewards/margins": 7.7143330574035645, + "rewards/rejected": -7.08368444442749, + "step": 818 + }, + { + "epoch": 1.07, + "learning_rate": 3.786025092832279e-05, + "logits/chosen": -1.7694121599197388, + "logits/rejected": -1.7589280605316162, + "logps/chosen": -172.3319854736328, + "logps/rejected": -252.0114288330078, + "loss": 0.0458, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7360559105873108, + "rewards/margins": 8.137409210205078, + "rewards/rejected": -7.401352882385254, + "step": 819 + }, + { + "epoch": 1.07, + "learning_rate": 3.782951173819403e-05, + "logits/chosen": -1.6934165954589844, + "logits/rejected": -1.7441664934158325, + "logps/chosen": -191.2655487060547, + "logps/rejected": -317.37506103515625, + "loss": 0.0627, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.24915921688079834, + "rewards/margins": 6.986499309539795, + "rewards/rejected": -6.737339973449707, + "step": 820 + }, + { + "epoch": 1.07, + "learning_rate": 3.7798746195147914e-05, + "logits/chosen": -1.7119529247283936, + "logits/rejected": -1.7413674592971802, + "logps/chosen": -246.66983032226562, + "logps/rejected": -347.75433349609375, + "loss": 0.092, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6458020806312561, + "rewards/margins": 7.750561714172363, + "rewards/rejected": -7.1047587394714355, + "step": 821 + }, + { + "epoch": 1.08, + "learning_rate": 3.776795436237954e-05, + "logits/chosen": -1.7255234718322754, + "logits/rejected": -1.691392421722412, + "logps/chosen": -167.06256103515625, + "logps/rejected": -253.3814697265625, + "loss": 0.1163, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.1990185976028442, + "rewards/margins": 8.745574951171875, + "rewards/rejected": -7.546555995941162, + "step": 822 + }, + { + "epoch": 1.08, + "learning_rate": 3.773713630313793e-05, + "logits/chosen": -1.5812559127807617, + "logits/rejected": -1.50252366065979, + "logps/chosen": -169.1471405029297, + "logps/rejected": -258.7736511230469, + "loss": 0.0359, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11404211819171906, + "rewards/margins": 6.661553382873535, + "rewards/rejected": -6.547510623931885, + "step": 823 + }, + { + "epoch": 1.08, + "learning_rate": 3.7706292080726055e-05, + "logits/chosen": -1.608155608177185, + "logits/rejected": -1.5912903547286987, + "logps/chosen": -147.78099060058594, + "logps/rejected": -269.6217346191406, + "loss": 0.0257, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0774881839752197, + "rewards/margins": 8.897539138793945, + "rewards/rejected": -7.820050239562988, + "step": 824 + }, + { + "epoch": 1.08, + "learning_rate": 3.767542175850058e-05, + "logits/chosen": -1.8905123472213745, + "logits/rejected": -1.8191280364990234, + "logps/chosen": -143.8882293701172, + "logps/rejected": -216.68756103515625, + "loss": 0.1049, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.0501607656478882, + "rewards/margins": 6.524418830871582, + "rewards/rejected": -5.474257946014404, + "step": 825 + }, + { + "epoch": 1.08, + "learning_rate": 3.764452539987179e-05, + "logits/chosen": -1.5478218793869019, + "logits/rejected": -1.5789484977722168, + "logps/chosen": -240.63043212890625, + "logps/rejected": -349.4985656738281, + "loss": 0.0549, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.48741525411605835, + "rewards/margins": 9.919611930847168, + "rewards/rejected": -9.432197570800781, + "step": 826 + }, + { + "epoch": 1.08, + "learning_rate": 3.761360306830345e-05, + "logits/chosen": -1.500814437866211, + "logits/rejected": -1.4689234495162964, + "logps/chosen": -267.2929382324219, + "logps/rejected": -361.5623779296875, + "loss": 0.047, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.6333886981010437, + "rewards/margins": 8.350789070129395, + "rewards/rejected": -7.717400074005127, + "step": 827 + }, + { + "epoch": 1.08, + "learning_rate": 3.75826548273127e-05, + "logits/chosen": -1.8496098518371582, + "logits/rejected": -1.8652946949005127, + "logps/chosen": -166.85903930664062, + "logps/rejected": -254.02622985839844, + "loss": 0.0157, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2260518074035645, + "rewards/margins": 9.077446937561035, + "rewards/rejected": -7.851395606994629, + "step": 828 + }, + { + "epoch": 1.08, + "learning_rate": 3.7551680740469874e-05, + "logits/chosen": -1.8195977210998535, + "logits/rejected": -1.8940608501434326, + "logps/chosen": -184.0416259765625, + "logps/rejected": -289.5961608886719, + "loss": 0.1249, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.4078165292739868, + "rewards/margins": 7.792662143707275, + "rewards/rejected": -7.384845733642578, + "step": 829 + }, + { + "epoch": 1.09, + "learning_rate": 3.752068087139839e-05, + "logits/chosen": -1.6744965314865112, + "logits/rejected": -1.56267511844635, + "logps/chosen": -222.7291259765625, + "logps/rejected": -262.7376403808594, + "loss": 0.1741, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.7223973274230957, + "rewards/margins": 7.620642185211182, + "rewards/rejected": -6.898245811462402, + "step": 830 + }, + { + "epoch": 1.09, + "learning_rate": 3.7489655283774657e-05, + "logits/chosen": -1.6755584478378296, + "logits/rejected": -1.7019976377487183, + "logps/chosen": -192.16610717773438, + "logps/rejected": -266.71337890625, + "loss": 0.0219, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2536769509315491, + "rewards/margins": 6.898894309997559, + "rewards/rejected": -6.6452178955078125, + "step": 831 + }, + { + "epoch": 1.09, + "learning_rate": 3.7458604041327874e-05, + "logits/chosen": -1.4574893712997437, + "logits/rejected": -1.4942587614059448, + "logps/chosen": -206.4982147216797, + "logps/rejected": -312.23504638671875, + "loss": 0.0454, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2815260887145996, + "rewards/margins": 8.590280532836914, + "rewards/rejected": -7.3087544441223145, + "step": 832 + }, + { + "epoch": 1.09, + "learning_rate": 3.742752720783997e-05, + "logits/chosen": -1.6686670780181885, + "logits/rejected": -1.7064058780670166, + "logps/chosen": -193.9601593017578, + "logps/rejected": -258.1878356933594, + "loss": 0.0769, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.1386452317237854, + "rewards/margins": 6.448615074157715, + "rewards/rejected": -6.309969425201416, + "step": 833 + }, + { + "epoch": 1.09, + "learning_rate": 3.7396424847145425e-05, + "logits/chosen": -1.814921259880066, + "logits/rejected": -1.9397302865982056, + "logps/chosen": -136.74314880371094, + "logps/rejected": -240.24948120117188, + "loss": 0.1083, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.41532331705093384, + "rewards/margins": 7.389758110046387, + "rewards/rejected": -6.974433898925781, + "step": 834 + }, + { + "epoch": 1.09, + "learning_rate": 3.736529702313114e-05, + "logits/chosen": -1.766431450843811, + "logits/rejected": -1.7151589393615723, + "logps/chosen": -184.22366333007812, + "logps/rejected": -266.4801025390625, + "loss": 0.0664, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2503972053527832, + "rewards/margins": 6.810416221618652, + "rewards/rejected": -6.560018062591553, + "step": 835 + }, + { + "epoch": 1.09, + "learning_rate": 3.733414379973635e-05, + "logits/chosen": -1.8444455862045288, + "logits/rejected": -1.8781747817993164, + "logps/chosen": -158.47128295898438, + "logps/rejected": -253.3724365234375, + "loss": 0.046, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.781490683555603, + "rewards/margins": 8.908397674560547, + "rewards/rejected": -8.126907348632812, + "step": 836 + }, + { + "epoch": 1.1, + "learning_rate": 3.730296524095245e-05, + "logits/chosen": -1.782663345336914, + "logits/rejected": -1.8073902130126953, + "logps/chosen": -198.73843383789062, + "logps/rejected": -277.1875, + "loss": 0.0074, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.417394995689392, + "rewards/margins": 9.88292407989502, + "rewards/rejected": -8.465529441833496, + "step": 837 + }, + { + "epoch": 1.1, + "learning_rate": 3.7271761410822856e-05, + "logits/chosen": -1.6727392673492432, + "logits/rejected": -1.7842079401016235, + "logps/chosen": -165.4208984375, + "logps/rejected": -247.4107666015625, + "loss": 0.0173, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.052548423409461975, + "rewards/margins": 6.773774147033691, + "rewards/rejected": -6.721225738525391, + "step": 838 + }, + { + "epoch": 1.1, + "learning_rate": 3.724053237344294e-05, + "logits/chosen": -1.7459189891815186, + "logits/rejected": -1.7782320976257324, + "logps/chosen": -160.22817993164062, + "logps/rejected": -229.1803741455078, + "loss": 0.076, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.19302824139595032, + "rewards/margins": 6.072666168212891, + "rewards/rejected": -5.879638671875, + "step": 839 + }, + { + "epoch": 1.1, + "learning_rate": 3.720927819295979e-05, + "logits/chosen": -1.5151444673538208, + "logits/rejected": -1.4409478902816772, + "logps/chosen": -173.93408203125, + "logps/rejected": -244.9287567138672, + "loss": 0.0139, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.37196123600006104, + "rewards/margins": 7.8127875328063965, + "rewards/rejected": -7.440826892852783, + "step": 840 + }, + { + "epoch": 1.1, + "learning_rate": 3.7177998933572186e-05, + "logits/chosen": -1.7388434410095215, + "logits/rejected": -1.7487221956253052, + "logps/chosen": -167.10128784179688, + "logps/rejected": -233.32119750976562, + "loss": 0.045, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.59880131483078, + "rewards/margins": 6.784365177154541, + "rewards/rejected": -6.185564041137695, + "step": 841 + }, + { + "epoch": 1.1, + "learning_rate": 3.7146694659530425e-05, + "logits/chosen": -1.613723874092102, + "logits/rejected": -1.5765931606292725, + "logps/chosen": -188.1655731201172, + "logps/rejected": -248.87420654296875, + "loss": 0.0793, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.5173835754394531, + "rewards/margins": 7.769992828369141, + "rewards/rejected": -7.252608776092529, + "step": 842 + }, + { + "epoch": 1.1, + "learning_rate": 3.711536543513614e-05, + "logits/chosen": -1.7683793306350708, + "logits/rejected": -1.7448205947875977, + "logps/chosen": -186.20999145507812, + "logps/rejected": -290.3359375, + "loss": 0.0719, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.24317941069602966, + "rewards/margins": 8.628244400024414, + "rewards/rejected": -8.385065078735352, + "step": 843 + }, + { + "epoch": 1.1, + "learning_rate": 3.708401132474228e-05, + "logits/chosen": -1.6240133047103882, + "logits/rejected": -1.5917489528656006, + "logps/chosen": -176.8642578125, + "logps/rejected": -245.3429718017578, + "loss": 0.092, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.23126250505447388, + "rewards/margins": 7.286675453186035, + "rewards/rejected": -7.055412292480469, + "step": 844 + }, + { + "epoch": 1.11, + "learning_rate": 3.705263239275284e-05, + "logits/chosen": -1.6431668996810913, + "logits/rejected": -1.6729328632354736, + "logps/chosen": -224.10528564453125, + "logps/rejected": -291.15435791015625, + "loss": 0.0575, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7494313716888428, + "rewards/margins": 5.913324356079102, + "rewards/rejected": -6.662755966186523, + "step": 845 + }, + { + "epoch": 1.11, + "learning_rate": 3.702122870362286e-05, + "logits/chosen": -1.8470516204833984, + "logits/rejected": -1.9361313581466675, + "logps/chosen": -202.118408203125, + "logps/rejected": -262.67987060546875, + "loss": 0.0793, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.422929048538208, + "rewards/margins": 6.7612433433532715, + "rewards/rejected": -6.338314056396484, + "step": 846 + }, + { + "epoch": 1.11, + "learning_rate": 3.698980032185821e-05, + "logits/chosen": -1.6243702173233032, + "logits/rejected": -1.5518081188201904, + "logps/chosen": -156.64627075195312, + "logps/rejected": -276.55133056640625, + "loss": 0.0379, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5403996706008911, + "rewards/margins": 9.535897254943848, + "rewards/rejected": -8.995497703552246, + "step": 847 + }, + { + "epoch": 1.11, + "learning_rate": 3.695834731201548e-05, + "logits/chosen": -1.706137776374817, + "logits/rejected": -1.659182071685791, + "logps/chosen": -195.2234344482422, + "logps/rejected": -257.5596008300781, + "loss": 0.2016, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.3023313879966736, + "rewards/margins": 5.863803863525391, + "rewards/rejected": -5.5614728927612305, + "step": 848 + }, + { + "epoch": 1.11, + "learning_rate": 3.692686973870184e-05, + "logits/chosen": -1.8808422088623047, + "logits/rejected": -1.8596181869506836, + "logps/chosen": -158.5035858154297, + "logps/rejected": -200.2980499267578, + "loss": 0.1072, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.9980390667915344, + "rewards/margins": 5.744872570037842, + "rewards/rejected": -4.746833324432373, + "step": 849 + }, + { + "epoch": 1.11, + "learning_rate": 3.689536766657494e-05, + "logits/chosen": -1.5721148252487183, + "logits/rejected": -1.6410491466522217, + "logps/chosen": -168.682861328125, + "logps/rejected": -284.8531799316406, + "loss": 0.0575, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.6643502116203308, + "rewards/margins": 9.553466796875, + "rewards/rejected": -8.889117240905762, + "step": 850 + }, + { + "epoch": 1.11, + "learning_rate": 3.6863841160342723e-05, + "logits/chosen": -1.563720703125, + "logits/rejected": -1.6035958528518677, + "logps/chosen": -153.61029052734375, + "logps/rejected": -225.42413330078125, + "loss": 0.0756, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.4325341582298279, + "rewards/margins": 7.381722450256348, + "rewards/rejected": -6.949188709259033, + "step": 851 + }, + { + "epoch": 1.12, + "learning_rate": 3.683229028476334e-05, + "logits/chosen": -1.7827197313308716, + "logits/rejected": -1.882027268409729, + "logps/chosen": -162.43850708007812, + "logps/rejected": -275.7604064941406, + "loss": 0.0148, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3920907974243164, + "rewards/margins": 10.132492065429688, + "rewards/rejected": -8.740402221679688, + "step": 852 + }, + { + "epoch": 1.12, + "learning_rate": 3.6800715104645e-05, + "logits/chosen": -1.7220182418823242, + "logits/rejected": -1.7189013957977295, + "logps/chosen": -140.41427612304688, + "logps/rejected": -214.07586669921875, + "loss": 0.0604, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.6028104424476624, + "rewards/margins": 6.747117042541504, + "rewards/rejected": -6.1443071365356445, + "step": 853 + }, + { + "epoch": 1.12, + "learning_rate": 3.676911568484583e-05, + "logits/chosen": -1.8405961990356445, + "logits/rejected": -1.800131916999817, + "logps/chosen": -209.40870666503906, + "logps/rejected": -268.63397216796875, + "loss": 0.0353, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5028942227363586, + "rewards/margins": 7.966097354888916, + "rewards/rejected": -7.463202476501465, + "step": 854 + }, + { + "epoch": 1.12, + "learning_rate": 3.673749209027375e-05, + "logits/chosen": -1.9057409763336182, + "logits/rejected": -1.9989405870437622, + "logps/chosen": -134.8566436767578, + "logps/rejected": -231.26950073242188, + "loss": 0.141, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4298328161239624, + "rewards/margins": 7.505517482757568, + "rewards/rejected": -7.075685977935791, + "step": 855 + }, + { + "epoch": 1.12, + "learning_rate": 3.6705844385886334e-05, + "logits/chosen": -1.86636483669281, + "logits/rejected": -1.9175128936767578, + "logps/chosen": -142.00611877441406, + "logps/rejected": -224.15219116210938, + "loss": 0.1332, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.33483776450157166, + "rewards/margins": 6.456839561462402, + "rewards/rejected": -6.122001647949219, + "step": 856 + }, + { + "epoch": 1.12, + "learning_rate": 3.667417263669068e-05, + "logits/chosen": -1.9074723720550537, + "logits/rejected": -1.922853946685791, + "logps/chosen": -153.0294189453125, + "logps/rejected": -247.6094970703125, + "loss": 0.0122, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6480506658554077, + "rewards/margins": 8.770407676696777, + "rewards/rejected": -8.122356414794922, + "step": 857 + }, + { + "epoch": 1.12, + "learning_rate": 3.6642476907743276e-05, + "logits/chosen": -1.8091105222702026, + "logits/rejected": -1.8391491174697876, + "logps/chosen": -173.12405395507812, + "logps/rejected": -269.18701171875, + "loss": 0.0564, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2547444999217987, + "rewards/margins": 7.713589191436768, + "rewards/rejected": -7.968333721160889, + "step": 858 + }, + { + "epoch": 1.12, + "learning_rate": 3.661075726414986e-05, + "logits/chosen": -1.6282808780670166, + "logits/rejected": -1.5667518377304077, + "logps/chosen": -158.9324951171875, + "logps/rejected": -252.65725708007812, + "loss": 0.0396, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3669869303703308, + "rewards/margins": 7.350515842437744, + "rewards/rejected": -6.9835286140441895, + "step": 859 + }, + { + "epoch": 1.13, + "learning_rate": 3.6579013771065305e-05, + "logits/chosen": -1.5194811820983887, + "logits/rejected": -1.5251891613006592, + "logps/chosen": -162.73855590820312, + "logps/rejected": -289.39093017578125, + "loss": 0.0515, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.7854263186454773, + "rewards/margins": 9.395334243774414, + "rewards/rejected": -8.609909057617188, + "step": 860 + }, + { + "epoch": 1.13, + "learning_rate": 3.654724649369348e-05, + "logits/chosen": -1.7831377983093262, + "logits/rejected": -1.8511077165603638, + "logps/chosen": -159.1595916748047, + "logps/rejected": -260.9335021972656, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.343432903289795, + "rewards/margins": 9.590715408325195, + "rewards/rejected": -8.247282981872559, + "step": 861 + }, + { + "epoch": 1.13, + "learning_rate": 3.651545549728709e-05, + "logits/chosen": -1.8324871063232422, + "logits/rejected": -1.8227266073226929, + "logps/chosen": -187.99771118164062, + "logps/rejected": -261.81964111328125, + "loss": 0.091, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.018547460436820984, + "rewards/margins": 7.8460283279418945, + "rewards/rejected": -7.86457633972168, + "step": 862 + }, + { + "epoch": 1.13, + "learning_rate": 3.6483640847147554e-05, + "logits/chosen": -1.7904331684112549, + "logits/rejected": -1.7915871143341064, + "logps/chosen": -162.7968292236328, + "logps/rejected": -221.08908081054688, + "loss": 0.0596, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.10385438799858093, + "rewards/margins": 7.333845138549805, + "rewards/rejected": -7.2299909591674805, + "step": 863 + }, + { + "epoch": 1.13, + "learning_rate": 3.645180260862492e-05, + "logits/chosen": -1.6333101987838745, + "logits/rejected": -1.680311918258667, + "logps/chosen": -205.26104736328125, + "logps/rejected": -275.9607849121094, + "loss": 0.0143, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6030675768852234, + "rewards/margins": 7.268556118011475, + "rewards/rejected": -7.8716230392456055, + "step": 864 + }, + { + "epoch": 1.13, + "learning_rate": 3.6419940847117626e-05, + "logits/chosen": -1.7607299089431763, + "logits/rejected": -1.7272199392318726, + "logps/chosen": -186.3272705078125, + "logps/rejected": -243.32936096191406, + "loss": 0.0258, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5442850589752197, + "rewards/margins": 6.532572269439697, + "rewards/rejected": -7.076857089996338, + "step": 865 + }, + { + "epoch": 1.13, + "learning_rate": 3.638805562807249e-05, + "logits/chosen": -1.6750125885009766, + "logits/rejected": -1.6804828643798828, + "logps/chosen": -183.07521057128906, + "logps/rejected": -282.3734130859375, + "loss": 0.0878, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4056967496871948, + "rewards/margins": 8.726588249206543, + "rewards/rejected": -8.320891380310059, + "step": 866 + }, + { + "epoch": 1.13, + "learning_rate": 3.635614701698448e-05, + "logits/chosen": -1.6088908910751343, + "logits/rejected": -1.6467278003692627, + "logps/chosen": -171.95211791992188, + "logps/rejected": -264.917236328125, + "loss": 0.0761, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2036418467760086, + "rewards/margins": 8.157671928405762, + "rewards/rejected": -8.36131477355957, + "step": 867 + }, + { + "epoch": 1.14, + "learning_rate": 3.632421507939661e-05, + "logits/chosen": -1.5411345958709717, + "logits/rejected": -1.5687683820724487, + "logps/chosen": -171.95297241210938, + "logps/rejected": -265.6844177246094, + "loss": 0.0293, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8591725826263428, + "rewards/margins": 8.619390487670898, + "rewards/rejected": -7.760217666625977, + "step": 868 + }, + { + "epoch": 1.14, + "learning_rate": 3.629225988089983e-05, + "logits/chosen": -1.677573800086975, + "logits/rejected": -1.7168667316436768, + "logps/chosen": -180.29104614257812, + "logps/rejected": -263.3822326660156, + "loss": 0.0561, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3133031129837036, + "rewards/margins": 8.842041015625, + "rewards/rejected": -8.528738021850586, + "step": 869 + }, + { + "epoch": 1.14, + "learning_rate": 3.6260281487132846e-05, + "logits/chosen": -1.5518522262573242, + "logits/rejected": -1.5730174779891968, + "logps/chosen": -184.6771697998047, + "logps/rejected": -269.0153503417969, + "loss": 0.0161, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7430427670478821, + "rewards/margins": 8.222708702087402, + "rewards/rejected": -7.479666233062744, + "step": 870 + }, + { + "epoch": 1.14, + "learning_rate": 3.622827996378203e-05, + "logits/chosen": -1.8273077011108398, + "logits/rejected": -1.753129482269287, + "logps/chosen": -192.535400390625, + "logps/rejected": -255.18661499023438, + "loss": 0.0121, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2789681851863861, + "rewards/margins": 7.683708190917969, + "rewards/rejected": -7.962676525115967, + "step": 871 + }, + { + "epoch": 1.14, + "learning_rate": 3.6196255376581254e-05, + "logits/chosen": -1.723561406135559, + "logits/rejected": -1.7448184490203857, + "logps/chosen": -179.89093017578125, + "logps/rejected": -277.4232177734375, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9937200546264648, + "rewards/margins": 9.481063842773438, + "rewards/rejected": -8.487343788146973, + "step": 872 + }, + { + "epoch": 1.14, + "learning_rate": 3.616420779131177e-05, + "logits/chosen": -1.7428555488586426, + "logits/rejected": -1.8246798515319824, + "logps/chosen": -175.7808380126953, + "logps/rejected": -260.22637939453125, + "loss": 0.0549, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8115906119346619, + "rewards/margins": 8.594477653503418, + "rewards/rejected": -7.782886981964111, + "step": 873 + }, + { + "epoch": 1.14, + "learning_rate": 3.613213727380206e-05, + "logits/chosen": -1.8163658380508423, + "logits/rejected": -1.8145133256912231, + "logps/chosen": -163.59075927734375, + "logps/rejected": -231.6703338623047, + "loss": 0.0934, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6859133243560791, + "rewards/margins": 5.622536659240723, + "rewards/rejected": -6.308449745178223, + "step": 874 + }, + { + "epoch": 1.15, + "learning_rate": 3.610004388992771e-05, + "logits/chosen": -1.7247226238250732, + "logits/rejected": -1.7080042362213135, + "logps/chosen": -160.51553344726562, + "logps/rejected": -245.70233154296875, + "loss": 0.0738, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.1169636249542236, + "rewards/margins": 9.182759284973145, + "rewards/rejected": -8.065794944763184, + "step": 875 + }, + { + "epoch": 1.15, + "learning_rate": 3.6067927705611304e-05, + "logits/chosen": -1.6803061962127686, + "logits/rejected": -1.696300745010376, + "logps/chosen": -163.08665466308594, + "logps/rejected": -258.111328125, + "loss": 0.1504, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.11361874639987946, + "rewards/margins": 7.612594127655029, + "rewards/rejected": -7.49897575378418, + "step": 876 + }, + { + "epoch": 1.15, + "learning_rate": 3.6035788786822225e-05, + "logits/chosen": -1.7864495515823364, + "logits/rejected": -1.8517754077911377, + "logps/chosen": -169.38356018066406, + "logps/rejected": -261.1342468261719, + "loss": 0.0211, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9378201365470886, + "rewards/margins": 8.446355819702148, + "rewards/rejected": -7.508536338806152, + "step": 877 + }, + { + "epoch": 1.15, + "learning_rate": 3.6003627199576564e-05, + "logits/chosen": -1.6908057928085327, + "logits/rejected": -1.7146823406219482, + "logps/chosen": -149.77984619140625, + "logps/rejected": -215.45220947265625, + "loss": 0.0889, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.16511961817741394, + "rewards/margins": 6.96973180770874, + "rewards/rejected": -7.134850978851318, + "step": 878 + }, + { + "epoch": 1.15, + "learning_rate": 3.597144300993699e-05, + "logits/chosen": -1.8882876634597778, + "logits/rejected": -1.9150993824005127, + "logps/chosen": -143.28770446777344, + "logps/rejected": -224.34437561035156, + "loss": 0.0574, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.4502476453781128, + "rewards/margins": 7.336152076721191, + "rewards/rejected": -6.885904312133789, + "step": 879 + }, + { + "epoch": 1.15, + "learning_rate": 3.593923628401259e-05, + "logits/chosen": -1.8090243339538574, + "logits/rejected": -1.8175634145736694, + "logps/chosen": -148.93310546875, + "logps/rejected": -242.72918701171875, + "loss": 0.0463, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.103208303451538, + "rewards/margins": 8.617406845092773, + "rewards/rejected": -7.514198303222656, + "step": 880 + }, + { + "epoch": 1.15, + "learning_rate": 3.5907007087958726e-05, + "logits/chosen": -1.803382396697998, + "logits/rejected": -1.7716857194900513, + "logps/chosen": -183.49339294433594, + "logps/rejected": -251.29864501953125, + "loss": 0.1326, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4789639115333557, + "rewards/margins": 6.777097225189209, + "rewards/rejected": -7.256060600280762, + "step": 881 + }, + { + "epoch": 1.15, + "learning_rate": 3.587475548797694e-05, + "logits/chosen": -1.5749878883361816, + "logits/rejected": -1.617025375366211, + "logps/chosen": -148.74200439453125, + "logps/rejected": -228.47140502929688, + "loss": 0.0471, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.5491048097610474, + "rewards/margins": 7.191831588745117, + "rewards/rejected": -6.642726421356201, + "step": 882 + }, + { + "epoch": 1.16, + "learning_rate": 3.5842481550314794e-05, + "logits/chosen": -1.8245117664337158, + "logits/rejected": -1.80112886428833, + "logps/chosen": -165.4649200439453, + "logps/rejected": -213.1611785888672, + "loss": 0.0503, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.9193047881126404, + "rewards/margins": 7.948483467102051, + "rewards/rejected": -7.029178619384766, + "step": 883 + }, + { + "epoch": 1.16, + "learning_rate": 3.581018534126571e-05, + "logits/chosen": -1.801705002784729, + "logits/rejected": -1.834633231163025, + "logps/chosen": -170.57785034179688, + "logps/rejected": -256.3074035644531, + "loss": 0.0518, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.03511929512023926, + "rewards/margins": 8.271900177001953, + "rewards/rejected": -8.236780166625977, + "step": 884 + }, + { + "epoch": 1.16, + "learning_rate": 3.577786692716886e-05, + "logits/chosen": -1.6602346897125244, + "logits/rejected": -1.683791995048523, + "logps/chosen": -179.22866821289062, + "logps/rejected": -301.5026550292969, + "loss": 0.0268, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2062432765960693, + "rewards/margins": 9.739579200744629, + "rewards/rejected": -8.533336639404297, + "step": 885 + }, + { + "epoch": 1.16, + "learning_rate": 3.574552637440907e-05, + "logits/chosen": -1.6858350038528442, + "logits/rejected": -1.6984546184539795, + "logps/chosen": -150.21563720703125, + "logps/rejected": -213.68753051757812, + "loss": 0.1469, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.48701292276382446, + "rewards/margins": 6.276709079742432, + "rewards/rejected": -5.789695739746094, + "step": 886 + }, + { + "epoch": 1.16, + "learning_rate": 3.571316374941658e-05, + "logits/chosen": -2.0152809619903564, + "logits/rejected": -2.0364301204681396, + "logps/chosen": -176.3936767578125, + "logps/rejected": -234.19671630859375, + "loss": 0.1341, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20730283856391907, + "rewards/margins": 6.322017192840576, + "rewards/rejected": -6.529320240020752, + "step": 887 + }, + { + "epoch": 1.16, + "learning_rate": 3.568077911866703e-05, + "logits/chosen": -1.8245972394943237, + "logits/rejected": -1.8635644912719727, + "logps/chosen": -180.6774444580078, + "logps/rejected": -273.9190673828125, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14365682005882263, + "rewards/margins": 8.138999938964844, + "rewards/rejected": -7.995343208312988, + "step": 888 + }, + { + "epoch": 1.16, + "learning_rate": 3.564837254868118e-05, + "logits/chosen": -1.8740934133529663, + "logits/rejected": -1.8795528411865234, + "logps/chosen": -160.5602569580078, + "logps/rejected": -250.400634765625, + "loss": 0.0483, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7248006463050842, + "rewards/margins": 7.648869514465332, + "rewards/rejected": -8.373669624328613, + "step": 889 + }, + { + "epoch": 1.16, + "learning_rate": 3.561594410602495e-05, + "logits/chosen": -1.851047158241272, + "logits/rejected": -1.8520572185516357, + "logps/chosen": -202.7530059814453, + "logps/rejected": -255.25729370117188, + "loss": 0.019, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1345672458410263, + "rewards/margins": 7.931034564971924, + "rewards/rejected": -7.796467304229736, + "step": 890 + }, + { + "epoch": 1.17, + "learning_rate": 3.558349385730913e-05, + "logits/chosen": -1.9940862655639648, + "logits/rejected": -1.9213124513626099, + "logps/chosen": -200.50384521484375, + "logps/rejected": -277.02764892578125, + "loss": 0.014, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5816916823387146, + "rewards/margins": 8.134634971618652, + "rewards/rejected": -7.552943229675293, + "step": 891 + }, + { + "epoch": 1.17, + "learning_rate": 3.5551021869189286e-05, + "logits/chosen": -1.916977047920227, + "logits/rejected": -1.9621555805206299, + "logps/chosen": -175.3245391845703, + "logps/rejected": -248.02911376953125, + "loss": 0.1087, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.9088395833969116, + "rewards/margins": 7.577523231506348, + "rewards/rejected": -6.668683052062988, + "step": 892 + }, + { + "epoch": 1.17, + "learning_rate": 3.55185282083657e-05, + "logits/chosen": -1.830249547958374, + "logits/rejected": -1.9227575063705444, + "logps/chosen": -157.8826446533203, + "logps/rejected": -272.9053649902344, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1682499647140503, + "rewards/margins": 9.896993637084961, + "rewards/rejected": -8.728743553161621, + "step": 893 + }, + { + "epoch": 1.17, + "learning_rate": 3.548601294158313e-05, + "logits/chosen": -1.9553158283233643, + "logits/rejected": -1.9773513078689575, + "logps/chosen": -223.3098907470703, + "logps/rejected": -267.57659912109375, + "loss": 0.1966, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7544253468513489, + "rewards/margins": 5.037443161010742, + "rewards/rejected": -5.791868686676025, + "step": 894 + }, + { + "epoch": 1.17, + "learning_rate": 3.5453476135630706e-05, + "logits/chosen": -2.1675331592559814, + "logits/rejected": -2.06319522857666, + "logps/chosen": -168.5749053955078, + "logps/rejected": -212.3041229248047, + "loss": 0.0527, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.8117193579673767, + "rewards/margins": 7.594612121582031, + "rewards/rejected": -6.782892227172852, + "step": 895 + }, + { + "epoch": 1.17, + "learning_rate": 3.542091785734184e-05, + "logits/chosen": -1.8826407194137573, + "logits/rejected": -1.9465281963348389, + "logps/chosen": -161.4238739013672, + "logps/rejected": -249.60406494140625, + "loss": 0.0825, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.6117186546325684, + "rewards/margins": 8.12980842590332, + "rewards/rejected": -7.518089771270752, + "step": 896 + }, + { + "epoch": 1.17, + "learning_rate": 3.538833817359401e-05, + "logits/chosen": -1.9344408512115479, + "logits/rejected": -1.9823689460754395, + "logps/chosen": -188.45948791503906, + "logps/rejected": -285.7137756347656, + "loss": 0.1665, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.4131191074848175, + "rewards/margins": 9.048032760620117, + "rewards/rejected": -8.63491439819336, + "step": 897 + }, + { + "epoch": 1.18, + "learning_rate": 3.5355737151308686e-05, + "logits/chosen": -1.7558889389038086, + "logits/rejected": -1.7752621173858643, + "logps/chosen": -162.7386016845703, + "logps/rejected": -242.3275146484375, + "loss": 0.0623, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5808318853378296, + "rewards/margins": 6.759397029876709, + "rewards/rejected": -7.340229034423828, + "step": 898 + }, + { + "epoch": 1.18, + "learning_rate": 3.5323114857451174e-05, + "logits/chosen": -2.1031816005706787, + "logits/rejected": -2.0772862434387207, + "logps/chosen": -179.40110778808594, + "logps/rejected": -246.42173767089844, + "loss": 0.0558, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.9522780776023865, + "rewards/margins": 8.00682258605957, + "rewards/rejected": -7.054544448852539, + "step": 899 + }, + { + "epoch": 1.18, + "learning_rate": 3.529047135903045e-05, + "logits/chosen": -1.9245232343673706, + "logits/rejected": -1.9527860879898071, + "logps/chosen": -164.22299194335938, + "logps/rejected": -268.86181640625, + "loss": 0.0468, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.6341921091079712, + "rewards/margins": 9.523921012878418, + "rewards/rejected": -8.889729499816895, + "step": 900 + }, + { + "epoch": 1.18, + "learning_rate": 3.525780672309907e-05, + "logits/chosen": -1.7722699642181396, + "logits/rejected": -1.789467215538025, + "logps/chosen": -152.01785278320312, + "logps/rejected": -239.99957275390625, + "loss": 0.022, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3282092809677124, + "rewards/margins": 9.678154945373535, + "rewards/rejected": -8.349946022033691, + "step": 901 + }, + { + "epoch": 1.18, + "learning_rate": 3.522512101675299e-05, + "logits/chosen": -1.8270869255065918, + "logits/rejected": -1.898951530456543, + "logps/chosen": -140.39678955078125, + "logps/rejected": -211.85061645507812, + "loss": 0.1175, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8393736481666565, + "rewards/margins": 6.014346122741699, + "rewards/rejected": -6.853720188140869, + "step": 902 + }, + { + "epoch": 1.18, + "learning_rate": 3.519241430713145e-05, + "logits/chosen": -1.741802453994751, + "logits/rejected": -1.7483961582183838, + "logps/chosen": -193.8229217529297, + "logps/rejected": -237.98211669921875, + "loss": 0.052, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.886393129825592, + "rewards/margins": 5.339400291442871, + "rewards/rejected": -6.22579288482666, + "step": 903 + }, + { + "epoch": 1.18, + "learning_rate": 3.5159686661416834e-05, + "logits/chosen": -1.872272253036499, + "logits/rejected": -1.914929747581482, + "logps/chosen": -190.64895629882812, + "logps/rejected": -272.30865478515625, + "loss": 0.068, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.09860727936029434, + "rewards/margins": 7.040101051330566, + "rewards/rejected": -6.941493988037109, + "step": 904 + }, + { + "epoch": 1.18, + "learning_rate": 3.512693814683456e-05, + "logits/chosen": -1.8728840351104736, + "logits/rejected": -1.8658220767974854, + "logps/chosen": -191.35928344726562, + "logps/rejected": -261.27423095703125, + "loss": 0.0753, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.8436151146888733, + "rewards/margins": 7.535486221313477, + "rewards/rejected": -6.691871166229248, + "step": 905 + }, + { + "epoch": 1.19, + "learning_rate": 3.5094168830652854e-05, + "logits/chosen": -1.8646080493927002, + "logits/rejected": -1.864649772644043, + "logps/chosen": -172.74118041992188, + "logps/rejected": -247.68109130859375, + "loss": 0.0411, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3584592640399933, + "rewards/margins": 7.375222206115723, + "rewards/rejected": -7.016762733459473, + "step": 906 + }, + { + "epoch": 1.19, + "learning_rate": 3.506137878018272e-05, + "logits/chosen": -1.9659096002578735, + "logits/rejected": -1.9836225509643555, + "logps/chosen": -160.93516540527344, + "logps/rejected": -274.54052734375, + "loss": 0.012, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14322397112846375, + "rewards/margins": 8.324197769165039, + "rewards/rejected": -8.180973052978516, + "step": 907 + }, + { + "epoch": 1.19, + "learning_rate": 3.502856806277773e-05, + "logits/chosen": -1.8923836946487427, + "logits/rejected": -1.8981733322143555, + "logps/chosen": -187.4054412841797, + "logps/rejected": -274.0010681152344, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.868060290813446, + "rewards/margins": 10.109868049621582, + "rewards/rejected": -9.24180793762207, + "step": 908 + }, + { + "epoch": 1.19, + "learning_rate": 3.4995736745833895e-05, + "logits/chosen": -2.094193696975708, + "logits/rejected": -2.0947916507720947, + "logps/chosen": -154.0366973876953, + "logps/rejected": -235.6873016357422, + "loss": 0.1445, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.21441936492919922, + "rewards/margins": 6.727463722229004, + "rewards/rejected": -6.5130438804626465, + "step": 909 + }, + { + "epoch": 1.19, + "learning_rate": 3.496288489678958e-05, + "logits/chosen": -1.4902489185333252, + "logits/rejected": -1.5242319107055664, + "logps/chosen": -194.15325927734375, + "logps/rejected": -288.6688537597656, + "loss": 0.0442, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.15938527882099152, + "rewards/margins": 8.332877159118652, + "rewards/rejected": -8.492262840270996, + "step": 910 + }, + { + "epoch": 1.19, + "learning_rate": 3.493001258312529e-05, + "logits/chosen": -1.9223310947418213, + "logits/rejected": -1.9442507028579712, + "logps/chosen": -158.0222930908203, + "logps/rejected": -258.5923767089844, + "loss": 0.0455, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13621219992637634, + "rewards/margins": 8.879887580871582, + "rewards/rejected": -8.743675231933594, + "step": 911 + }, + { + "epoch": 1.19, + "learning_rate": 3.489711987236357e-05, + "logits/chosen": -1.8931705951690674, + "logits/rejected": -1.92643404006958, + "logps/chosen": -189.59190368652344, + "logps/rejected": -254.67291259765625, + "loss": 0.0406, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.908362090587616, + "rewards/margins": 6.802270412445068, + "rewards/rejected": -7.71063232421875, + "step": 912 + }, + { + "epoch": 1.19, + "learning_rate": 3.4864206832068884e-05, + "logits/chosen": -1.6683810949325562, + "logits/rejected": -1.6903074979782104, + "logps/chosen": -174.6500244140625, + "logps/rejected": -238.3423614501953, + "loss": 0.1404, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7650407552719116, + "rewards/margins": 6.08333158493042, + "rewards/rejected": -6.848372936248779, + "step": 913 + }, + { + "epoch": 1.2, + "learning_rate": 3.483127352984742e-05, + "logits/chosen": -1.5658330917358398, + "logits/rejected": -1.5612378120422363, + "logps/chosen": -191.35238647460938, + "logps/rejected": -275.3499450683594, + "loss": 0.0956, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4658517837524414, + "rewards/margins": 7.195768356323242, + "rewards/rejected": -7.661620140075684, + "step": 914 + }, + { + "epoch": 1.2, + "learning_rate": 3.479832003334702e-05, + "logits/chosen": -1.767727017402649, + "logits/rejected": -1.743235468864441, + "logps/chosen": -194.4393310546875, + "logps/rejected": -251.51502990722656, + "loss": 0.1065, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.39378470182418823, + "rewards/margins": 6.869220733642578, + "rewards/rejected": -7.26300573348999, + "step": 915 + }, + { + "epoch": 1.2, + "learning_rate": 3.476534641025698e-05, + "logits/chosen": -1.7358970642089844, + "logits/rejected": -1.6227883100509644, + "logps/chosen": -164.82554626464844, + "logps/rejected": -246.82467651367188, + "loss": 0.0634, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.006584217771887779, + "rewards/margins": 6.655069351196289, + "rewards/rejected": -6.6484856605529785, + "step": 916 + }, + { + "epoch": 1.2, + "learning_rate": 3.4732352728307966e-05, + "logits/chosen": -1.9275317192077637, + "logits/rejected": -1.9865800142288208, + "logps/chosen": -209.73939514160156, + "logps/rejected": -301.17333984375, + "loss": 0.1449, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.43461543321609497, + "rewards/margins": 7.842949390411377, + "rewards/rejected": -7.408333778381348, + "step": 917 + }, + { + "epoch": 1.2, + "learning_rate": 3.469933905527182e-05, + "logits/chosen": -1.8539807796478271, + "logits/rejected": -1.8448735475540161, + "logps/chosen": -147.5707244873047, + "logps/rejected": -214.45086669921875, + "loss": 0.0996, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.026866242289543152, + "rewards/margins": 6.803654670715332, + "rewards/rejected": -6.8305206298828125, + "step": 918 + }, + { + "epoch": 1.2, + "learning_rate": 3.466630545896146e-05, + "logits/chosen": -1.87417471408844, + "logits/rejected": -1.8515712022781372, + "logps/chosen": -181.49510192871094, + "logps/rejected": -237.30181884765625, + "loss": 0.0531, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1314505785703659, + "rewards/margins": 6.971901893615723, + "rewards/rejected": -6.840450763702393, + "step": 919 + }, + { + "epoch": 1.2, + "learning_rate": 3.463325200723071e-05, + "logits/chosen": -1.7973699569702148, + "logits/rejected": -1.896343469619751, + "logps/chosen": -149.91091918945312, + "logps/rejected": -225.79420471191406, + "loss": 0.0931, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6555187106132507, + "rewards/margins": 6.618897438049316, + "rewards/rejected": -7.274415969848633, + "step": 920 + }, + { + "epoch": 1.21, + "learning_rate": 3.460017876797422e-05, + "logits/chosen": -1.7525713443756104, + "logits/rejected": -1.6888906955718994, + "logps/chosen": -205.05142211914062, + "logps/rejected": -276.8188781738281, + "loss": 0.1139, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1992167234420776, + "rewards/margins": 6.552946090698242, + "rewards/rejected": -7.752162933349609, + "step": 921 + }, + { + "epoch": 1.21, + "learning_rate": 3.456708580912725e-05, + "logits/chosen": -2.064255475997925, + "logits/rejected": -2.0105979442596436, + "logps/chosen": -180.6109619140625, + "logps/rejected": -242.7440948486328, + "loss": 0.0656, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.47643887996673584, + "rewards/margins": 6.825469493865967, + "rewards/rejected": -7.301907539367676, + "step": 922 + }, + { + "epoch": 1.21, + "learning_rate": 3.453397319866557e-05, + "logits/chosen": -1.9662121534347534, + "logits/rejected": -1.9616503715515137, + "logps/chosen": -158.33319091796875, + "logps/rejected": -220.38682556152344, + "loss": 0.1106, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5450817346572876, + "rewards/margins": 5.366931438446045, + "rewards/rejected": -5.912014007568359, + "step": 923 + }, + { + "epoch": 1.21, + "learning_rate": 3.4500841004605324e-05, + "logits/chosen": -1.5947680473327637, + "logits/rejected": -1.641003131866455, + "logps/chosen": -191.16773986816406, + "logps/rejected": -282.4808044433594, + "loss": 0.1363, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3834429085254669, + "rewards/margins": 9.609367370605469, + "rewards/rejected": -9.225924491882324, + "step": 924 + }, + { + "epoch": 1.21, + "learning_rate": 3.446768929500288e-05, + "logits/chosen": -1.9656989574432373, + "logits/rejected": -1.9933035373687744, + "logps/chosen": -169.61203002929688, + "logps/rejected": -272.0287170410156, + "loss": 0.0187, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.409053236246109, + "rewards/margins": 9.076614379882812, + "rewards/rejected": -8.667560577392578, + "step": 925 + }, + { + "epoch": 1.21, + "learning_rate": 3.443451813795469e-05, + "logits/chosen": -1.8398690223693848, + "logits/rejected": -1.8831474781036377, + "logps/chosen": -211.554931640625, + "logps/rejected": -309.2585144042969, + "loss": 0.0769, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6313567161560059, + "rewards/margins": 8.130779266357422, + "rewards/rejected": -8.762136459350586, + "step": 926 + }, + { + "epoch": 1.21, + "learning_rate": 3.4401327601597174e-05, + "logits/chosen": -1.985721468925476, + "logits/rejected": -1.9450913667678833, + "logps/chosen": -218.31907653808594, + "logps/rejected": -290.0771484375, + "loss": 0.0982, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06794175505638123, + "rewards/margins": 7.096756935119629, + "rewards/rejected": -7.028815746307373, + "step": 927 + }, + { + "epoch": 1.21, + "learning_rate": 3.436811775410651e-05, + "logits/chosen": -1.8266397714614868, + "logits/rejected": -1.869195580482483, + "logps/chosen": -158.58258056640625, + "logps/rejected": -253.25064086914062, + "loss": 0.0554, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.1370869874954224, + "rewards/margins": 9.21893310546875, + "rewards/rejected": -8.081846237182617, + "step": 928 + }, + { + "epoch": 1.22, + "learning_rate": 3.43348886636986e-05, + "logits/chosen": -1.9212427139282227, + "logits/rejected": -1.9511475563049316, + "logps/chosen": -158.55897521972656, + "logps/rejected": -246.16014099121094, + "loss": 0.062, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.13792043924331665, + "rewards/margins": 8.233397483825684, + "rewards/rejected": -8.095476150512695, + "step": 929 + }, + { + "epoch": 1.22, + "learning_rate": 3.430164039862882e-05, + "logits/chosen": -1.6417392492294312, + "logits/rejected": -1.6897914409637451, + "logps/chosen": -168.390625, + "logps/rejected": -240.8642578125, + "loss": 0.0492, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.16894683241844177, + "rewards/margins": 7.576196670532227, + "rewards/rejected": -7.407250881195068, + "step": 930 + }, + { + "epoch": 1.22, + "learning_rate": 3.426837302719197e-05, + "logits/chosen": -1.8382885456085205, + "logits/rejected": -1.816691517829895, + "logps/chosen": -228.66392517089844, + "logps/rejected": -333.4481201171875, + "loss": 0.0483, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.22568544745445251, + "rewards/margins": 8.674262046813965, + "rewards/rejected": -8.899946212768555, + "step": 931 + }, + { + "epoch": 1.22, + "learning_rate": 3.42350866177221e-05, + "logits/chosen": -1.8598941564559937, + "logits/rejected": -1.8177353143692017, + "logps/chosen": -168.64523315429688, + "logps/rejected": -267.4147644042969, + "loss": 0.0688, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.040854811668395996, + "rewards/margins": 7.731595516204834, + "rewards/rejected": -7.772449970245361, + "step": 932 + }, + { + "epoch": 1.22, + "learning_rate": 3.420178123859233e-05, + "logits/chosen": -1.762475609779358, + "logits/rejected": -1.712884545326233, + "logps/chosen": -189.55911254882812, + "logps/rejected": -261.5013427734375, + "loss": 0.0699, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7337988615036011, + "rewards/margins": 6.591477870941162, + "rewards/rejected": -7.3252763748168945, + "step": 933 + }, + { + "epoch": 1.22, + "learning_rate": 3.416845695821476e-05, + "logits/chosen": -1.8344154357910156, + "logits/rejected": -1.8057382106781006, + "logps/chosen": -180.87850952148438, + "logps/rejected": -250.47337341308594, + "loss": 0.0139, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.29066789150238037, + "rewards/margins": 7.957887649536133, + "rewards/rejected": -8.248556137084961, + "step": 934 + }, + { + "epoch": 1.22, + "learning_rate": 3.413511384504034e-05, + "logits/chosen": -2.0944817066192627, + "logits/rejected": -2.08333420753479, + "logps/chosen": -171.9430694580078, + "logps/rejected": -253.50955200195312, + "loss": 0.0256, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2787511646747589, + "rewards/margins": 8.49667739868164, + "rewards/rejected": -8.775429725646973, + "step": 935 + }, + { + "epoch": 1.22, + "learning_rate": 3.410175196755866e-05, + "logits/chosen": -1.9048943519592285, + "logits/rejected": -1.8727785348892212, + "logps/chosen": -178.324951171875, + "logps/rejected": -269.0649719238281, + "loss": 0.0686, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0880991518497467, + "rewards/margins": 8.45480728149414, + "rewards/rejected": -8.542905807495117, + "step": 936 + }, + { + "epoch": 1.23, + "learning_rate": 3.40683713942979e-05, + "logits/chosen": -1.671642780303955, + "logits/rejected": -1.6982778310775757, + "logps/chosen": -176.78634643554688, + "logps/rejected": -280.5889892578125, + "loss": 0.0955, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5775284767150879, + "rewards/margins": 8.944602966308594, + "rewards/rejected": -9.52213191986084, + "step": 937 + }, + { + "epoch": 1.23, + "learning_rate": 3.403497219382461e-05, + "logits/chosen": -1.9782202243804932, + "logits/rejected": -1.998267650604248, + "logps/chosen": -163.75387573242188, + "logps/rejected": -259.76483154296875, + "loss": 0.0988, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2686743140220642, + "rewards/margins": 8.010343551635742, + "rewards/rejected": -7.741668701171875, + "step": 938 + }, + { + "epoch": 1.23, + "learning_rate": 3.400155443474361e-05, + "logits/chosen": -1.8512263298034668, + "logits/rejected": -1.8064016103744507, + "logps/chosen": -194.15982055664062, + "logps/rejected": -293.0312194824219, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.759626567363739, + "rewards/margins": 10.006168365478516, + "rewards/rejected": -9.246540069580078, + "step": 939 + }, + { + "epoch": 1.23, + "learning_rate": 3.396811818569785e-05, + "logits/chosen": -1.7697315216064453, + "logits/rejected": -1.7975068092346191, + "logps/chosen": -168.0388946533203, + "logps/rejected": -251.87774658203125, + "loss": 0.0526, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.377260446548462, + "rewards/margins": 9.335554122924805, + "rewards/rejected": -7.9582929611206055, + "step": 940 + }, + { + "epoch": 1.23, + "learning_rate": 3.3934663515368236e-05, + "logits/chosen": -1.8283406496047974, + "logits/rejected": -1.9062525033950806, + "logps/chosen": -164.63389587402344, + "logps/rejected": -247.40115356445312, + "loss": 0.1111, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09177665412425995, + "rewards/margins": 7.34284782409668, + "rewards/rejected": -7.434624671936035, + "step": 941 + }, + { + "epoch": 1.23, + "learning_rate": 3.3901190492473554e-05, + "logits/chosen": -1.8158893585205078, + "logits/rejected": -1.8978768587112427, + "logps/chosen": -171.03732299804688, + "logps/rejected": -265.78216552734375, + "loss": 0.0324, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36732611060142517, + "rewards/margins": 8.224309921264648, + "rewards/rejected": -8.591635704040527, + "step": 942 + }, + { + "epoch": 1.23, + "learning_rate": 3.3867699185770255e-05, + "logits/chosen": -1.5865942239761353, + "logits/rejected": -1.663236379623413, + "logps/chosen": -207.1858367919922, + "logps/rejected": -319.83319091796875, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15542030334472656, + "rewards/margins": 10.122888565063477, + "rewards/rejected": -10.278308868408203, + "step": 943 + }, + { + "epoch": 1.24, + "learning_rate": 3.383418966405234e-05, + "logits/chosen": -1.7222788333892822, + "logits/rejected": -1.751545786857605, + "logps/chosen": -174.07720947265625, + "logps/rejected": -266.2615051269531, + "loss": 0.1213, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.24621230363845825, + "rewards/margins": 8.12299919128418, + "rewards/rejected": -7.876787185668945, + "step": 944 + }, + { + "epoch": 1.24, + "learning_rate": 3.3800661996151264e-05, + "logits/chosen": -1.7203212976455688, + "logits/rejected": -1.7764899730682373, + "logps/chosen": -164.42947387695312, + "logps/rejected": -256.416259765625, + "loss": 0.0529, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7283108234405518, + "rewards/margins": 8.709585189819336, + "rewards/rejected": -7.981274604797363, + "step": 945 + }, + { + "epoch": 1.24, + "learning_rate": 3.376711625093571e-05, + "logits/chosen": -1.603279709815979, + "logits/rejected": -1.5945706367492676, + "logps/chosen": -192.04019165039062, + "logps/rejected": -271.12579345703125, + "loss": 0.0242, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3455383777618408, + "rewards/margins": 7.874174118041992, + "rewards/rejected": -8.219711303710938, + "step": 946 + }, + { + "epoch": 1.24, + "learning_rate": 3.373355249731153e-05, + "logits/chosen": -1.7255451679229736, + "logits/rejected": -1.7624372243881226, + "logps/chosen": -164.78860473632812, + "logps/rejected": -276.87115478515625, + "loss": 0.0507, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0089190006256104, + "rewards/margins": 9.92430305480957, + "rewards/rejected": -8.915383338928223, + "step": 947 + }, + { + "epoch": 1.24, + "learning_rate": 3.369997080422155e-05, + "logits/chosen": -1.7678481340408325, + "logits/rejected": -1.833672285079956, + "logps/chosen": -194.07901000976562, + "logps/rejected": -296.79266357421875, + "loss": 0.0517, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4331166446208954, + "rewards/margins": 10.406550407409668, + "rewards/rejected": -9.973432540893555, + "step": 948 + }, + { + "epoch": 1.24, + "learning_rate": 3.366637124064544e-05, + "logits/chosen": -1.9094618558883667, + "logits/rejected": -1.9102199077606201, + "logps/chosen": -166.8306121826172, + "logps/rejected": -290.0064697265625, + "loss": 0.0532, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8467321395874023, + "rewards/margins": 10.709648132324219, + "rewards/rejected": -9.8629150390625, + "step": 949 + }, + { + "epoch": 1.24, + "learning_rate": 3.36327538755996e-05, + "logits/chosen": -1.890424370765686, + "logits/rejected": -1.8965479135513306, + "logps/chosen": -203.60035705566406, + "logps/rejected": -275.9346923828125, + "loss": 0.1489, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8383345603942871, + "rewards/margins": 6.4683966636657715, + "rewards/rejected": -7.306731700897217, + "step": 950 + }, + { + "epoch": 1.24, + "learning_rate": 3.3599118778136965e-05, + "logits/chosen": -1.4926958084106445, + "logits/rejected": -1.5053917169570923, + "logps/chosen": -230.29598999023438, + "logps/rejected": -295.8033752441406, + "loss": 0.1053, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3924224376678467, + "rewards/margins": 7.463953495025635, + "rewards/rejected": -8.856375694274902, + "step": 951 + }, + { + "epoch": 1.25, + "learning_rate": 3.356546601734692e-05, + "logits/chosen": -1.5974106788635254, + "logits/rejected": -1.6120538711547852, + "logps/chosen": -199.6564178466797, + "logps/rejected": -310.5819396972656, + "loss": 0.0693, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2950756549835205, + "rewards/margins": 6.976012229919434, + "rewards/rejected": -8.271088600158691, + "step": 952 + }, + { + "epoch": 1.25, + "learning_rate": 3.3531795662355115e-05, + "logits/chosen": -1.8552980422973633, + "logits/rejected": -1.8632795810699463, + "logps/chosen": -192.36004638671875, + "logps/rejected": -277.8472900390625, + "loss": 0.0521, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0061737298965454, + "rewards/margins": 8.26196002960205, + "rewards/rejected": -9.268133163452148, + "step": 953 + }, + { + "epoch": 1.25, + "learning_rate": 3.349810778232335e-05, + "logits/chosen": -1.8003088235855103, + "logits/rejected": -1.7985399961471558, + "logps/chosen": -151.33642578125, + "logps/rejected": -245.3261260986328, + "loss": 0.1194, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.29827451705932617, + "rewards/margins": 8.245087623596191, + "rewards/rejected": -7.946812629699707, + "step": 954 + }, + { + "epoch": 1.25, + "learning_rate": 3.346440244644942e-05, + "logits/chosen": -1.7284401655197144, + "logits/rejected": -1.7348697185516357, + "logps/chosen": -169.25662231445312, + "logps/rejected": -268.5526428222656, + "loss": 0.0142, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.59054034948349, + "rewards/margins": 8.547279357910156, + "rewards/rejected": -9.137819290161133, + "step": 955 + }, + { + "epoch": 1.25, + "learning_rate": 3.3430679723966976e-05, + "logits/chosen": -1.776133418083191, + "logits/rejected": -1.8338634967803955, + "logps/chosen": -169.5686492919922, + "logps/rejected": -286.97802734375, + "loss": 0.125, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3752596378326416, + "rewards/margins": 8.643117904663086, + "rewards/rejected": -8.267858505249023, + "step": 956 + }, + { + "epoch": 1.25, + "learning_rate": 3.339693968414538e-05, + "logits/chosen": -1.6924769878387451, + "logits/rejected": -1.69418466091156, + "logps/chosen": -199.43775939941406, + "logps/rejected": -265.88092041015625, + "loss": 0.0189, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8435568809509277, + "rewards/margins": 7.179770469665527, + "rewards/rejected": -8.023327827453613, + "step": 957 + }, + { + "epoch": 1.25, + "learning_rate": 3.336318239628956e-05, + "logits/chosen": -1.7992136478424072, + "logits/rejected": -1.7918800115585327, + "logps/chosen": -173.89222717285156, + "logps/rejected": -242.6941375732422, + "loss": 0.0242, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014464115723967552, + "rewards/margins": 7.979608535766602, + "rewards/rejected": -7.994071960449219, + "step": 958 + }, + { + "epoch": 1.26, + "learning_rate": 3.3329407929739906e-05, + "logits/chosen": -1.8405758142471313, + "logits/rejected": -1.8193663358688354, + "logps/chosen": -191.95260620117188, + "logps/rejected": -312.0247802734375, + "loss": 0.0309, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0012057796120643616, + "rewards/margins": 9.450359344482422, + "rewards/rejected": -9.45156478881836, + "step": 959 + }, + { + "epoch": 1.26, + "learning_rate": 3.3295616353872026e-05, + "logits/chosen": -1.6075886487960815, + "logits/rejected": -1.5258815288543701, + "logps/chosen": -171.4676513671875, + "logps/rejected": -256.88238525390625, + "loss": 0.0715, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5326271057128906, + "rewards/margins": 9.00959587097168, + "rewards/rejected": -9.542222023010254, + "step": 960 + }, + { + "epoch": 1.26, + "learning_rate": 3.326180773809676e-05, + "logits/chosen": -1.7329258918762207, + "logits/rejected": -1.7353699207305908, + "logps/chosen": -167.04550170898438, + "logps/rejected": -292.188232421875, + "loss": 0.0087, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.487869143486023, + "rewards/margins": 11.961874008178711, + "rewards/rejected": -10.474005699157715, + "step": 961 + }, + { + "epoch": 1.26, + "learning_rate": 3.3227982151859873e-05, + "logits/chosen": -1.8756731748580933, + "logits/rejected": -1.9023237228393555, + "logps/chosen": -167.3818359375, + "logps/rejected": -231.07977294921875, + "loss": 0.1076, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8674279451370239, + "rewards/margins": 6.189681053161621, + "rewards/rejected": -7.0571088790893555, + "step": 962 + }, + { + "epoch": 1.26, + "learning_rate": 3.3194139664642035e-05, + "logits/chosen": -1.8310325145721436, + "logits/rejected": -1.843670129776001, + "logps/chosen": -161.2306365966797, + "logps/rejected": -294.9887390136719, + "loss": 0.0467, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.30892035365104675, + "rewards/margins": 11.805313110351562, + "rewards/rejected": -11.496391296386719, + "step": 963 + }, + { + "epoch": 1.26, + "learning_rate": 3.3160280345958614e-05, + "logits/chosen": -1.7281033992767334, + "logits/rejected": -1.7897300720214844, + "logps/chosen": -142.33782958984375, + "logps/rejected": -255.6434326171875, + "loss": 0.0104, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6555277109146118, + "rewards/margins": 9.613879203796387, + "rewards/rejected": -8.958351135253906, + "step": 964 + }, + { + "epoch": 1.26, + "learning_rate": 3.3126404265359545e-05, + "logits/chosen": -1.865938425064087, + "logits/rejected": -1.8880172967910767, + "logps/chosen": -183.08792114257812, + "logps/rejected": -271.8849182128906, + "loss": 0.029, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19245201349258423, + "rewards/margins": 8.922394752502441, + "rewards/rejected": -9.114846229553223, + "step": 965 + }, + { + "epoch": 1.26, + "learning_rate": 3.3092511492429216e-05, + "logits/chosen": -1.819749116897583, + "logits/rejected": -1.7672382593154907, + "logps/chosen": -193.35067749023438, + "logps/rejected": -261.9708251953125, + "loss": 0.0636, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.34252068400382996, + "rewards/margins": 6.740435600280762, + "rewards/rejected": -7.082956314086914, + "step": 966 + }, + { + "epoch": 1.27, + "learning_rate": 3.305860209678628e-05, + "logits/chosen": -1.6987472772598267, + "logits/rejected": -1.7095977067947388, + "logps/chosen": -139.22987365722656, + "logps/rejected": -237.2021484375, + "loss": 0.0886, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03404363989830017, + "rewards/margins": 7.799056529998779, + "rewards/rejected": -7.833099365234375, + "step": 967 + }, + { + "epoch": 1.27, + "learning_rate": 3.3024676148083555e-05, + "logits/chosen": -1.7296700477600098, + "logits/rejected": -1.7009743452072144, + "logps/chosen": -184.88128662109375, + "logps/rejected": -294.8798522949219, + "loss": 0.0092, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7556076645851135, + "rewards/margins": 10.581036567687988, + "rewards/rejected": -9.825429916381836, + "step": 968 + }, + { + "epoch": 1.27, + "learning_rate": 3.299073371600784e-05, + "logits/chosen": -1.6321117877960205, + "logits/rejected": -1.672057032585144, + "logps/chosen": -181.47630310058594, + "logps/rejected": -283.00860595703125, + "loss": 0.0612, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.08924361318349838, + "rewards/margins": 9.234962463378906, + "rewards/rejected": -9.324206352233887, + "step": 969 + }, + { + "epoch": 1.27, + "learning_rate": 3.29567748702798e-05, + "logits/chosen": -1.3825104236602783, + "logits/rejected": -1.3645695447921753, + "logps/chosen": -162.88233947753906, + "logps/rejected": -217.79019165039062, + "loss": 0.1412, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.11819088459014893, + "rewards/margins": 5.8656392097473145, + "rewards/rejected": -5.983829975128174, + "step": 970 + }, + { + "epoch": 1.27, + "learning_rate": 3.2922799680653816e-05, + "logits/chosen": -1.5520780086517334, + "logits/rejected": -1.5310850143432617, + "logps/chosen": -230.1056671142578, + "logps/rejected": -302.88671875, + "loss": 0.1557, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2427598237991333, + "rewards/margins": 6.7874555587768555, + "rewards/rejected": -8.0302152633667, + "step": 971 + }, + { + "epoch": 1.27, + "learning_rate": 3.288880821691785e-05, + "logits/chosen": -1.3305836915969849, + "logits/rejected": -1.3754537105560303, + "logps/chosen": -189.15621948242188, + "logps/rejected": -298.706298828125, + "loss": 0.0487, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9303971529006958, + "rewards/margins": 8.755258560180664, + "rewards/rejected": -9.68565559387207, + "step": 972 + }, + { + "epoch": 1.27, + "learning_rate": 3.285480054889327e-05, + "logits/chosen": -1.5995906591415405, + "logits/rejected": -1.7117842435836792, + "logps/chosen": -147.44998168945312, + "logps/rejected": -252.13072204589844, + "loss": 0.0123, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.49690061807632446, + "rewards/margins": 9.193412780761719, + "rewards/rejected": -8.696512222290039, + "step": 973 + }, + { + "epoch": 1.27, + "learning_rate": 3.2820776746434764e-05, + "logits/chosen": -1.558459997177124, + "logits/rejected": -1.5185562372207642, + "logps/chosen": -221.10980224609375, + "logps/rejected": -294.4140930175781, + "loss": 0.0953, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.43937918543815613, + "rewards/margins": 8.335892677307129, + "rewards/rejected": -8.775272369384766, + "step": 974 + }, + { + "epoch": 1.28, + "learning_rate": 3.278673687943011e-05, + "logits/chosen": -1.3949395418167114, + "logits/rejected": -1.476244568824768, + "logps/chosen": -157.56048583984375, + "logps/rejected": -242.78616333007812, + "loss": 0.1012, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5070436596870422, + "rewards/margins": 6.898143768310547, + "rewards/rejected": -7.405187129974365, + "step": 975 + }, + { + "epoch": 1.28, + "learning_rate": 3.2752681017800144e-05, + "logits/chosen": -1.8722678422927856, + "logits/rejected": -1.8975712060928345, + "logps/chosen": -189.45339965820312, + "logps/rejected": -278.3753662109375, + "loss": 0.053, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.11852896958589554, + "rewards/margins": 8.716239929199219, + "rewards/rejected": -8.834769248962402, + "step": 976 + }, + { + "epoch": 1.28, + "learning_rate": 3.27186092314985e-05, + "logits/chosen": -1.9218028783798218, + "logits/rejected": -1.8684682846069336, + "logps/chosen": -149.4844970703125, + "logps/rejected": -228.8724365234375, + "loss": 0.0556, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.9821925163269043, + "rewards/margins": 9.00061321258545, + "rewards/rejected": -8.018420219421387, + "step": 977 + }, + { + "epoch": 1.28, + "learning_rate": 3.2684521590511566e-05, + "logits/chosen": -1.8193577527999878, + "logits/rejected": -1.8498221635818481, + "logps/chosen": -167.66262817382812, + "logps/rejected": -270.91973876953125, + "loss": 0.0469, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.8246172666549683, + "rewards/margins": 9.927530288696289, + "rewards/rejected": -9.102913856506348, + "step": 978 + }, + { + "epoch": 1.28, + "learning_rate": 3.2650418164858284e-05, + "logits/chosen": -1.5121347904205322, + "logits/rejected": -1.5500718355178833, + "logps/chosen": -186.26876831054688, + "logps/rejected": -265.0696105957031, + "loss": 0.0578, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2416999489068985, + "rewards/margins": 7.793680667877197, + "rewards/rejected": -8.035380363464355, + "step": 979 + }, + { + "epoch": 1.28, + "learning_rate": 3.261629902459e-05, + "logits/chosen": -1.4318028688430786, + "logits/rejected": -1.4715421199798584, + "logps/chosen": -162.150146484375, + "logps/rejected": -278.2225341796875, + "loss": 0.0437, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.4904072284698486, + "rewards/margins": 11.90435791015625, + "rewards/rejected": -10.41395092010498, + "step": 980 + }, + { + "epoch": 1.28, + "learning_rate": 3.258216423979037e-05, + "logits/chosen": -1.8608146905899048, + "logits/rejected": -1.864013910293579, + "logps/chosen": -269.37750244140625, + "logps/rejected": -334.7197265625, + "loss": 0.1331, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6033797860145569, + "rewards/margins": 7.50750207901001, + "rewards/rejected": -8.110881805419922, + "step": 981 + }, + { + "epoch": 1.29, + "learning_rate": 3.254801388057514e-05, + "logits/chosen": -1.7995353937149048, + "logits/rejected": -1.7990977764129639, + "logps/chosen": -202.12106323242188, + "logps/rejected": -247.49078369140625, + "loss": 0.2716, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1631054878234863, + "rewards/margins": 4.216814994812012, + "rewards/rejected": -5.37992000579834, + "step": 982 + }, + { + "epoch": 1.29, + "learning_rate": 3.2513848017092113e-05, + "logits/chosen": -1.7793301343917847, + "logits/rejected": -1.8699491024017334, + "logps/chosen": -148.6905059814453, + "logps/rejected": -219.80819702148438, + "loss": 0.1464, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.534216046333313, + "rewards/margins": 6.915114402770996, + "rewards/rejected": -6.380898952484131, + "step": 983 + }, + { + "epoch": 1.29, + "learning_rate": 3.2479666719520886e-05, + "logits/chosen": -1.8794190883636475, + "logits/rejected": -1.856317400932312, + "logps/chosen": -180.57888793945312, + "logps/rejected": -245.49722290039062, + "loss": 0.1261, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5208327174186707, + "rewards/margins": 7.4504923820495605, + "rewards/rejected": -7.971325874328613, + "step": 984 + }, + { + "epoch": 1.29, + "learning_rate": 3.2445470058072766e-05, + "logits/chosen": -1.51198410987854, + "logits/rejected": -1.5328192710876465, + "logps/chosen": -184.41305541992188, + "logps/rejected": -261.3963623046875, + "loss": 0.0193, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18090005218982697, + "rewards/margins": 8.79633903503418, + "rewards/rejected": -8.615439414978027, + "step": 985 + }, + { + "epoch": 1.29, + "learning_rate": 3.2411258102990646e-05, + "logits/chosen": -1.5592520236968994, + "logits/rejected": -1.514370083808899, + "logps/chosen": -200.31686401367188, + "logps/rejected": -272.80242919921875, + "loss": 0.157, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.8379793167114258, + "rewards/margins": 7.916141986846924, + "rewards/rejected": -7.078163146972656, + "step": 986 + }, + { + "epoch": 1.29, + "learning_rate": 3.23770309245488e-05, + "logits/chosen": -1.639445424079895, + "logits/rejected": -1.7446305751800537, + "logps/chosen": -166.3204803466797, + "logps/rejected": -287.1727294921875, + "loss": 0.0799, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.19196566939353943, + "rewards/margins": 9.632253646850586, + "rewards/rejected": -9.82421875, + "step": 987 + }, + { + "epoch": 1.29, + "learning_rate": 3.23427885930528e-05, + "logits/chosen": -1.686637043952942, + "logits/rejected": -1.6766071319580078, + "logps/chosen": -166.44894409179688, + "logps/rejected": -271.5466003417969, + "loss": 0.0127, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21667608618736267, + "rewards/margins": 8.782600402832031, + "rewards/rejected": -8.999276161193848, + "step": 988 + }, + { + "epoch": 1.29, + "learning_rate": 3.230853117883933e-05, + "logits/chosen": -1.733090877532959, + "logits/rejected": -1.7348949909210205, + "logps/chosen": -174.36004638671875, + "logps/rejected": -263.1109313964844, + "loss": 0.0111, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.019744902849197388, + "rewards/margins": 8.265658378601074, + "rewards/rejected": -8.2459135055542, + "step": 989 + }, + { + "epoch": 1.3, + "learning_rate": 3.227425875227605e-05, + "logits/chosen": -1.705898642539978, + "logits/rejected": -1.6789770126342773, + "logps/chosen": -163.54830932617188, + "logps/rejected": -250.91964721679688, + "loss": 0.0963, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3017692565917969, + "rewards/margins": 8.326699256896973, + "rewards/rejected": -8.024930000305176, + "step": 990 + }, + { + "epoch": 1.3, + "learning_rate": 3.223997138376146e-05, + "logits/chosen": -1.7970457077026367, + "logits/rejected": -1.8647854328155518, + "logps/chosen": -165.6417236328125, + "logps/rejected": -269.0453186035156, + "loss": 0.1124, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12270615994930267, + "rewards/margins": 9.013164520263672, + "rewards/rejected": -8.890458106994629, + "step": 991 + }, + { + "epoch": 1.3, + "learning_rate": 3.220566914372477e-05, + "logits/chosen": -1.5819454193115234, + "logits/rejected": -1.594781517982483, + "logps/chosen": -245.6867218017578, + "logps/rejected": -308.18817138671875, + "loss": 0.1232, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.10953688621521, + "rewards/margins": 5.188804626464844, + "rewards/rejected": -7.298341274261475, + "step": 992 + }, + { + "epoch": 1.3, + "learning_rate": 3.2171352102625716e-05, + "logits/chosen": -1.5325591564178467, + "logits/rejected": -1.6081345081329346, + "logps/chosen": -203.92449951171875, + "logps/rejected": -325.21148681640625, + "loss": 0.0911, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.15414515137672424, + "rewards/margins": 8.53257942199707, + "rewards/rejected": -8.378433227539062, + "step": 993 + }, + { + "epoch": 1.3, + "learning_rate": 3.213702033095444e-05, + "logits/chosen": -1.7691090106964111, + "logits/rejected": -1.7098833322525024, + "logps/chosen": -187.43008422851562, + "logps/rejected": -263.68438720703125, + "loss": 0.0524, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04066956788301468, + "rewards/margins": 8.466104507446289, + "rewards/rejected": -8.425435066223145, + "step": 994 + }, + { + "epoch": 1.3, + "learning_rate": 3.210267389923135e-05, + "logits/chosen": -1.6959140300750732, + "logits/rejected": -1.5891695022583008, + "logps/chosen": -178.58628845214844, + "logps/rejected": -301.677978515625, + "loss": 0.0276, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10630902647972107, + "rewards/margins": 10.849169731140137, + "rewards/rejected": -10.74285888671875, + "step": 995 + }, + { + "epoch": 1.3, + "learning_rate": 3.2068312878006955e-05, + "logits/chosen": -1.8529281616210938, + "logits/rejected": -1.8796100616455078, + "logps/chosen": -184.71463012695312, + "logps/rejected": -256.54901123046875, + "loss": 0.1309, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.16027602553367615, + "rewards/margins": 8.256537437438965, + "rewards/rejected": -8.096261978149414, + "step": 996 + }, + { + "epoch": 1.3, + "learning_rate": 3.2033937337861744e-05, + "logits/chosen": -1.5665572881698608, + "logits/rejected": -1.5762784481048584, + "logps/chosen": -153.64761352539062, + "logps/rejected": -258.431396484375, + "loss": 0.0071, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6890445947647095, + "rewards/margins": 9.799956321716309, + "rewards/rejected": -9.110910415649414, + "step": 997 + }, + { + "epoch": 1.31, + "learning_rate": 3.199954734940603e-05, + "logits/chosen": -1.528867244720459, + "logits/rejected": -1.5182762145996094, + "logps/chosen": -205.97349548339844, + "logps/rejected": -284.24505615234375, + "loss": 0.0289, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1888141632080078, + "rewards/margins": 8.177652359008789, + "rewards/rejected": -9.366466522216797, + "step": 998 + }, + { + "epoch": 1.31, + "learning_rate": 3.196514298327979e-05, + "logits/chosen": -1.818434476852417, + "logits/rejected": -1.8412598371505737, + "logps/chosen": -169.71066284179688, + "logps/rejected": -250.16668701171875, + "loss": 0.0535, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2685055732727051, + "rewards/margins": 7.847587585449219, + "rewards/rejected": -8.116093635559082, + "step": 999 + }, + { + "epoch": 1.31, + "learning_rate": 3.193072431015254e-05, + "logits/chosen": -1.796441674232483, + "logits/rejected": -1.8247101306915283, + "logps/chosen": -242.6537628173828, + "logps/rejected": -339.2400817871094, + "loss": 0.1228, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.054071068763733, + "rewards/margins": 8.617059707641602, + "rewards/rejected": -9.671130180358887, + "step": 1000 + }, + { + "epoch": 1.31, + "learning_rate": 3.18962914007232e-05, + "logits/chosen": -1.684266448020935, + "logits/rejected": -1.7495726346969604, + "logps/chosen": -211.7010040283203, + "logps/rejected": -296.0808410644531, + "loss": 0.1383, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3020642101764679, + "rewards/margins": 8.83458137512207, + "rewards/rejected": -9.136645317077637, + "step": 1001 + }, + { + "epoch": 1.31, + "learning_rate": 3.18618443257199e-05, + "logits/chosen": -1.8807414770126343, + "logits/rejected": -1.8555285930633545, + "logps/chosen": -175.353515625, + "logps/rejected": -272.2889099121094, + "loss": 0.0869, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6951203942298889, + "rewards/margins": 8.173113822937012, + "rewards/rejected": -8.868233680725098, + "step": 1002 + }, + { + "epoch": 1.31, + "learning_rate": 3.182738315589991e-05, + "logits/chosen": -1.7039772272109985, + "logits/rejected": -1.6441880464553833, + "logps/chosen": -177.56491088867188, + "logps/rejected": -245.73123168945312, + "loss": 0.1174, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0576106309890747, + "rewards/margins": 7.181726932525635, + "rewards/rejected": -8.239336967468262, + "step": 1003 + }, + { + "epoch": 1.31, + "learning_rate": 3.17929079620494e-05, + "logits/chosen": -1.5224440097808838, + "logits/rejected": -1.5071367025375366, + "logps/chosen": -181.99505615234375, + "logps/rejected": -285.6095275878906, + "loss": 0.0435, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.2493596076965332, + "rewards/margins": 10.302899360656738, + "rewards/rejected": -9.053540229797363, + "step": 1004 + }, + { + "epoch": 1.32, + "learning_rate": 3.17584188149834e-05, + "logits/chosen": -1.80663001537323, + "logits/rejected": -1.781527042388916, + "logps/chosen": -171.4976043701172, + "logps/rejected": -281.3936767578125, + "loss": 0.0287, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9321253299713135, + "rewards/margins": 9.852705001831055, + "rewards/rejected": -8.92057991027832, + "step": 1005 + }, + { + "epoch": 1.32, + "learning_rate": 3.172391578554557e-05, + "logits/chosen": -1.902237057685852, + "logits/rejected": -1.9380607604980469, + "logps/chosen": -141.2318115234375, + "logps/rejected": -248.71295166015625, + "loss": 0.1273, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3797750473022461, + "rewards/margins": 9.218080520629883, + "rewards/rejected": -8.83830451965332, + "step": 1006 + }, + { + "epoch": 1.32, + "learning_rate": 3.1689398944608076e-05, + "logits/chosen": -1.6993157863616943, + "logits/rejected": -1.67729914188385, + "logps/chosen": -183.81637573242188, + "logps/rejected": -275.666748046875, + "loss": 0.013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16810892522335052, + "rewards/margins": 8.843433380126953, + "rewards/rejected": -9.011541366577148, + "step": 1007 + }, + { + "epoch": 1.32, + "learning_rate": 3.1654868363071484e-05, + "logits/chosen": -1.6814528703689575, + "logits/rejected": -1.7419649362564087, + "logps/chosen": -226.9930877685547, + "logps/rejected": -298.8405456542969, + "loss": 0.1793, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7978208661079407, + "rewards/margins": 7.192959785461426, + "rewards/rejected": -7.990780830383301, + "step": 1008 + }, + { + "epoch": 1.32, + "learning_rate": 3.162032411186456e-05, + "logits/chosen": -1.7620617151260376, + "logits/rejected": -1.740466833114624, + "logps/chosen": -160.1719970703125, + "logps/rejected": -216.30531311035156, + "loss": 0.1228, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2583532929420471, + "rewards/margins": 6.348572731018066, + "rewards/rejected": -6.6069254875183105, + "step": 1009 + }, + { + "epoch": 1.32, + "learning_rate": 3.158576626194417e-05, + "logits/chosen": -1.5181941986083984, + "logits/rejected": -1.504223108291626, + "logps/chosen": -205.6512451171875, + "logps/rejected": -273.6099548339844, + "loss": 0.0948, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5751641988754272, + "rewards/margins": 6.030278205871582, + "rewards/rejected": -6.605443000793457, + "step": 1010 + }, + { + "epoch": 1.32, + "learning_rate": 3.15511948842951e-05, + "logits/chosen": -1.77535879611969, + "logits/rejected": -1.7580792903900146, + "logps/chosen": -158.79725646972656, + "logps/rejected": -270.26129150390625, + "loss": 0.0052, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.437267780303955, + "rewards/margins": 10.901632308959961, + "rewards/rejected": -9.464365005493164, + "step": 1011 + }, + { + "epoch": 1.32, + "learning_rate": 3.151661004992992e-05, + "logits/chosen": -1.668846607208252, + "logits/rejected": -1.6981290578842163, + "logps/chosen": -164.50094604492188, + "logps/rejected": -253.15211486816406, + "loss": 0.0528, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6544564366340637, + "rewards/margins": 8.268686294555664, + "rewards/rejected": -7.614229679107666, + "step": 1012 + }, + { + "epoch": 1.33, + "learning_rate": 3.1482011829888836e-05, + "logits/chosen": -1.3924821615219116, + "logits/rejected": -1.4576594829559326, + "logps/chosen": -165.29043579101562, + "logps/rejected": -274.30511474609375, + "loss": 0.0453, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.5792158246040344, + "rewards/margins": 8.832959175109863, + "rewards/rejected": -8.253743171691895, + "step": 1013 + }, + { + "epoch": 1.33, + "learning_rate": 3.1447400295239575e-05, + "logits/chosen": -1.6502984762191772, + "logits/rejected": -1.6198225021362305, + "logps/chosen": -149.57823181152344, + "logps/rejected": -221.24867248535156, + "loss": 0.0958, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2110995054244995, + "rewards/margins": 6.368166923522949, + "rewards/rejected": -6.5792670249938965, + "step": 1014 + }, + { + "epoch": 1.33, + "learning_rate": 3.1412775517077195e-05, + "logits/chosen": -1.593336820602417, + "logits/rejected": -1.5360331535339355, + "logps/chosen": -199.99185180664062, + "logps/rejected": -279.9867248535156, + "loss": 0.1005, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5594850182533264, + "rewards/margins": 8.919145584106445, + "rewards/rejected": -9.478631019592285, + "step": 1015 + }, + { + "epoch": 1.33, + "learning_rate": 3.137813756652395e-05, + "logits/chosen": -1.4276340007781982, + "logits/rejected": -1.39561128616333, + "logps/chosen": -189.6681671142578, + "logps/rejected": -320.6572570800781, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3362197279930115, + "rewards/margins": 11.467063903808594, + "rewards/rejected": -11.130845069885254, + "step": 1016 + }, + { + "epoch": 1.33, + "learning_rate": 3.134348651472917e-05, + "logits/chosen": -1.862006664276123, + "logits/rejected": -1.9267525672912598, + "logps/chosen": -166.74551391601562, + "logps/rejected": -249.555419921875, + "loss": 0.104, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4345891773700714, + "rewards/margins": 7.377204895019531, + "rewards/rejected": -7.811794281005859, + "step": 1017 + }, + { + "epoch": 1.33, + "learning_rate": 3.130882243286908e-05, + "logits/chosen": -1.650821328163147, + "logits/rejected": -1.7058120965957642, + "logps/chosen": -189.9503173828125, + "logps/rejected": -267.7425537109375, + "loss": 0.0781, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.06037762016057968, + "rewards/margins": 7.55076265335083, + "rewards/rejected": -7.611140727996826, + "step": 1018 + }, + { + "epoch": 1.33, + "learning_rate": 3.127414539214668e-05, + "logits/chosen": -1.6594420671463013, + "logits/rejected": -1.6205840110778809, + "logps/chosen": -180.5568084716797, + "logps/rejected": -252.425048828125, + "loss": 0.0599, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2819436192512512, + "rewards/margins": 7.8513970375061035, + "rewards/rejected": -7.569454193115234, + "step": 1019 + }, + { + "epoch": 1.33, + "learning_rate": 3.12394554637916e-05, + "logits/chosen": -1.7752412557601929, + "logits/rejected": -1.8104010820388794, + "logps/chosen": -199.2642364501953, + "logps/rejected": -245.20188903808594, + "loss": 0.0658, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.26455584168434143, + "rewards/margins": 6.335668563842773, + "rewards/rejected": -6.600224494934082, + "step": 1020 + }, + { + "epoch": 1.34, + "learning_rate": 3.12047527190599e-05, + "logits/chosen": -1.6434757709503174, + "logits/rejected": -1.6865171194076538, + "logps/chosen": -180.5344696044922, + "logps/rejected": -257.50091552734375, + "loss": 0.0555, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.011539027094841003, + "rewards/margins": 7.5164055824279785, + "rewards/rejected": -7.527944087982178, + "step": 1021 + }, + { + "epoch": 1.34, + "learning_rate": 3.1170037229234006e-05, + "logits/chosen": -1.841266393661499, + "logits/rejected": -1.8170844316482544, + "logps/chosen": -175.3178253173828, + "logps/rejected": -235.3691864013672, + "loss": 0.1078, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.406562477350235, + "rewards/margins": 6.693796634674072, + "rewards/rejected": -7.100358963012695, + "step": 1022 + }, + { + "epoch": 1.34, + "learning_rate": 3.113530906562252e-05, + "logits/chosen": -1.6263827085494995, + "logits/rejected": -1.6030185222625732, + "logps/chosen": -193.2816619873047, + "logps/rejected": -251.95472717285156, + "loss": 0.0875, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.10303999483585358, + "rewards/margins": 8.109235763549805, + "rewards/rejected": -8.006196975708008, + "step": 1023 + }, + { + "epoch": 1.34, + "learning_rate": 3.110056829956006e-05, + "logits/chosen": -1.8593924045562744, + "logits/rejected": -1.8750531673431396, + "logps/chosen": -157.7016143798828, + "logps/rejected": -260.2314453125, + "loss": 0.0105, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23302239179611206, + "rewards/margins": 8.660212516784668, + "rewards/rejected": -8.427189826965332, + "step": 1024 + }, + { + "epoch": 1.34, + "learning_rate": 3.1065815002407136e-05, + "logits/chosen": -1.728441834449768, + "logits/rejected": -1.7998254299163818, + "logps/chosen": -188.9195098876953, + "logps/rejected": -279.92266845703125, + "loss": 0.0219, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14355725049972534, + "rewards/margins": 9.690081596374512, + "rewards/rejected": -9.546525955200195, + "step": 1025 + }, + { + "epoch": 1.34, + "learning_rate": 3.103104924555e-05, + "logits/chosen": -1.972607135772705, + "logits/rejected": -1.894500970840454, + "logps/chosen": -188.02474975585938, + "logps/rejected": -268.5780029296875, + "loss": 0.0606, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.38073399662971497, + "rewards/margins": 8.988433837890625, + "rewards/rejected": -8.60770034790039, + "step": 1026 + }, + { + "epoch": 1.34, + "learning_rate": 3.099627110040052e-05, + "logits/chosen": -1.3176733255386353, + "logits/rejected": -1.3635295629501343, + "logps/chosen": -158.56094360351562, + "logps/rejected": -283.16607666015625, + "loss": 0.0313, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3060721755027771, + "rewards/margins": 8.611286163330078, + "rewards/rejected": -8.305213928222656, + "step": 1027 + }, + { + "epoch": 1.35, + "learning_rate": 3.096148063839596e-05, + "logits/chosen": -1.8229846954345703, + "logits/rejected": -1.7924790382385254, + "logps/chosen": -150.7916259765625, + "logps/rejected": -198.27381896972656, + "loss": 0.0622, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.280334234237671, + "rewards/margins": 7.269871711730957, + "rewards/rejected": -5.989537715911865, + "step": 1028 + }, + { + "epoch": 1.35, + "learning_rate": 3.0926677930998924e-05, + "logits/chosen": -1.7558845281600952, + "logits/rejected": -1.7652242183685303, + "logps/chosen": -152.77943420410156, + "logps/rejected": -258.5368957519531, + "loss": 0.0589, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9764621257781982, + "rewards/margins": 10.016199111938477, + "rewards/rejected": -9.0397367477417, + "step": 1029 + }, + { + "epoch": 1.35, + "learning_rate": 3.0891863049697165e-05, + "logits/chosen": -1.8508270978927612, + "logits/rejected": -1.7795063257217407, + "logps/chosen": -157.1493682861328, + "logps/rejected": -237.9957733154297, + "loss": 0.0365, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0358507633209229, + "rewards/margins": 9.060894966125488, + "rewards/rejected": -8.025044441223145, + "step": 1030 + }, + { + "epoch": 1.35, + "learning_rate": 3.0857036066003414e-05, + "logits/chosen": -1.9584459066390991, + "logits/rejected": -1.972018837928772, + "logps/chosen": -195.3858642578125, + "logps/rejected": -274.5120544433594, + "loss": 0.0898, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.7697188854217529, + "rewards/margins": 8.89430046081543, + "rewards/rejected": -8.124581336975098, + "step": 1031 + }, + { + "epoch": 1.35, + "learning_rate": 3.08221970514553e-05, + "logits/chosen": -1.7750444412231445, + "logits/rejected": -1.7884280681610107, + "logps/chosen": -187.94300842285156, + "logps/rejected": -300.2756042480469, + "loss": 0.0473, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.9924745559692383, + "rewards/margins": 9.716182708740234, + "rewards/rejected": -8.723708152770996, + "step": 1032 + }, + { + "epoch": 1.35, + "learning_rate": 3.0787346077615155e-05, + "logits/chosen": -1.6590096950531006, + "logits/rejected": -1.7242157459259033, + "logps/chosen": -142.8770294189453, + "logps/rejected": -266.1553955078125, + "loss": 0.0106, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.41273677349090576, + "rewards/margins": 9.018227577209473, + "rewards/rejected": -8.605491638183594, + "step": 1033 + }, + { + "epoch": 1.35, + "learning_rate": 3.0752483216069846e-05, + "logits/chosen": -1.6022425889968872, + "logits/rejected": -1.6180634498596191, + "logps/chosen": -172.05636596679688, + "logps/rejected": -274.3914794921875, + "loss": 0.0542, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.740761399269104, + "rewards/margins": 10.236021995544434, + "rewards/rejected": -9.495260238647461, + "step": 1034 + }, + { + "epoch": 1.35, + "learning_rate": 3.071760853843069e-05, + "logits/chosen": -1.5469131469726562, + "logits/rejected": -1.5504934787750244, + "logps/chosen": -159.5862274169922, + "logps/rejected": -241.58901977539062, + "loss": 0.0294, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.27919813990592957, + "rewards/margins": 8.046732902526855, + "rewards/rejected": -7.767534255981445, + "step": 1035 + }, + { + "epoch": 1.36, + "learning_rate": 3.068272211633326e-05, + "logits/chosen": -1.6643675565719604, + "logits/rejected": -1.7440696954727173, + "logps/chosen": -175.10739135742188, + "logps/rejected": -287.9505920410156, + "loss": 0.0634, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.20883485674858093, + "rewards/margins": 7.869156360626221, + "rewards/rejected": -8.077991485595703, + "step": 1036 + }, + { + "epoch": 1.36, + "learning_rate": 3.0647824021437266e-05, + "logits/chosen": -1.9178826808929443, + "logits/rejected": -1.9084827899932861, + "logps/chosen": -180.52581787109375, + "logps/rejected": -257.9354553222656, + "loss": 0.0473, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.5014915466308594, + "rewards/margins": 7.780052661895752, + "rewards/rejected": -7.278561115264893, + "step": 1037 + }, + { + "epoch": 1.36, + "learning_rate": 3.061291432542639e-05, + "logits/chosen": -1.829077124595642, + "logits/rejected": -1.8546404838562012, + "logps/chosen": -169.27935791015625, + "logps/rejected": -289.7463073730469, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.270003318786621, + "rewards/margins": 10.353739738464355, + "rewards/rejected": -9.083736419677734, + "step": 1038 + }, + { + "epoch": 1.36, + "learning_rate": 3.0577993100008135e-05, + "logits/chosen": -1.5773746967315674, + "logits/rejected": -1.6665159463882446, + "logps/chosen": -181.35531616210938, + "logps/rejected": -295.3477783203125, + "loss": 0.0197, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5459751486778259, + "rewards/margins": 8.71547794342041, + "rewards/rejected": -9.261452674865723, + "step": 1039 + }, + { + "epoch": 1.36, + "learning_rate": 3.0543060416913696e-05, + "logits/chosen": -1.5834990739822388, + "logits/rejected": -1.6100273132324219, + "logps/chosen": -168.49917602539062, + "logps/rejected": -242.64942932128906, + "loss": 0.0178, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7964572310447693, + "rewards/margins": 8.272777557373047, + "rewards/rejected": -7.476320266723633, + "step": 1040 + }, + { + "epoch": 1.36, + "learning_rate": 3.050811634789779e-05, + "logits/chosen": -1.7382659912109375, + "logits/rejected": -1.7036820650100708, + "logps/chosen": -192.46153259277344, + "logps/rejected": -276.86077880859375, + "loss": 0.0444, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10978998243808746, + "rewards/margins": 9.028346061706543, + "rewards/rejected": -8.918556213378906, + "step": 1041 + }, + { + "epoch": 1.36, + "learning_rate": 3.0473160964738555e-05, + "logits/chosen": -1.3314677476882935, + "logits/rejected": -1.3481682538986206, + "logps/chosen": -165.89361572265625, + "logps/rejected": -250.7019805908203, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.41093093156814575, + "rewards/margins": 8.487051010131836, + "rewards/rejected": -8.076119422912598, + "step": 1042 + }, + { + "epoch": 1.36, + "learning_rate": 3.0438194339237325e-05, + "logits/chosen": -1.4145452976226807, + "logits/rejected": -1.3113701343536377, + "logps/chosen": -232.9272918701172, + "logps/rejected": -308.4940185546875, + "loss": 0.096, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.1446681022644043, + "rewards/margins": 7.0652899742126465, + "rewards/rejected": -6.920622825622559, + "step": 1043 + }, + { + "epoch": 1.37, + "learning_rate": 3.0403216543218547e-05, + "logits/chosen": -1.5912857055664062, + "logits/rejected": -1.7396855354309082, + "logps/chosen": -191.90408325195312, + "logps/rejected": -265.6307373046875, + "loss": 0.0507, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.9149222373962402, + "rewards/margins": 8.989510536193848, + "rewards/rejected": -8.07458782196045, + "step": 1044 + }, + { + "epoch": 1.37, + "learning_rate": 3.036822764852963e-05, + "logits/chosen": -1.6572009325027466, + "logits/rejected": -1.6666812896728516, + "logps/chosen": -161.16978454589844, + "logps/rejected": -202.1544952392578, + "loss": 0.0713, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5580483675003052, + "rewards/margins": 5.284550189971924, + "rewards/rejected": -5.842597961425781, + "step": 1045 + }, + { + "epoch": 1.37, + "learning_rate": 3.0333227727040742e-05, + "logits/chosen": -1.4827758073806763, + "logits/rejected": -1.465570330619812, + "logps/chosen": -221.41485595703125, + "logps/rejected": -278.4585876464844, + "loss": 0.0972, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10512249171733856, + "rewards/margins": 7.118978500366211, + "rewards/rejected": -7.2241010665893555, + "step": 1046 + }, + { + "epoch": 1.37, + "learning_rate": 3.029821685064475e-05, + "logits/chosen": -1.570320963859558, + "logits/rejected": -1.5827525854110718, + "logps/chosen": -173.6188201904297, + "logps/rejected": -334.1131591796875, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5533016920089722, + "rewards/margins": 13.774866104125977, + "rewards/rejected": -12.221563339233398, + "step": 1047 + }, + { + "epoch": 1.37, + "learning_rate": 3.026319509125697e-05, + "logits/chosen": -1.461116075515747, + "logits/rejected": -1.4795494079589844, + "logps/chosen": -158.5972442626953, + "logps/rejected": -249.91854858398438, + "loss": 0.076, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8533519506454468, + "rewards/margins": 7.1458516120910645, + "rewards/rejected": -7.999203205108643, + "step": 1048 + }, + { + "epoch": 1.37, + "learning_rate": 3.0228162520815117e-05, + "logits/chosen": -1.517564058303833, + "logits/rejected": -1.526706337928772, + "logps/chosen": -181.30294799804688, + "logps/rejected": -283.81610107421875, + "loss": 0.0468, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.1314409077167511, + "rewards/margins": 9.630779266357422, + "rewards/rejected": -9.762221336364746, + "step": 1049 + }, + { + "epoch": 1.37, + "learning_rate": 3.0193119211279097e-05, + "logits/chosen": -1.917505145072937, + "logits/rejected": -1.925102949142456, + "logps/chosen": -170.56272888183594, + "logps/rejected": -230.60565185546875, + "loss": 0.1049, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.02511174976825714, + "rewards/margins": 7.496645927429199, + "rewards/rejected": -7.521758079528809, + "step": 1050 + }, + { + "epoch": 1.38, + "learning_rate": 3.015806523463085e-05, + "logits/chosen": -1.8236424922943115, + "logits/rejected": -1.8360267877578735, + "logps/chosen": -181.96212768554688, + "logps/rejected": -283.1656188964844, + "loss": 0.0232, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3086252510547638, + "rewards/margins": 9.338162422180176, + "rewards/rejected": -9.029537200927734, + "step": 1051 + }, + { + "epoch": 1.38, + "learning_rate": 3.0123000662874272e-05, + "logits/chosen": -1.5274633169174194, + "logits/rejected": -1.5471253395080566, + "logps/chosen": -177.43600463867188, + "logps/rejected": -239.90560913085938, + "loss": 0.0881, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3131393492221832, + "rewards/margins": 7.149758338928223, + "rewards/rejected": -7.462898254394531, + "step": 1052 + }, + { + "epoch": 1.38, + "learning_rate": 3.0087925568034998e-05, + "logits/chosen": -1.6211353540420532, + "logits/rejected": -1.7500112056732178, + "logps/chosen": -147.48223876953125, + "logps/rejected": -247.85064697265625, + "loss": 0.0147, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.655480146408081, + "rewards/margins": 8.437618255615234, + "rewards/rejected": -7.782138824462891, + "step": 1053 + }, + { + "epoch": 1.38, + "learning_rate": 3.0052840022160273e-05, + "logits/chosen": -1.901058316230774, + "logits/rejected": -1.924602746963501, + "logps/chosen": -204.58761596679688, + "logps/rejected": -280.8456726074219, + "loss": 0.1446, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.21391651034355164, + "rewards/margins": 8.089306831359863, + "rewards/rejected": -7.875391006469727, + "step": 1054 + }, + { + "epoch": 1.38, + "learning_rate": 3.0017744097318823e-05, + "logits/chosen": -1.763657569885254, + "logits/rejected": -1.7600390911102295, + "logps/chosen": -168.9661865234375, + "logps/rejected": -238.08570861816406, + "loss": 0.0902, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5401232242584229, + "rewards/margins": 7.412095069885254, + "rewards/rejected": -7.952218532562256, + "step": 1055 + }, + { + "epoch": 1.38, + "learning_rate": 2.9982637865600683e-05, + "logits/chosen": -1.8518264293670654, + "logits/rejected": -1.8301531076431274, + "logps/chosen": -157.35792541503906, + "logps/rejected": -241.36534118652344, + "loss": 0.0872, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.6445915699005127, + "rewards/margins": 9.140910148620605, + "rewards/rejected": -8.496318817138672, + "step": 1056 + }, + { + "epoch": 1.38, + "learning_rate": 2.994752139911706e-05, + "logits/chosen": -1.6274082660675049, + "logits/rejected": -1.7299463748931885, + "logps/chosen": -168.86997985839844, + "logps/rejected": -269.8592224121094, + "loss": 0.0113, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.48749327659606934, + "rewards/margins": 8.431032180786133, + "rewards/rejected": -8.918526649475098, + "step": 1057 + }, + { + "epoch": 1.38, + "learning_rate": 2.991239477000021e-05, + "logits/chosen": -1.4427497386932373, + "logits/rejected": -1.4453539848327637, + "logps/chosen": -163.6460418701172, + "logps/rejected": -270.0966491699219, + "loss": 0.0522, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.04818262159824371, + "rewards/margins": 9.032136917114258, + "rewards/rejected": -8.983954429626465, + "step": 1058 + }, + { + "epoch": 1.39, + "learning_rate": 2.9877258050403212e-05, + "logits/chosen": -1.564314365386963, + "logits/rejected": -1.5277494192123413, + "logps/chosen": -168.06900024414062, + "logps/rejected": -230.80673217773438, + "loss": 0.0457, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3408430814743042, + "rewards/margins": 8.206949234008789, + "rewards/rejected": -7.866106033325195, + "step": 1059 + }, + { + "epoch": 1.39, + "learning_rate": 2.9842111312499914e-05, + "logits/chosen": -1.7981947660446167, + "logits/rejected": -1.7717591524124146, + "logps/chosen": -167.16009521484375, + "logps/rejected": -239.8763885498047, + "loss": 0.1224, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.056708812713623, + "rewards/margins": 8.697671890258789, + "rewards/rejected": -7.640964031219482, + "step": 1060 + }, + { + "epoch": 1.39, + "learning_rate": 2.9806954628484734e-05, + "logits/chosen": -1.7545130252838135, + "logits/rejected": -1.811667561531067, + "logps/chosen": -185.0836181640625, + "logps/rejected": -286.8186340332031, + "loss": 0.0915, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.26118355989456177, + "rewards/margins": 9.414217948913574, + "rewards/rejected": -9.153034210205078, + "step": 1061 + }, + { + "epoch": 1.39, + "learning_rate": 2.9771788070572514e-05, + "logits/chosen": -1.7203443050384521, + "logits/rejected": -1.8027807474136353, + "logps/chosen": -169.2890167236328, + "logps/rejected": -273.4495544433594, + "loss": 0.0428, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4334350526332855, + "rewards/margins": 9.510089874267578, + "rewards/rejected": -9.076654434204102, + "step": 1062 + }, + { + "epoch": 1.39, + "learning_rate": 2.9736611710998368e-05, + "logits/chosen": -1.726889967918396, + "logits/rejected": -1.6837644577026367, + "logps/chosen": -214.25038146972656, + "logps/rejected": -328.2698974609375, + "loss": 0.0893, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2967264950275421, + "rewards/margins": 10.300281524658203, + "rewards/rejected": -10.003554344177246, + "step": 1063 + }, + { + "epoch": 1.39, + "learning_rate": 2.9701425622017583e-05, + "logits/chosen": -1.6448372602462769, + "logits/rejected": -1.6740950345993042, + "logps/chosen": -197.98626708984375, + "logps/rejected": -324.76202392578125, + "loss": 0.0537, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06527144461870193, + "rewards/margins": 9.979601860046387, + "rewards/rejected": -10.04487419128418, + "step": 1064 + }, + { + "epoch": 1.39, + "learning_rate": 2.9666229875905373e-05, + "logits/chosen": -1.5349295139312744, + "logits/rejected": -1.5960193872451782, + "logps/chosen": -159.6785888671875, + "logps/rejected": -268.4906921386719, + "loss": 0.046, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.5456817150115967, + "rewards/margins": 10.050601959228516, + "rewards/rejected": -9.504919052124023, + "step": 1065 + }, + { + "epoch": 1.4, + "learning_rate": 2.963102454495683e-05, + "logits/chosen": -1.5063464641571045, + "logits/rejected": -1.5059911012649536, + "logps/chosen": -196.3102569580078, + "logps/rejected": -278.2349853515625, + "loss": 0.0973, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4087756872177124, + "rewards/margins": 8.119946479797363, + "rewards/rejected": -8.528722763061523, + "step": 1066 + }, + { + "epoch": 1.4, + "learning_rate": 2.959580970148673e-05, + "logits/chosen": -1.7497241497039795, + "logits/rejected": -1.7890876531600952, + "logps/chosen": -180.35324096679688, + "logps/rejected": -269.443359375, + "loss": 0.1012, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3109995722770691, + "rewards/margins": 8.98426628112793, + "rewards/rejected": -8.673266410827637, + "step": 1067 + }, + { + "epoch": 1.4, + "learning_rate": 2.9560585417829368e-05, + "logits/chosen": -1.592320203781128, + "logits/rejected": -1.6150949001312256, + "logps/chosen": -159.53489685058594, + "logps/rejected": -227.51358032226562, + "loss": 0.0654, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.12761837244033813, + "rewards/margins": 7.307836532592773, + "rewards/rejected": -7.180217742919922, + "step": 1068 + }, + { + "epoch": 1.4, + "learning_rate": 2.952535176633846e-05, + "logits/chosen": -1.6989355087280273, + "logits/rejected": -1.693108320236206, + "logps/chosen": -170.16575622558594, + "logps/rejected": -260.4944152832031, + "loss": 0.0703, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6813508868217468, + "rewards/margins": 8.625094413757324, + "rewards/rejected": -7.943744659423828, + "step": 1069 + }, + { + "epoch": 1.4, + "learning_rate": 2.9490108819386936e-05, + "logits/chosen": -1.644622564315796, + "logits/rejected": -1.6186461448669434, + "logps/chosen": -184.5675811767578, + "logps/rejected": -237.4679718017578, + "loss": 0.0262, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1123928651213646, + "rewards/margins": 7.406467914581299, + "rewards/rejected": -7.294075012207031, + "step": 1070 + }, + { + "epoch": 1.4, + "learning_rate": 2.945485664936683e-05, + "logits/chosen": -1.655521035194397, + "logits/rejected": -1.5490621328353882, + "logps/chosen": -187.23654174804688, + "logps/rejected": -271.51055908203125, + "loss": 0.0607, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7184284329414368, + "rewards/margins": 7.089148998260498, + "rewards/rejected": -7.807576656341553, + "step": 1071 + }, + { + "epoch": 1.4, + "learning_rate": 2.9419595328689138e-05, + "logits/chosen": -1.5867788791656494, + "logits/rejected": -1.6296215057373047, + "logps/chosen": -213.49026489257812, + "logps/rejected": -318.98193359375, + "loss": 0.0598, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.10515981912612915, + "rewards/margins": 9.39758014678955, + "rewards/rejected": -9.29241943359375, + "step": 1072 + }, + { + "epoch": 1.4, + "learning_rate": 2.938432492978361e-05, + "logits/chosen": -1.8805725574493408, + "logits/rejected": -1.9039075374603271, + "logps/chosen": -184.86695861816406, + "logps/rejected": -241.6600341796875, + "loss": 0.0818, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4736900329589844, + "rewards/margins": 5.245334625244141, + "rewards/rejected": -5.719025135040283, + "step": 1073 + }, + { + "epoch": 1.41, + "learning_rate": 2.9349045525098688e-05, + "logits/chosen": -1.7326735258102417, + "logits/rejected": -1.7649610042572021, + "logps/chosen": -181.72320556640625, + "logps/rejected": -283.0134582519531, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7515249252319336, + "rewards/margins": 10.077593803405762, + "rewards/rejected": -9.326070785522461, + "step": 1074 + }, + { + "epoch": 1.41, + "learning_rate": 2.9313757187101297e-05, + "logits/chosen": -1.638093113899231, + "logits/rejected": -1.6165810823440552, + "logps/chosen": -156.86587524414062, + "logps/rejected": -238.19873046875, + "loss": 0.1166, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03780980408191681, + "rewards/margins": 7.915657997131348, + "rewards/rejected": -7.953467845916748, + "step": 1075 + }, + { + "epoch": 1.41, + "learning_rate": 2.9278459988276703e-05, + "logits/chosen": -1.743675708770752, + "logits/rejected": -1.7930182218551636, + "logps/chosen": -168.07261657714844, + "logps/rejected": -243.85671997070312, + "loss": 0.0786, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.050581879913806915, + "rewards/margins": 7.069265842437744, + "rewards/rejected": -7.018683910369873, + "step": 1076 + }, + { + "epoch": 1.41, + "learning_rate": 2.9243154001128386e-05, + "logits/chosen": -1.8438432216644287, + "logits/rejected": -1.8247106075286865, + "logps/chosen": -165.50645446777344, + "logps/rejected": -235.93930053710938, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.35804885625839233, + "rewards/margins": 8.000460624694824, + "rewards/rejected": -7.642411708831787, + "step": 1077 + }, + { + "epoch": 1.41, + "learning_rate": 2.920783929817786e-05, + "logits/chosen": -1.7801151275634766, + "logits/rejected": -1.8452836275100708, + "logps/chosen": -139.43931579589844, + "logps/rejected": -220.37332153320312, + "loss": 0.1464, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.27093997597694397, + "rewards/margins": 7.245481014251709, + "rewards/rejected": -6.974541187286377, + "step": 1078 + }, + { + "epoch": 1.41, + "learning_rate": 2.9172515951964558e-05, + "logits/chosen": -1.446393609046936, + "logits/rejected": -1.4515058994293213, + "logps/chosen": -153.79742431640625, + "logps/rejected": -233.67364501953125, + "loss": 0.0824, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3887996673583984, + "rewards/margins": 6.059497833251953, + "rewards/rejected": -7.44829797744751, + "step": 1079 + }, + { + "epoch": 1.41, + "learning_rate": 2.913718403504567e-05, + "logits/chosen": -1.4622185230255127, + "logits/rejected": -1.4909744262695312, + "logps/chosen": -194.57229614257812, + "logps/rejected": -242.60299682617188, + "loss": 0.0603, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5127065181732178, + "rewards/margins": 8.00437068939209, + "rewards/rejected": -8.517077445983887, + "step": 1080 + }, + { + "epoch": 1.41, + "learning_rate": 2.9101843619995968e-05, + "logits/chosen": -1.689691424369812, + "logits/rejected": -1.682550311088562, + "logps/chosen": -180.0986328125, + "logps/rejected": -259.7098388671875, + "loss": 0.0133, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9263864755630493, + "rewards/margins": 9.332463264465332, + "rewards/rejected": -8.406075477600098, + "step": 1081 + }, + { + "epoch": 1.42, + "learning_rate": 2.906649477940771e-05, + "logits/chosen": -1.836404800415039, + "logits/rejected": -1.8635551929473877, + "logps/chosen": -166.10693359375, + "logps/rejected": -269.972900390625, + "loss": 0.1075, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15306350588798523, + "rewards/margins": 8.478963851928711, + "rewards/rejected": -8.632026672363281, + "step": 1082 + }, + { + "epoch": 1.42, + "learning_rate": 2.9031137585890445e-05, + "logits/chosen": -1.7309619188308716, + "logits/rejected": -1.735274314880371, + "logps/chosen": -165.82891845703125, + "logps/rejected": -237.98660278320312, + "loss": 0.0142, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.034985065460205, + "rewards/margins": 6.53951358795166, + "rewards/rejected": -7.574498653411865, + "step": 1083 + }, + { + "epoch": 1.42, + "learning_rate": 2.899577211207087e-05, + "logits/chosen": -1.708977460861206, + "logits/rejected": -1.749969244003296, + "logps/chosen": -185.85816955566406, + "logps/rejected": -239.14401245117188, + "loss": 0.1959, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.5559318661689758, + "rewards/margins": 7.477483749389648, + "rewards/rejected": -6.921550750732422, + "step": 1084 + }, + { + "epoch": 1.42, + "learning_rate": 2.89603984305927e-05, + "logits/chosen": -1.8304327726364136, + "logits/rejected": -1.8747177124023438, + "logps/chosen": -187.71591186523438, + "logps/rejected": -283.69232177734375, + "loss": 0.0446, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3536360263824463, + "rewards/margins": 9.561899185180664, + "rewards/rejected": -9.20826244354248, + "step": 1085 + }, + { + "epoch": 1.42, + "learning_rate": 2.8925016614116534e-05, + "logits/chosen": -1.8107715845108032, + "logits/rejected": -1.8216731548309326, + "logps/chosen": -178.56851196289062, + "logps/rejected": -252.02215576171875, + "loss": 0.1307, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.04138310253620148, + "rewards/margins": 8.433449745178223, + "rewards/rejected": -8.39206600189209, + "step": 1086 + }, + { + "epoch": 1.42, + "learning_rate": 2.8889626735319635e-05, + "logits/chosen": -1.791649341583252, + "logits/rejected": -1.8210809230804443, + "logps/chosen": -178.6534423828125, + "logps/rejected": -303.63690185546875, + "loss": 0.0742, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.030027128756046295, + "rewards/margins": 10.293094635009766, + "rewards/rejected": -10.323122024536133, + "step": 1087 + }, + { + "epoch": 1.42, + "learning_rate": 2.8854228866895855e-05, + "logits/chosen": -1.5863910913467407, + "logits/rejected": -1.6101830005645752, + "logps/chosen": -219.22608947753906, + "logps/rejected": -318.7516784667969, + "loss": 0.0977, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1301908791065216, + "rewards/margins": 8.287530899047852, + "rewards/rejected": -8.417722702026367, + "step": 1088 + }, + { + "epoch": 1.43, + "learning_rate": 2.8818823081555445e-05, + "logits/chosen": -1.2189265489578247, + "logits/rejected": -1.2112969160079956, + "logps/chosen": -155.92262268066406, + "logps/rejected": -233.3964080810547, + "loss": 0.0947, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3505321145057678, + "rewards/margins": 6.995457172393799, + "rewards/rejected": -7.345990180969238, + "step": 1089 + }, + { + "epoch": 1.43, + "learning_rate": 2.8783409452024934e-05, + "logits/chosen": -1.7439119815826416, + "logits/rejected": -1.7457126379013062, + "logps/chosen": -219.94570922851562, + "logps/rejected": -282.0072326660156, + "loss": 0.1396, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1430746614933014, + "rewards/margins": 6.051929473876953, + "rewards/rejected": -5.9088544845581055, + "step": 1090 + }, + { + "epoch": 1.43, + "learning_rate": 2.874798805104696e-05, + "logits/chosen": -1.6577132940292358, + "logits/rejected": -1.620776891708374, + "logps/chosen": -197.17550659179688, + "logps/rejected": -276.7876281738281, + "loss": 0.0092, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06978775560855865, + "rewards/margins": 8.667924880981445, + "rewards/rejected": -8.737712860107422, + "step": 1091 + }, + { + "epoch": 1.43, + "learning_rate": 2.8712558951380097e-05, + "logits/chosen": -1.7607347965240479, + "logits/rejected": -1.8015058040618896, + "logps/chosen": -179.78219604492188, + "logps/rejected": -287.1150817871094, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015934646129608154, + "rewards/margins": 9.509757041931152, + "rewards/rejected": -9.525691032409668, + "step": 1092 + }, + { + "epoch": 1.43, + "learning_rate": 2.867712222579877e-05, + "logits/chosen": -1.2007501125335693, + "logits/rejected": -1.181114673614502, + "logps/chosen": -192.53250122070312, + "logps/rejected": -345.1432800292969, + "loss": 0.0439, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.1310138702392578, + "rewards/margins": 12.169549942016602, + "rewards/rejected": -11.038536071777344, + "step": 1093 + }, + { + "epoch": 1.43, + "learning_rate": 2.864167794709305e-05, + "logits/chosen": -1.8039956092834473, + "logits/rejected": -1.802217721939087, + "logps/chosen": -179.5605926513672, + "logps/rejected": -277.24688720703125, + "loss": 0.0161, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5370839238166809, + "rewards/margins": 9.079758644104004, + "rewards/rejected": -9.616842269897461, + "step": 1094 + }, + { + "epoch": 1.43, + "learning_rate": 2.860622618806852e-05, + "logits/chosen": -1.6770459413528442, + "logits/rejected": -1.6736505031585693, + "logps/chosen": -168.60211181640625, + "logps/rejected": -275.28912353515625, + "loss": 0.0277, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6077154874801636, + "rewards/margins": 9.361126899719238, + "rewards/rejected": -8.753411293029785, + "step": 1095 + }, + { + "epoch": 1.43, + "learning_rate": 2.857076702154614e-05, + "logits/chosen": -1.7024035453796387, + "logits/rejected": -1.7034558057785034, + "logps/chosen": -172.82936096191406, + "logps/rejected": -245.3527069091797, + "loss": 0.0306, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4698975086212158, + "rewards/margins": 8.477577209472656, + "rewards/rejected": -8.00767993927002, + "step": 1096 + }, + { + "epoch": 1.44, + "learning_rate": 2.8535300520362075e-05, + "logits/chosen": -1.7404273748397827, + "logits/rejected": -1.7562859058380127, + "logps/chosen": -147.20562744140625, + "logps/rejected": -245.9154815673828, + "loss": 0.0526, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.6291453242301941, + "rewards/margins": 8.902300834655762, + "rewards/rejected": -8.273155212402344, + "step": 1097 + }, + { + "epoch": 1.44, + "learning_rate": 2.849982675736756e-05, + "logits/chosen": -1.652204990386963, + "logits/rejected": -1.7528135776519775, + "logps/chosen": -202.58811950683594, + "logps/rejected": -262.0469970703125, + "loss": 0.1477, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6120052337646484, + "rewards/margins": 5.328189849853516, + "rewards/rejected": -5.940195083618164, + "step": 1098 + }, + { + "epoch": 1.44, + "learning_rate": 2.8464345805428753e-05, + "logits/chosen": -1.6539255380630493, + "logits/rejected": -1.7018496990203857, + "logps/chosen": -155.0974884033203, + "logps/rejected": -230.4415283203125, + "loss": 0.192, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.47378963232040405, + "rewards/margins": 6.784183025360107, + "rewards/rejected": -7.2579731941223145, + "step": 1099 + }, + { + "epoch": 1.44, + "learning_rate": 2.8428857737426556e-05, + "logits/chosen": -1.6058200597763062, + "logits/rejected": -1.6428849697113037, + "logps/chosen": -159.9298858642578, + "logps/rejected": -204.16192626953125, + "loss": 0.2319, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06437686085700989, + "rewards/margins": 6.440631866455078, + "rewards/rejected": -6.505008697509766, + "step": 1100 + }, + { + "epoch": 1.44, + "learning_rate": 2.839336262625652e-05, + "logits/chosen": -1.8214541673660278, + "logits/rejected": -1.8728939294815063, + "logps/chosen": -202.13702392578125, + "logps/rejected": -292.6018371582031, + "loss": 0.1447, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6903783082962036, + "rewards/margins": 7.224602222442627, + "rewards/rejected": -7.914981365203857, + "step": 1101 + }, + { + "epoch": 1.44, + "learning_rate": 2.835786054482864e-05, + "logits/chosen": -1.7908833026885986, + "logits/rejected": -1.7908052206039429, + "logps/chosen": -174.4796142578125, + "logps/rejected": -295.7412109375, + "loss": 0.0507, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6345799565315247, + "rewards/margins": 10.110870361328125, + "rewards/rejected": -9.476289749145508, + "step": 1102 + }, + { + "epoch": 1.44, + "learning_rate": 2.832235156606724e-05, + "logits/chosen": -1.56364107131958, + "logits/rejected": -1.635093331336975, + "logps/chosen": -152.8809356689453, + "logps/rejected": -243.0329132080078, + "loss": 0.0538, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.18794478476047516, + "rewards/margins": 7.540948390960693, + "rewards/rejected": -7.353002548217773, + "step": 1103 + }, + { + "epoch": 1.44, + "learning_rate": 2.8286835762910803e-05, + "logits/chosen": -1.6258413791656494, + "logits/rejected": -1.6799871921539307, + "logps/chosen": -151.55735778808594, + "logps/rejected": -219.32901000976562, + "loss": 0.0885, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.1063820123672485, + "rewards/margins": 7.075350284576416, + "rewards/rejected": -5.968967437744141, + "step": 1104 + }, + { + "epoch": 1.45, + "learning_rate": 2.8251313208311837e-05, + "logits/chosen": -1.747852087020874, + "logits/rejected": -1.824681043624878, + "logps/chosen": -146.47882080078125, + "logps/rejected": -251.85317993164062, + "loss": 0.0698, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.4700224697589874, + "rewards/margins": 8.218472480773926, + "rewards/rejected": -7.74845027923584, + "step": 1105 + }, + { + "epoch": 1.45, + "learning_rate": 2.8215783975236715e-05, + "logits/chosen": -1.8190606832504272, + "logits/rejected": -1.8369245529174805, + "logps/chosen": -168.99636840820312, + "logps/rejected": -246.48812866210938, + "loss": 0.0081, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.49010467529296875, + "rewards/margins": 8.350204467773438, + "rewards/rejected": -7.860099792480469, + "step": 1106 + }, + { + "epoch": 1.45, + "learning_rate": 2.8180248136665527e-05, + "logits/chosen": -1.6487927436828613, + "logits/rejected": -1.6826856136322021, + "logps/chosen": -163.9696807861328, + "logps/rejected": -280.2496643066406, + "loss": 0.0881, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2544436454772949, + "rewards/margins": 8.320480346679688, + "rewards/rejected": -8.574923515319824, + "step": 1107 + }, + { + "epoch": 1.45, + "learning_rate": 2.8144705765591938e-05, + "logits/chosen": -1.5302926301956177, + "logits/rejected": -1.5753281116485596, + "logps/chosen": -192.76434326171875, + "logps/rejected": -338.2936096191406, + "loss": 0.0523, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.11479449272155762, + "rewards/margins": 11.131499290466309, + "rewards/rejected": -11.016705513000488, + "step": 1108 + }, + { + "epoch": 1.45, + "learning_rate": 2.810915693502302e-05, + "logits/chosen": -1.7479517459869385, + "logits/rejected": -1.738733172416687, + "logps/chosen": -235.47677612304688, + "logps/rejected": -314.4061279296875, + "loss": 0.1167, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.011833474040031433, + "rewards/margins": 8.613738059997559, + "rewards/rejected": -8.625571250915527, + "step": 1109 + }, + { + "epoch": 1.45, + "learning_rate": 2.807360171797912e-05, + "logits/chosen": -1.8737869262695312, + "logits/rejected": -1.874306082725525, + "logps/chosen": -160.55758666992188, + "logps/rejected": -259.0960998535156, + "loss": 0.0973, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.5080325603485107, + "rewards/margins": 9.35029411315918, + "rewards/rejected": -8.842262268066406, + "step": 1110 + }, + { + "epoch": 1.45, + "learning_rate": 2.803804018749371e-05, + "logits/chosen": -1.6433359384536743, + "logits/rejected": -1.535301685333252, + "logps/chosen": -164.74240112304688, + "logps/rejected": -251.42335510253906, + "loss": 0.0615, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.35988637804985046, + "rewards/margins": 7.138075828552246, + "rewards/rejected": -7.49796199798584, + "step": 1111 + }, + { + "epoch": 1.46, + "learning_rate": 2.800247241661321e-05, + "logits/chosen": -1.408790111541748, + "logits/rejected": -1.4200383424758911, + "logps/chosen": -167.5392303466797, + "logps/rejected": -274.0494079589844, + "loss": 0.0532, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.16386836767196655, + "rewards/margins": 9.678466796875, + "rewards/rejected": -9.51459789276123, + "step": 1112 + }, + { + "epoch": 1.46, + "learning_rate": 2.796689847839689e-05, + "logits/chosen": -1.5821647644042969, + "logits/rejected": -1.654584527015686, + "logps/chosen": -177.93264770507812, + "logps/rejected": -287.0840759277344, + "loss": 0.0264, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.016507327556610107, + "rewards/margins": 9.057121276855469, + "rewards/rejected": -9.040614128112793, + "step": 1113 + }, + { + "epoch": 1.46, + "learning_rate": 2.793131844591666e-05, + "logits/chosen": -1.8002557754516602, + "logits/rejected": -1.8141741752624512, + "logps/chosen": -159.45155334472656, + "logps/rejected": -270.21435546875, + "loss": 0.0575, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.15638035535812378, + "rewards/margins": 9.859724998474121, + "rewards/rejected": -9.703344345092773, + "step": 1114 + }, + { + "epoch": 1.46, + "learning_rate": 2.7895732392256952e-05, + "logits/chosen": -1.6191045045852661, + "logits/rejected": -1.627183198928833, + "logps/chosen": -165.16212463378906, + "logps/rejected": -244.70327758789062, + "loss": 0.0668, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5004497766494751, + "rewards/margins": 7.141720771789551, + "rewards/rejected": -7.642170429229736, + "step": 1115 + }, + { + "epoch": 1.46, + "learning_rate": 2.7860140390514583e-05, + "logits/chosen": -1.5408159494400024, + "logits/rejected": -1.5526851415634155, + "logps/chosen": -202.81900024414062, + "logps/rejected": -301.04083251953125, + "loss": 0.1546, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4465533494949341, + "rewards/margins": 6.982842445373535, + "rewards/rejected": -7.42939567565918, + "step": 1116 + }, + { + "epoch": 1.46, + "learning_rate": 2.7824542513798567e-05, + "logits/chosen": -1.8387327194213867, + "logits/rejected": -1.8972498178482056, + "logps/chosen": -169.3407440185547, + "logps/rejected": -262.0710144042969, + "loss": 0.0503, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.22446361184120178, + "rewards/margins": 8.968152046203613, + "rewards/rejected": -8.743688583374023, + "step": 1117 + }, + { + "epoch": 1.46, + "learning_rate": 2.7788938835230005e-05, + "logits/chosen": -1.4195456504821777, + "logits/rejected": -1.4532251358032227, + "logps/chosen": -227.37469482421875, + "logps/rejected": -287.0699157714844, + "loss": 0.1431, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.44098955392837524, + "rewards/margins": 6.198091506958008, + "rewards/rejected": -6.639081001281738, + "step": 1118 + }, + { + "epoch": 1.46, + "learning_rate": 2.77533294279419e-05, + "logits/chosen": -1.3463804721832275, + "logits/rejected": -1.362419843673706, + "logps/chosen": -161.67636108398438, + "logps/rejected": -243.70254516601562, + "loss": 0.074, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.30078402161598206, + "rewards/margins": 7.2766900062561035, + "rewards/rejected": -6.975905895233154, + "step": 1119 + }, + { + "epoch": 1.47, + "learning_rate": 2.771771436507903e-05, + "logits/chosen": -1.674617052078247, + "logits/rejected": -1.7133612632751465, + "logps/chosen": -160.76336669921875, + "logps/rejected": -265.93377685546875, + "loss": 0.0386, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.773129940032959, + "rewards/margins": 9.26294231414795, + "rewards/rejected": -8.489812850952148, + "step": 1120 + }, + { + "epoch": 1.47, + "learning_rate": 2.7682093719797792e-05, + "logits/chosen": -1.4538536071777344, + "logits/rejected": -1.4257911443710327, + "logps/chosen": -160.9686737060547, + "logps/rejected": -260.2874450683594, + "loss": 0.1115, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3260264992713928, + "rewards/margins": 8.452909469604492, + "rewards/rejected": -8.778935432434082, + "step": 1121 + }, + { + "epoch": 1.47, + "learning_rate": 2.764646756526603e-05, + "logits/chosen": -1.9220608472824097, + "logits/rejected": -1.9210290908813477, + "logps/chosen": -169.73904418945312, + "logps/rejected": -250.19448852539062, + "loss": 0.1191, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4911082088947296, + "rewards/margins": 7.829379081726074, + "rewards/rejected": -7.338270664215088, + "step": 1122 + }, + { + "epoch": 1.47, + "learning_rate": 2.7610835974662942e-05, + "logits/chosen": -1.6652580499649048, + "logits/rejected": -1.6305097341537476, + "logps/chosen": -158.3380126953125, + "logps/rejected": -239.27713012695312, + "loss": 0.0944, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7113561630249023, + "rewards/margins": 8.961684226989746, + "rewards/rejected": -8.25032901763916, + "step": 1123 + }, + { + "epoch": 1.47, + "learning_rate": 2.757519902117886e-05, + "logits/chosen": -1.693634033203125, + "logits/rejected": -1.7926090955734253, + "logps/chosen": -160.93408203125, + "logps/rejected": -305.1036682128906, + "loss": 0.0081, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11803394556045532, + "rewards/margins": 11.129570007324219, + "rewards/rejected": -11.247602462768555, + "step": 1124 + }, + { + "epoch": 1.47, + "learning_rate": 2.7539556778015147e-05, + "logits/chosen": -1.8918925523757935, + "logits/rejected": -1.93614661693573, + "logps/chosen": -170.09307861328125, + "logps/rejected": -285.0545959472656, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3320740461349487, + "rewards/margins": 10.145771980285645, + "rewards/rejected": -8.813698768615723, + "step": 1125 + }, + { + "epoch": 1.47, + "learning_rate": 2.7503909318384026e-05, + "logits/chosen": -1.5562089681625366, + "logits/rejected": -1.553199052810669, + "logps/chosen": -204.55950927734375, + "logps/rejected": -278.0888366699219, + "loss": 0.0182, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.41781148314476013, + "rewards/margins": 8.417019844055176, + "rewards/rejected": -8.834830284118652, + "step": 1126 + }, + { + "epoch": 1.47, + "learning_rate": 2.7468256715508428e-05, + "logits/chosen": -1.7003482580184937, + "logits/rejected": -1.7231361865997314, + "logps/chosen": -174.7547607421875, + "logps/rejected": -240.66641235351562, + "loss": 0.0592, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11631502956151962, + "rewards/margins": 7.218903064727783, + "rewards/rejected": -7.102587699890137, + "step": 1127 + }, + { + "epoch": 1.48, + "learning_rate": 2.743259904262187e-05, + "logits/chosen": -1.711890459060669, + "logits/rejected": -1.694798231124878, + "logps/chosen": -168.28359985351562, + "logps/rejected": -209.40847778320312, + "loss": 0.2473, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8987728953361511, + "rewards/margins": 5.042698383331299, + "rewards/rejected": -5.941471099853516, + "step": 1128 + }, + { + "epoch": 1.48, + "learning_rate": 2.739693637296826e-05, + "logits/chosen": -1.6521188020706177, + "logits/rejected": -1.6186320781707764, + "logps/chosen": -165.4458770751953, + "logps/rejected": -252.67311096191406, + "loss": 0.0465, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.7411643266677856, + "rewards/margins": 9.447739601135254, + "rewards/rejected": -8.706574440002441, + "step": 1129 + }, + { + "epoch": 1.48, + "learning_rate": 2.7361268779801785e-05, + "logits/chosen": -1.6575287580490112, + "logits/rejected": -1.6670286655426025, + "logps/chosen": -149.54388427734375, + "logps/rejected": -227.07464599609375, + "loss": 0.0932, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.635836660861969, + "rewards/margins": 7.351537227630615, + "rewards/rejected": -6.715700149536133, + "step": 1130 + }, + { + "epoch": 1.48, + "learning_rate": 2.7325596336386738e-05, + "logits/chosen": -1.7564228773117065, + "logits/rejected": -1.780297040939331, + "logps/chosen": -181.02215576171875, + "logps/rejected": -242.42831420898438, + "loss": 0.0826, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4995889663696289, + "rewards/margins": 7.388679504394531, + "rewards/rejected": -7.88826847076416, + "step": 1131 + }, + { + "epoch": 1.48, + "learning_rate": 2.7289919115997374e-05, + "logits/chosen": -1.747208595275879, + "logits/rejected": -1.7824649810791016, + "logps/chosen": -149.55833435058594, + "logps/rejected": -237.29415893554688, + "loss": 0.0779, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3986753821372986, + "rewards/margins": 7.962288856506348, + "rewards/rejected": -7.5636138916015625, + "step": 1132 + }, + { + "epoch": 1.48, + "learning_rate": 2.7254237191917776e-05, + "logits/chosen": -1.7027745246887207, + "logits/rejected": -1.7555227279663086, + "logps/chosen": -166.1653289794922, + "logps/rejected": -287.9402160644531, + "loss": 0.0932, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.015581237152218819, + "rewards/margins": 9.20970344543457, + "rewards/rejected": -9.194122314453125, + "step": 1133 + }, + { + "epoch": 1.48, + "learning_rate": 2.721855063744165e-05, + "logits/chosen": -1.9002448320388794, + "logits/rejected": -1.9046696424484253, + "logps/chosen": -168.03701782226562, + "logps/rejected": -261.700439453125, + "loss": 0.0963, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.04782811552286148, + "rewards/margins": 8.795093536376953, + "rewards/rejected": -8.747264862060547, + "step": 1134 + }, + { + "epoch": 1.49, + "learning_rate": 2.718285952587228e-05, + "logits/chosen": -1.5534412860870361, + "logits/rejected": -1.6070075035095215, + "logps/chosen": -175.8369140625, + "logps/rejected": -254.75802612304688, + "loss": 0.0144, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5186577439308167, + "rewards/margins": 8.06625747680664, + "rewards/rejected": -8.584914207458496, + "step": 1135 + }, + { + "epoch": 1.49, + "learning_rate": 2.714716393052223e-05, + "logits/chosen": -1.3456964492797852, + "logits/rejected": -1.371073603630066, + "logps/chosen": -177.3367156982422, + "logps/rejected": -264.3311767578125, + "loss": 0.0609, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10340505838394165, + "rewards/margins": 8.876799583435059, + "rewards/rejected": -8.773394584655762, + "step": 1136 + }, + { + "epoch": 1.49, + "learning_rate": 2.711146392471333e-05, + "logits/chosen": -1.7302535772323608, + "logits/rejected": -1.7177340984344482, + "logps/chosen": -183.34715270996094, + "logps/rejected": -281.00970458984375, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1131555438041687, + "rewards/margins": 8.751636505126953, + "rewards/rejected": -8.864792823791504, + "step": 1137 + }, + { + "epoch": 1.49, + "learning_rate": 2.7075759581776462e-05, + "logits/chosen": -1.7394368648529053, + "logits/rejected": -1.7250502109527588, + "logps/chosen": -170.9840087890625, + "logps/rejected": -243.2357177734375, + "loss": 0.1006, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.23918724060058594, + "rewards/margins": 7.328280448913574, + "rewards/rejected": -7.0890936851501465, + "step": 1138 + }, + { + "epoch": 1.49, + "learning_rate": 2.704005097505139e-05, + "logits/chosen": -1.7313486337661743, + "logits/rejected": -1.7989723682403564, + "logps/chosen": -152.57363891601562, + "logps/rejected": -280.65020751953125, + "loss": 0.0335, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9314048290252686, + "rewards/margins": 11.033601760864258, + "rewards/rejected": -10.10219669342041, + "step": 1139 + }, + { + "epoch": 1.49, + "learning_rate": 2.7004338177886672e-05, + "logits/chosen": -1.4895107746124268, + "logits/rejected": -1.4733555316925049, + "logps/chosen": -175.56729125976562, + "logps/rejected": -275.58624267578125, + "loss": 0.0082, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4191722869873047, + "rewards/margins": 9.123605728149414, + "rewards/rejected": -9.542778968811035, + "step": 1140 + }, + { + "epoch": 1.49, + "learning_rate": 2.6968621263639444e-05, + "logits/chosen": -1.8433064222335815, + "logits/rejected": -1.8755922317504883, + "logps/chosen": -268.6973876953125, + "logps/rejected": -323.88775634765625, + "loss": 0.1385, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7809931039810181, + "rewards/margins": 6.128871917724609, + "rewards/rejected": -6.90986442565918, + "step": 1141 + }, + { + "epoch": 1.49, + "learning_rate": 2.693290030567532e-05, + "logits/chosen": -1.742930293083191, + "logits/rejected": -1.8159878253936768, + "logps/chosen": -167.55697631835938, + "logps/rejected": -257.6251220703125, + "loss": 0.098, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6375917792320251, + "rewards/margins": 6.96030330657959, + "rewards/rejected": -7.59789514541626, + "step": 1142 + }, + { + "epoch": 1.5, + "learning_rate": 2.6897175377368207e-05, + "logits/chosen": -1.6764832735061646, + "logits/rejected": -1.6324677467346191, + "logps/chosen": -178.06137084960938, + "logps/rejected": -243.72128295898438, + "loss": 0.0674, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9304772019386292, + "rewards/margins": 7.14499044418335, + "rewards/rejected": -8.075468063354492, + "step": 1143 + }, + { + "epoch": 1.5, + "learning_rate": 2.686144655210016e-05, + "logits/chosen": -1.918563723564148, + "logits/rejected": -1.9766278266906738, + "logps/chosen": -182.52072143554688, + "logps/rejected": -249.92652893066406, + "loss": 0.0675, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.913346290588379, + "rewards/margins": 6.472333908081055, + "rewards/rejected": -8.38568115234375, + "step": 1144 + }, + { + "epoch": 1.5, + "learning_rate": 2.6825713903261273e-05, + "logits/chosen": -1.8053951263427734, + "logits/rejected": -1.852967381477356, + "logps/chosen": -177.16278076171875, + "logps/rejected": -248.65371704101562, + "loss": 0.0864, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9976847171783447, + "rewards/margins": 6.778054237365723, + "rewards/rejected": -7.7757391929626465, + "step": 1145 + }, + { + "epoch": 1.5, + "learning_rate": 2.6789977504249454e-05, + "logits/chosen": -1.6686856746673584, + "logits/rejected": -1.656692624092102, + "logps/chosen": -194.56002807617188, + "logps/rejected": -273.6646423339844, + "loss": 0.0984, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20852971076965332, + "rewards/margins": 7.629216194152832, + "rewards/rejected": -7.837745666503906, + "step": 1146 + }, + { + "epoch": 1.5, + "learning_rate": 2.6754237428470336e-05, + "logits/chosen": -1.9256387948989868, + "logits/rejected": -1.9152940511703491, + "logps/chosen": -174.7666473388672, + "logps/rejected": -257.781005859375, + "loss": 0.0911, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06490552425384521, + "rewards/margins": 8.398702621459961, + "rewards/rejected": -8.463607788085938, + "step": 1147 + }, + { + "epoch": 1.5, + "learning_rate": 2.6718493749337105e-05, + "logits/chosen": -1.4999029636383057, + "logits/rejected": -1.5613821744918823, + "logps/chosen": -168.26942443847656, + "logps/rejected": -269.7207946777344, + "loss": 0.0184, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3551241457462311, + "rewards/margins": 10.351297378540039, + "rewards/rejected": -10.706421852111816, + "step": 1148 + }, + { + "epoch": 1.5, + "learning_rate": 2.668274654027033e-05, + "logits/chosen": -1.7891987562179565, + "logits/rejected": -1.8110944032669067, + "logps/chosen": -169.90142822265625, + "logps/rejected": -268.4530944824219, + "loss": 0.0618, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3460179567337036, + "rewards/margins": 8.770615577697754, + "rewards/rejected": -9.116633415222168, + "step": 1149 + }, + { + "epoch": 1.5, + "learning_rate": 2.664699587469786e-05, + "logits/chosen": -1.6863102912902832, + "logits/rejected": -1.7447364330291748, + "logps/chosen": -164.82554626464844, + "logps/rejected": -252.87368774414062, + "loss": 0.0254, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7057994604110718, + "rewards/margins": 7.548184871673584, + "rewards/rejected": -8.253984451293945, + "step": 1150 + }, + { + "epoch": 1.51, + "learning_rate": 2.6611241826054617e-05, + "logits/chosen": -1.7308387756347656, + "logits/rejected": -1.7175371646881104, + "logps/chosen": -161.2141571044922, + "logps/rejected": -238.70584106445312, + "loss": 0.0249, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03660699725151062, + "rewards/margins": 8.925530433654785, + "rewards/rejected": -8.888922691345215, + "step": 1151 + }, + { + "epoch": 1.51, + "learning_rate": 2.6575484467782486e-05, + "logits/chosen": -1.657118320465088, + "logits/rejected": -1.6596860885620117, + "logps/chosen": -157.85108947753906, + "logps/rejected": -259.59271240234375, + "loss": 0.049, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.22496214509010315, + "rewards/margins": 7.334076881408691, + "rewards/rejected": -7.559039115905762, + "step": 1152 + }, + { + "epoch": 1.51, + "learning_rate": 2.6539723873330148e-05, + "logits/chosen": -1.740351915359497, + "logits/rejected": -1.7968876361846924, + "logps/chosen": -186.0464324951172, + "logps/rejected": -300.3115539550781, + "loss": 0.0446, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0093516111373901, + "rewards/margins": 10.845284461975098, + "rewards/rejected": -9.835932731628418, + "step": 1153 + }, + { + "epoch": 1.51, + "learning_rate": 2.6503960116152933e-05, + "logits/chosen": -1.7496631145477295, + "logits/rejected": -1.7538864612579346, + "logps/chosen": -153.28085327148438, + "logps/rejected": -260.5108642578125, + "loss": 0.0688, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.33029717206954956, + "rewards/margins": 9.61520004272461, + "rewards/rejected": -9.28490161895752, + "step": 1154 + }, + { + "epoch": 1.51, + "learning_rate": 2.646819326971266e-05, + "logits/chosen": -1.7677584886550903, + "logits/rejected": -1.7890934944152832, + "logps/chosen": -152.42288208007812, + "logps/rejected": -239.4620819091797, + "loss": 0.0942, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.449545681476593, + "rewards/margins": 6.730370044708252, + "rewards/rejected": -7.179915428161621, + "step": 1155 + }, + { + "epoch": 1.51, + "learning_rate": 2.6432423407477496e-05, + "logits/chosen": -1.7305803298950195, + "logits/rejected": -1.751845359802246, + "logps/chosen": -179.49514770507812, + "logps/rejected": -268.3816223144531, + "loss": 0.0256, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15854497253894806, + "rewards/margins": 9.50815200805664, + "rewards/rejected": -9.666696548461914, + "step": 1156 + }, + { + "epoch": 1.51, + "learning_rate": 2.6396650602921824e-05, + "logits/chosen": -1.7047319412231445, + "logits/rejected": -1.733163833618164, + "logps/chosen": -143.9405517578125, + "logps/rejected": -253.3537139892578, + "loss": 0.0532, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15038195252418518, + "rewards/margins": 8.654561996459961, + "rewards/rejected": -8.504179954528809, + "step": 1157 + }, + { + "epoch": 1.52, + "learning_rate": 2.636087492952603e-05, + "logits/chosen": -1.6103425025939941, + "logits/rejected": -1.6657140254974365, + "logps/chosen": -230.7855682373047, + "logps/rejected": -332.8481750488281, + "loss": 0.1037, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8001385927200317, + "rewards/margins": 8.867574691772461, + "rewards/rejected": -9.667713165283203, + "step": 1158 + }, + { + "epoch": 1.52, + "learning_rate": 2.6325096460776422e-05, + "logits/chosen": -1.654494285583496, + "logits/rejected": -1.69438898563385, + "logps/chosen": -180.56234741210938, + "logps/rejected": -286.9242858886719, + "loss": 0.028, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0065844058990478516, + "rewards/margins": 9.324426651000977, + "rewards/rejected": -9.317842483520508, + "step": 1159 + }, + { + "epoch": 1.52, + "learning_rate": 2.6289315270165062e-05, + "logits/chosen": -1.647437334060669, + "logits/rejected": -1.6055926084518433, + "logps/chosen": -180.94369506835938, + "logps/rejected": -241.0670166015625, + "loss": 0.0204, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.27645260095596313, + "rewards/margins": 7.618689060211182, + "rewards/rejected": -7.8951416015625, + "step": 1160 + }, + { + "epoch": 1.52, + "learning_rate": 2.625353143118955e-05, + "logits/chosen": -1.465190052986145, + "logits/rejected": -1.516771674156189, + "logps/chosen": -177.10516357421875, + "logps/rejected": -230.43557739257812, + "loss": 0.0553, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6754331588745117, + "rewards/margins": 5.267922878265381, + "rewards/rejected": -5.943356037139893, + "step": 1161 + }, + { + "epoch": 1.52, + "learning_rate": 2.621774501735299e-05, + "logits/chosen": -1.6105995178222656, + "logits/rejected": -1.6135510206222534, + "logps/chosen": -194.22898864746094, + "logps/rejected": -286.6337890625, + "loss": 0.0137, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40234309434890747, + "rewards/margins": 8.20213508605957, + "rewards/rejected": -8.60447883605957, + "step": 1162 + }, + { + "epoch": 1.52, + "learning_rate": 2.6181956102163724e-05, + "logits/chosen": -1.761580467224121, + "logits/rejected": -1.7239105701446533, + "logps/chosen": -175.4378662109375, + "logps/rejected": -240.81204223632812, + "loss": 0.0462, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2544100284576416, + "rewards/margins": 7.783388137817383, + "rewards/rejected": -8.037797927856445, + "step": 1163 + }, + { + "epoch": 1.52, + "learning_rate": 2.6146164759135266e-05, + "logits/chosen": -1.6953476667404175, + "logits/rejected": -1.7311131954193115, + "logps/chosen": -171.0646514892578, + "logps/rejected": -252.68792724609375, + "loss": 0.0544, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.11951437592506409, + "rewards/margins": 7.911185264587402, + "rewards/rejected": -8.030699729919434, + "step": 1164 + }, + { + "epoch": 1.52, + "learning_rate": 2.6110371061786104e-05, + "logits/chosen": -1.4548978805541992, + "logits/rejected": -1.5253571271896362, + "logps/chosen": -187.3463134765625, + "logps/rejected": -251.83641052246094, + "loss": 0.0094, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8273530006408691, + "rewards/margins": 6.687466144561768, + "rewards/rejected": -7.514819145202637, + "step": 1165 + }, + { + "epoch": 1.53, + "learning_rate": 2.607457508363955e-05, + "logits/chosen": -1.8594155311584473, + "logits/rejected": -1.8027570247650146, + "logps/chosen": -183.12960815429688, + "logps/rejected": -281.23968505859375, + "loss": 0.0324, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7055124640464783, + "rewards/margins": 10.659345626831055, + "rewards/rejected": -9.953832626342773, + "step": 1166 + }, + { + "epoch": 1.53, + "learning_rate": 2.6038776898223627e-05, + "logits/chosen": -1.619349479675293, + "logits/rejected": -1.6578694581985474, + "logps/chosen": -173.89231872558594, + "logps/rejected": -246.46131896972656, + "loss": 0.1215, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2473626136779785, + "rewards/margins": 6.9637532234191895, + "rewards/rejected": -8.211116790771484, + "step": 1167 + }, + { + "epoch": 1.53, + "learning_rate": 2.6002976579070872e-05, + "logits/chosen": -1.655588984489441, + "logits/rejected": -1.6937859058380127, + "logps/chosen": -170.17945861816406, + "logps/rejected": -283.6971435546875, + "loss": 0.0123, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2622162103652954, + "rewards/margins": 9.306379318237305, + "rewards/rejected": -9.568595886230469, + "step": 1168 + }, + { + "epoch": 1.53, + "learning_rate": 2.5967174199718202e-05, + "logits/chosen": -1.8254674673080444, + "logits/rejected": -1.8279321193695068, + "logps/chosen": -195.8396453857422, + "logps/rejected": -289.9363098144531, + "loss": 0.0242, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8294849395751953, + "rewards/margins": 8.92324161529541, + "rewards/rejected": -9.752725601196289, + "step": 1169 + }, + { + "epoch": 1.53, + "learning_rate": 2.5931369833706797e-05, + "logits/chosen": -1.7834821939468384, + "logits/rejected": -1.778806209564209, + "logps/chosen": -168.0321044921875, + "logps/rejected": -226.83917236328125, + "loss": 0.1051, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7548105716705322, + "rewards/margins": 6.8317718505859375, + "rewards/rejected": -7.586582660675049, + "step": 1170 + }, + { + "epoch": 1.53, + "learning_rate": 2.5895563554581865e-05, + "logits/chosen": -1.6463509798049927, + "logits/rejected": -1.6807492971420288, + "logps/chosen": -181.89962768554688, + "logps/rejected": -263.8111877441406, + "loss": 0.0861, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7350143790245056, + "rewards/margins": 7.759056568145752, + "rewards/rejected": -8.494071960449219, + "step": 1171 + }, + { + "epoch": 1.53, + "learning_rate": 2.5859755435892597e-05, + "logits/chosen": -1.7671129703521729, + "logits/rejected": -1.6204795837402344, + "logps/chosen": -168.16664123535156, + "logps/rejected": -198.9818115234375, + "loss": 0.0425, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4251019060611725, + "rewards/margins": 5.303590297698975, + "rewards/rejected": -5.728692054748535, + "step": 1172 + }, + { + "epoch": 1.54, + "learning_rate": 2.5823945551191937e-05, + "logits/chosen": -1.5633729696273804, + "logits/rejected": -1.553313970565796, + "logps/chosen": -170.26429748535156, + "logps/rejected": -236.29759216308594, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03574070334434509, + "rewards/margins": 7.335710525512695, + "rewards/rejected": -7.2999701499938965, + "step": 1173 + }, + { + "epoch": 1.54, + "learning_rate": 2.578813397403645e-05, + "logits/chosen": -1.653340458869934, + "logits/rejected": -1.7039482593536377, + "logps/chosen": -156.35513305664062, + "logps/rejected": -263.75592041015625, + "loss": 0.0506, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29325199127197266, + "rewards/margins": 9.51681137084961, + "rewards/rejected": -9.22355842590332, + "step": 1174 + }, + { + "epoch": 1.54, + "learning_rate": 2.5752320777986195e-05, + "logits/chosen": -1.8285808563232422, + "logits/rejected": -1.8245221376419067, + "logps/chosen": -210.67347717285156, + "logps/rejected": -281.6502380371094, + "loss": 0.0987, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5995347499847412, + "rewards/margins": 8.755189895629883, + "rewards/rejected": -8.155654907226562, + "step": 1175 + }, + { + "epoch": 1.54, + "learning_rate": 2.5716506036604542e-05, + "logits/chosen": -1.730687141418457, + "logits/rejected": -1.6980805397033691, + "logps/chosen": -176.00051879882812, + "logps/rejected": -289.8759765625, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.33925819396972656, + "rewards/margins": 9.794126510620117, + "rewards/rejected": -10.13338565826416, + "step": 1176 + }, + { + "epoch": 1.54, + "learning_rate": 2.568068982345804e-05, + "logits/chosen": -1.965528964996338, + "logits/rejected": -2.0052847862243652, + "logps/chosen": -225.929931640625, + "logps/rejected": -317.4052734375, + "loss": 0.0549, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7350475192070007, + "rewards/margins": 8.015153884887695, + "rewards/rejected": -8.750200271606445, + "step": 1177 + }, + { + "epoch": 1.54, + "learning_rate": 2.5644872212116267e-05, + "logits/chosen": -1.785213589668274, + "logits/rejected": -1.798585057258606, + "logps/chosen": -165.72169494628906, + "logps/rejected": -262.3560791015625, + "loss": 0.0165, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12321795523166656, + "rewards/margins": 9.160881996154785, + "rewards/rejected": -9.284100532531738, + "step": 1178 + }, + { + "epoch": 1.54, + "learning_rate": 2.560905327615168e-05, + "logits/chosen": -1.6976568698883057, + "logits/rejected": -1.7386258840560913, + "logps/chosen": -173.86561584472656, + "logps/rejected": -286.4976806640625, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08777953684329987, + "rewards/margins": 10.085185050964355, + "rewards/rejected": -10.172965049743652, + "step": 1179 + }, + { + "epoch": 1.54, + "learning_rate": 2.557323308913942e-05, + "logits/chosen": -1.5106477737426758, + "logits/rejected": -1.560227870941162, + "logps/chosen": -241.93836975097656, + "logps/rejected": -350.15673828125, + "loss": 0.0075, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7953296899795532, + "rewards/margins": 10.489126205444336, + "rewards/rejected": -11.284457206726074, + "step": 1180 + }, + { + "epoch": 1.55, + "learning_rate": 2.553741172465724e-05, + "logits/chosen": -1.730987787246704, + "logits/rejected": -1.7462615966796875, + "logps/chosen": -174.098388671875, + "logps/rejected": -282.7524108886719, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29300644993782043, + "rewards/margins": 10.730010032653809, + "rewards/rejected": -10.437003135681152, + "step": 1181 + }, + { + "epoch": 1.55, + "learning_rate": 2.5501589256285285e-05, + "logits/chosen": -1.3600053787231445, + "logits/rejected": -1.3349223136901855, + "logps/chosen": -203.03582763671875, + "logps/rejected": -284.1298828125, + "loss": 0.0962, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3049746751785278, + "rewards/margins": 7.621538162231445, + "rewards/rejected": -8.926513671875, + "step": 1182 + }, + { + "epoch": 1.55, + "learning_rate": 2.546576575760598e-05, + "logits/chosen": -1.6916255950927734, + "logits/rejected": -1.7467602491378784, + "logps/chosen": -172.93417358398438, + "logps/rejected": -273.44281005859375, + "loss": 0.0503, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.44223520159721375, + "rewards/margins": 8.477460861206055, + "rewards/rejected": -8.035224914550781, + "step": 1183 + }, + { + "epoch": 1.55, + "learning_rate": 2.542994130220388e-05, + "logits/chosen": -1.7573150396347046, + "logits/rejected": -1.7552380561828613, + "logps/chosen": -168.72528076171875, + "logps/rejected": -268.1504821777344, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.560140073299408, + "rewards/margins": 9.79890251159668, + "rewards/rejected": -9.238762855529785, + "step": 1184 + }, + { + "epoch": 1.55, + "learning_rate": 2.539411596366546e-05, + "logits/chosen": -1.7729578018188477, + "logits/rejected": -1.8577322959899902, + "logps/chosen": -177.11383056640625, + "logps/rejected": -255.9529571533203, + "loss": 0.0104, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3055181503295898, + "rewards/margins": 8.495362281799316, + "rewards/rejected": -9.800880432128906, + "step": 1185 + }, + { + "epoch": 1.55, + "learning_rate": 2.535828981557906e-05, + "logits/chosen": -1.5117790699005127, + "logits/rejected": -1.5896857976913452, + "logps/chosen": -213.57638549804688, + "logps/rejected": -280.8465270996094, + "loss": 0.0296, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5985381603240967, + "rewards/margins": 8.224201202392578, + "rewards/rejected": -9.822739601135254, + "step": 1186 + }, + { + "epoch": 1.55, + "learning_rate": 2.5322462931534658e-05, + "logits/chosen": -1.6663739681243896, + "logits/rejected": -1.6551300287246704, + "logps/chosen": -189.54556274414062, + "logps/rejected": -296.9450988769531, + "loss": 0.1034, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5812823176383972, + "rewards/margins": 10.018152236938477, + "rewards/rejected": -10.599433898925781, + "step": 1187 + }, + { + "epoch": 1.55, + "learning_rate": 2.5286635385123725e-05, + "logits/chosen": -1.7790355682373047, + "logits/rejected": -1.813086986541748, + "logps/chosen": -238.7310028076172, + "logps/rejected": -303.84320068359375, + "loss": 0.0891, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7319673299789429, + "rewards/margins": 7.041068077087402, + "rewards/rejected": -8.773035049438477, + "step": 1188 + }, + { + "epoch": 1.56, + "learning_rate": 2.525080724993914e-05, + "logits/chosen": -1.752792477607727, + "logits/rejected": -1.8063410520553589, + "logps/chosen": -202.1349334716797, + "logps/rejected": -285.0832824707031, + "loss": 0.1143, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.883070468902588, + "rewards/margins": 6.467588424682617, + "rewards/rejected": -8.350658416748047, + "step": 1189 + }, + { + "epoch": 1.56, + "learning_rate": 2.521497859957495e-05, + "logits/chosen": -1.7477810382843018, + "logits/rejected": -1.6951274871826172, + "logps/chosen": -182.742919921875, + "logps/rejected": -242.98825073242188, + "loss": 0.1535, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0694918632507324, + "rewards/margins": 6.54644775390625, + "rewards/rejected": -7.615940570831299, + "step": 1190 + }, + { + "epoch": 1.56, + "learning_rate": 2.5179149507626288e-05, + "logits/chosen": -1.4599380493164062, + "logits/rejected": -1.5278089046478271, + "logps/chosen": -183.69979858398438, + "logps/rejected": -258.5072021484375, + "loss": 0.0907, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.130995512008667, + "rewards/margins": 7.200542449951172, + "rewards/rejected": -8.331538200378418, + "step": 1191 + }, + { + "epoch": 1.56, + "learning_rate": 2.5143320047689173e-05, + "logits/chosen": -1.7460194826126099, + "logits/rejected": -1.7528928518295288, + "logps/chosen": -167.43943786621094, + "logps/rejected": -275.01397705078125, + "loss": 0.0649, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.3066067695617676, + "rewards/margins": 11.605785369873047, + "rewards/rejected": -10.299177169799805, + "step": 1192 + }, + { + "epoch": 1.56, + "learning_rate": 2.510749029336038e-05, + "logits/chosen": -1.4108166694641113, + "logits/rejected": -1.4186828136444092, + "logps/chosen": -207.23220825195312, + "logps/rejected": -309.2670593261719, + "loss": 0.0876, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5894308686256409, + "rewards/margins": 9.677989959716797, + "rewards/rejected": -10.26742172241211, + "step": 1193 + }, + { + "epoch": 1.56, + "learning_rate": 2.5071660318237312e-05, + "logits/chosen": -1.396280288696289, + "logits/rejected": -1.3600797653198242, + "logps/chosen": -214.23016357421875, + "logps/rejected": -306.523681640625, + "loss": 0.1006, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1179972887039185, + "rewards/margins": 8.530187606811523, + "rewards/rejected": -9.648184776306152, + "step": 1194 + }, + { + "epoch": 1.56, + "learning_rate": 2.5035830195917803e-05, + "logits/chosen": -1.6759350299835205, + "logits/rejected": -1.719247579574585, + "logps/chosen": -244.61578369140625, + "logps/rejected": -294.2692565917969, + "loss": 0.1318, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9835204482078552, + "rewards/margins": 6.74935245513916, + "rewards/rejected": -7.73287296295166, + "step": 1195 + }, + { + "epoch": 1.57, + "learning_rate": 2.5e-05, + "logits/chosen": -1.6701037883758545, + "logits/rejected": -1.6478197574615479, + "logps/chosen": -171.58067321777344, + "logps/rejected": -292.2452087402344, + "loss": 0.0509, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0454554557800293, + "rewards/margins": 9.922323226928711, + "rewards/rejected": -10.967779159545898, + "step": 1196 + }, + { + "epoch": 1.57, + "learning_rate": 2.49641698040822e-05, + "logits/chosen": -1.7644122838974, + "logits/rejected": -1.815422773361206, + "logps/chosen": -207.94436645507812, + "logps/rejected": -272.7748718261719, + "loss": 0.0958, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4154800474643707, + "rewards/margins": 9.19281005859375, + "rewards/rejected": -9.60828971862793, + "step": 1197 + }, + { + "epoch": 1.57, + "learning_rate": 2.4928339681762687e-05, + "logits/chosen": -1.8087043762207031, + "logits/rejected": -1.7989740371704102, + "logps/chosen": -250.95794677734375, + "logps/rejected": -307.74200439453125, + "loss": 0.1429, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.345405101776123, + "rewards/margins": 6.367486953735352, + "rewards/rejected": -7.712892055511475, + "step": 1198 + }, + { + "epoch": 1.57, + "learning_rate": 2.489250970663963e-05, + "logits/chosen": -1.463957667350769, + "logits/rejected": -1.5611281394958496, + "logps/chosen": -212.74093627929688, + "logps/rejected": -310.60845947265625, + "loss": 0.0447, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1063156127929688, + "rewards/margins": 9.414066314697266, + "rewards/rejected": -11.520381927490234, + "step": 1199 + }, + { + "epoch": 1.57, + "learning_rate": 2.485667995231084e-05, + "logits/chosen": -1.5089648962020874, + "logits/rejected": -1.5149952173233032, + "logps/chosen": -171.41029357910156, + "logps/rejected": -290.08544921875, + "loss": 0.0489, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.3906606435775757, + "rewards/margins": 12.079473495483398, + "rewards/rejected": -10.688812255859375, + "step": 1200 + }, + { + "epoch": 1.57, + "learning_rate": 2.4820850492373718e-05, + "logits/chosen": -1.779677391052246, + "logits/rejected": -1.7650448083877563, + "logps/chosen": -177.953369140625, + "logps/rejected": -252.42312622070312, + "loss": 0.0267, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9676764607429504, + "rewards/margins": 8.15107536315918, + "rewards/rejected": -9.118751525878906, + "step": 1201 + }, + { + "epoch": 1.57, + "learning_rate": 2.4785021400425053e-05, + "logits/chosen": -1.456091046333313, + "logits/rejected": -1.4852440357208252, + "logps/chosen": -194.1282501220703, + "logps/rejected": -271.22064208984375, + "loss": 0.0936, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6693957448005676, + "rewards/margins": 7.4512248039245605, + "rewards/rejected": -8.120620727539062, + "step": 1202 + }, + { + "epoch": 1.57, + "learning_rate": 2.474919275006086e-05, + "logits/chosen": -1.778357982635498, + "logits/rejected": -1.7304366827011108, + "logps/chosen": -171.03952026367188, + "logps/rejected": -262.9996643066406, + "loss": 0.0228, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1879677772521973, + "rewards/margins": 6.896914958953857, + "rewards/rejected": -8.084883689880371, + "step": 1203 + }, + { + "epoch": 1.58, + "learning_rate": 2.4713364614876274e-05, + "logits/chosen": -1.8325196504592896, + "logits/rejected": -1.8118171691894531, + "logps/chosen": -216.16830444335938, + "logps/rejected": -272.31927490234375, + "loss": 0.0514, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9808077812194824, + "rewards/margins": 6.602880477905273, + "rewards/rejected": -7.583687782287598, + "step": 1204 + }, + { + "epoch": 1.58, + "learning_rate": 2.4677537068465355e-05, + "logits/chosen": -1.7110586166381836, + "logits/rejected": -1.6683855056762695, + "logps/chosen": -208.89988708496094, + "logps/rejected": -279.6601867675781, + "loss": 0.1506, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7757519483566284, + "rewards/margins": 7.374731063842773, + "rewards/rejected": -8.150483131408691, + "step": 1205 + }, + { + "epoch": 1.58, + "learning_rate": 2.4641710184420945e-05, + "logits/chosen": -1.8039824962615967, + "logits/rejected": -1.8695347309112549, + "logps/chosen": -172.88931274414062, + "logps/rejected": -264.0073547363281, + "loss": 0.1639, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.696793794631958, + "rewards/margins": 8.378421783447266, + "rewards/rejected": -9.075215339660645, + "step": 1206 + }, + { + "epoch": 1.58, + "learning_rate": 2.4605884036334546e-05, + "logits/chosen": -1.3711414337158203, + "logits/rejected": -1.4007333517074585, + "logps/chosen": -159.1909637451172, + "logps/rejected": -251.80691528320312, + "loss": 0.0542, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.24178600311279297, + "rewards/margins": 9.485416412353516, + "rewards/rejected": -9.243630409240723, + "step": 1207 + }, + { + "epoch": 1.58, + "learning_rate": 2.4570058697796125e-05, + "logits/chosen": -1.640980839729309, + "logits/rejected": -1.709542155265808, + "logps/chosen": -158.44482421875, + "logps/rejected": -289.26361083984375, + "loss": 0.0446, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.393771231174469, + "rewards/margins": 9.867655754089355, + "rewards/rejected": -9.473884582519531, + "step": 1208 + }, + { + "epoch": 1.58, + "learning_rate": 2.4534234242394015e-05, + "logits/chosen": -1.7154505252838135, + "logits/rejected": -1.744265079498291, + "logps/chosen": -196.77346801757812, + "logps/rejected": -283.8974609375, + "loss": 0.0916, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5173804759979248, + "rewards/margins": 8.576264381408691, + "rewards/rejected": -10.093645095825195, + "step": 1209 + }, + { + "epoch": 1.58, + "learning_rate": 2.449841074371472e-05, + "logits/chosen": -1.7710267305374146, + "logits/rejected": -1.7606544494628906, + "logps/chosen": -184.89724731445312, + "logps/rejected": -261.56787109375, + "loss": 0.0862, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6486842036247253, + "rewards/margins": 8.085794448852539, + "rewards/rejected": -8.734478950500488, + "step": 1210 + }, + { + "epoch": 1.58, + "learning_rate": 2.4462588275342773e-05, + "logits/chosen": -1.7582557201385498, + "logits/rejected": -1.7958168983459473, + "logps/chosen": -189.32861328125, + "logps/rejected": -271.3257751464844, + "loss": 0.0889, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5048856735229492, + "rewards/margins": 8.395346641540527, + "rewards/rejected": -8.900232315063477, + "step": 1211 + }, + { + "epoch": 1.59, + "learning_rate": 2.4426766910860585e-05, + "logits/chosen": -1.6755955219268799, + "logits/rejected": -1.7484309673309326, + "logps/chosen": -159.52206420898438, + "logps/rejected": -243.6758270263672, + "loss": 0.0837, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.547278881072998, + "rewards/margins": 7.531097412109375, + "rewards/rejected": -8.078375816345215, + "step": 1212 + }, + { + "epoch": 1.59, + "learning_rate": 2.439094672384833e-05, + "logits/chosen": -1.71492600440979, + "logits/rejected": -1.6572831869125366, + "logps/chosen": -199.33560180664062, + "logps/rejected": -309.53887939453125, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7166223526000977, + "rewards/margins": 10.223067283630371, + "rewards/rejected": -9.506444931030273, + "step": 1213 + }, + { + "epoch": 1.59, + "learning_rate": 2.4355127787883732e-05, + "logits/chosen": -1.4446295499801636, + "logits/rejected": -1.4801466464996338, + "logps/chosen": -139.96969604492188, + "logps/rejected": -196.85963439941406, + "loss": 0.1415, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3243495523929596, + "rewards/margins": 6.393020153045654, + "rewards/rejected": -6.717370510101318, + "step": 1214 + }, + { + "epoch": 1.59, + "learning_rate": 2.4319310176541958e-05, + "logits/chosen": -1.7520809173583984, + "logits/rejected": -1.808814525604248, + "logps/chosen": -173.7810516357422, + "logps/rejected": -259.78448486328125, + "loss": 0.0474, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.22734731435775757, + "rewards/margins": 8.69068717956543, + "rewards/rejected": -8.463340759277344, + "step": 1215 + }, + { + "epoch": 1.59, + "learning_rate": 2.428349396339547e-05, + "logits/chosen": -1.8968578577041626, + "logits/rejected": -1.847566843032837, + "logps/chosen": -156.80325317382812, + "logps/rejected": -227.9071044921875, + "loss": 0.0667, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0012752600014209747, + "rewards/margins": 6.912710666656494, + "rewards/rejected": -6.913986682891846, + "step": 1216 + }, + { + "epoch": 1.59, + "learning_rate": 2.424767922201381e-05, + "logits/chosen": -1.7135039567947388, + "logits/rejected": -1.7431325912475586, + "logps/chosen": -173.42684936523438, + "logps/rejected": -278.7267150878906, + "loss": 0.0213, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10155464708805084, + "rewards/margins": 9.204533576965332, + "rewards/rejected": -9.10297966003418, + "step": 1217 + }, + { + "epoch": 1.59, + "learning_rate": 2.4211866025963557e-05, + "logits/chosen": -1.5762271881103516, + "logits/rejected": -1.5741418600082397, + "logps/chosen": -164.8806915283203, + "logps/rejected": -234.95248413085938, + "loss": 0.1213, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5879465937614441, + "rewards/margins": 7.251880645751953, + "rewards/rejected": -7.839826583862305, + "step": 1218 + }, + { + "epoch": 1.6, + "learning_rate": 2.417605444880807e-05, + "logits/chosen": -1.5475339889526367, + "logits/rejected": -1.5313646793365479, + "logps/chosen": -191.47743225097656, + "logps/rejected": -269.4033203125, + "loss": 0.0194, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5132349133491516, + "rewards/margins": 8.826435089111328, + "rewards/rejected": -9.339670181274414, + "step": 1219 + }, + { + "epoch": 1.6, + "learning_rate": 2.4140244564107402e-05, + "logits/chosen": -1.7505229711532593, + "logits/rejected": -1.70061194896698, + "logps/chosen": -197.9004364013672, + "logps/rejected": -299.317138671875, + "loss": 0.0643, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2676445245742798, + "rewards/margins": 9.395583152770996, + "rewards/rejected": -8.127938270568848, + "step": 1220 + }, + { + "epoch": 1.6, + "learning_rate": 2.4104436445418145e-05, + "logits/chosen": -1.5308407545089722, + "logits/rejected": -1.5543413162231445, + "logps/chosen": -185.0545654296875, + "logps/rejected": -310.54852294921875, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.010423183441162, + "rewards/margins": 11.161713600158691, + "rewards/rejected": -10.151289939880371, + "step": 1221 + }, + { + "epoch": 1.6, + "learning_rate": 2.4068630166293215e-05, + "logits/chosen": -1.6732879877090454, + "logits/rejected": -1.678316354751587, + "logps/chosen": -209.31515502929688, + "logps/rejected": -266.0266418457031, + "loss": 0.0537, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.21912261843681335, + "rewards/margins": 7.3648481369018555, + "rewards/rejected": -7.583970546722412, + "step": 1222 + }, + { + "epoch": 1.6, + "learning_rate": 2.4032825800281804e-05, + "logits/chosen": -1.9282745122909546, + "logits/rejected": -1.9191973209381104, + "logps/chosen": -179.317626953125, + "logps/rejected": -246.746826171875, + "loss": 0.0529, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.060091596096754074, + "rewards/margins": 6.826633453369141, + "rewards/rejected": -6.886724472045898, + "step": 1223 + }, + { + "epoch": 1.6, + "learning_rate": 2.3997023420929137e-05, + "logits/chosen": -1.9516724348068237, + "logits/rejected": -1.8882488012313843, + "logps/chosen": -234.22061157226562, + "logps/rejected": -303.38861083984375, + "loss": 0.0975, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.16620711982250214, + "rewards/margins": 7.9483137130737305, + "rewards/rejected": -8.114520072937012, + "step": 1224 + }, + { + "epoch": 1.6, + "learning_rate": 2.3961223101776375e-05, + "logits/chosen": -1.9422708749771118, + "logits/rejected": -1.9600876569747925, + "logps/chosen": -166.08450317382812, + "logps/rejected": -228.75006103515625, + "loss": 0.093, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.6036259531974792, + "rewards/margins": 7.759881019592285, + "rewards/rejected": -7.156254768371582, + "step": 1225 + }, + { + "epoch": 1.6, + "learning_rate": 2.392542491636045e-05, + "logits/chosen": -1.7444486618041992, + "logits/rejected": -1.7814403772354126, + "logps/chosen": -168.4735870361328, + "logps/rejected": -243.81700134277344, + "loss": 0.0565, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.1344974935054779, + "rewards/margins": 8.722359657287598, + "rewards/rejected": -8.856857299804688, + "step": 1226 + }, + { + "epoch": 1.61, + "learning_rate": 2.3889628938213905e-05, + "logits/chosen": -1.7509342432022095, + "logits/rejected": -1.7369998693466187, + "logps/chosen": -184.87417602539062, + "logps/rejected": -293.61407470703125, + "loss": 0.0505, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.32053953409194946, + "rewards/margins": 9.440658569335938, + "rewards/rejected": -9.120119094848633, + "step": 1227 + }, + { + "epoch": 1.61, + "learning_rate": 2.3853835240864743e-05, + "logits/chosen": -1.6605939865112305, + "logits/rejected": -1.7019318342208862, + "logps/chosen": -189.66299438476562, + "logps/rejected": -301.81402587890625, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7814435958862305, + "rewards/margins": 8.428256034851074, + "rewards/rejected": -9.209700584411621, + "step": 1228 + }, + { + "epoch": 1.61, + "learning_rate": 2.381804389783628e-05, + "logits/chosen": -1.5725795030593872, + "logits/rejected": -1.5468381643295288, + "logps/chosen": -200.10365295410156, + "logps/rejected": -291.8433532714844, + "loss": 0.0515, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.22199401259422302, + "rewards/margins": 9.91091537475586, + "rewards/rejected": -9.688920974731445, + "step": 1229 + }, + { + "epoch": 1.61, + "learning_rate": 2.3782254982647013e-05, + "logits/chosen": -1.709377646446228, + "logits/rejected": -1.7137666940689087, + "logps/chosen": -174.21273803710938, + "logps/rejected": -241.732666015625, + "loss": 0.0691, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.005614191293716431, + "rewards/margins": 6.928896903991699, + "rewards/rejected": -6.934510707855225, + "step": 1230 + }, + { + "epoch": 1.61, + "learning_rate": 2.374646856881045e-05, + "logits/chosen": -1.742085576057434, + "logits/rejected": -1.7839727401733398, + "logps/chosen": -140.5599365234375, + "logps/rejected": -261.37347412109375, + "loss": 0.1062, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5944003462791443, + "rewards/margins": 10.062451362609863, + "rewards/rejected": -9.468050956726074, + "step": 1231 + }, + { + "epoch": 1.61, + "learning_rate": 2.3710684729834954e-05, + "logits/chosen": -1.7672961950302124, + "logits/rejected": -1.854988932609558, + "logps/chosen": -165.33840942382812, + "logps/rejected": -229.556640625, + "loss": 0.0914, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3458694815635681, + "rewards/margins": 7.2956037521362305, + "rewards/rejected": -6.949734687805176, + "step": 1232 + }, + { + "epoch": 1.61, + "learning_rate": 2.367490353922358e-05, + "logits/chosen": -1.588930368423462, + "logits/rejected": -1.6457085609436035, + "logps/chosen": -152.62991333007812, + "logps/rejected": -237.3344268798828, + "loss": 0.0863, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.42593804001808167, + "rewards/margins": 7.208555698394775, + "rewards/rejected": -7.634493827819824, + "step": 1233 + }, + { + "epoch": 1.61, + "learning_rate": 2.3639125070473975e-05, + "logits/chosen": -1.4693684577941895, + "logits/rejected": -1.5538169145584106, + "logps/chosen": -156.80490112304688, + "logps/rejected": -228.3775177001953, + "loss": 0.0765, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5519406795501709, + "rewards/margins": 6.328725814819336, + "rewards/rejected": -6.880666732788086, + "step": 1234 + }, + { + "epoch": 1.62, + "learning_rate": 2.3603349397078182e-05, + "logits/chosen": -1.7044745683670044, + "logits/rejected": -1.73379647731781, + "logps/chosen": -167.63999938964844, + "logps/rejected": -244.90013122558594, + "loss": 0.1991, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6486749649047852, + "rewards/margins": 6.945343017578125, + "rewards/rejected": -7.594018459320068, + "step": 1235 + }, + { + "epoch": 1.62, + "learning_rate": 2.3567576592522507e-05, + "logits/chosen": -1.78825044631958, + "logits/rejected": -1.843505859375, + "logps/chosen": -171.70828247070312, + "logps/rejected": -259.90704345703125, + "loss": 0.0093, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9683905839920044, + "rewards/margins": 9.303258895874023, + "rewards/rejected": -8.334867477416992, + "step": 1236 + }, + { + "epoch": 1.62, + "learning_rate": 2.3531806730287342e-05, + "logits/chosen": -1.8858531713485718, + "logits/rejected": -1.8973859548568726, + "logps/chosen": -187.58120727539062, + "logps/rejected": -292.45263671875, + "loss": 0.1306, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2341868132352829, + "rewards/margins": 7.944581031799316, + "rewards/rejected": -8.178768157958984, + "step": 1237 + }, + { + "epoch": 1.62, + "learning_rate": 2.349603988384708e-05, + "logits/chosen": -1.667884349822998, + "logits/rejected": -1.6044398546218872, + "logps/chosen": -190.60089111328125, + "logps/rejected": -269.7427062988281, + "loss": 0.0677, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6046330332756042, + "rewards/margins": 6.621963977813721, + "rewards/rejected": -7.226597309112549, + "step": 1238 + }, + { + "epoch": 1.62, + "learning_rate": 2.3460276126669854e-05, + "logits/chosen": -1.7128525972366333, + "logits/rejected": -1.8230139017105103, + "logps/chosen": -156.34112548828125, + "logps/rejected": -260.3674621582031, + "loss": 0.0546, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.010287538170814514, + "rewards/margins": 8.259210586547852, + "rewards/rejected": -8.248923301696777, + "step": 1239 + }, + { + "epoch": 1.62, + "learning_rate": 2.342451553221752e-05, + "logits/chosen": -1.6798733472824097, + "logits/rejected": -1.649233102798462, + "logps/chosen": -156.37783813476562, + "logps/rejected": -280.7669677734375, + "loss": 0.0564, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8614262938499451, + "rewards/margins": 8.050861358642578, + "rewards/rejected": -8.912288665771484, + "step": 1240 + }, + { + "epoch": 1.62, + "learning_rate": 2.338875817394539e-05, + "logits/chosen": -1.518446922302246, + "logits/rejected": -1.5001682043075562, + "logps/chosen": -187.62730407714844, + "logps/rejected": -274.3509826660156, + "loss": 0.1006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.32726746797561646, + "rewards/margins": 8.384183883666992, + "rewards/rejected": -8.71145248413086, + "step": 1241 + }, + { + "epoch": 1.63, + "learning_rate": 2.3353004125302142e-05, + "logits/chosen": -1.7005969285964966, + "logits/rejected": -1.7075004577636719, + "logps/chosen": -157.98902893066406, + "logps/rejected": -223.71835327148438, + "loss": 0.1194, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0016721785068511963, + "rewards/margins": 7.834788799285889, + "rewards/rejected": -7.836461067199707, + "step": 1242 + }, + { + "epoch": 1.63, + "learning_rate": 2.331725345972968e-05, + "logits/chosen": -1.7345434427261353, + "logits/rejected": -1.7812566757202148, + "logps/chosen": -184.71971130371094, + "logps/rejected": -250.2111358642578, + "loss": 0.0523, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.378732681274414, + "rewards/margins": 10.200021743774414, + "rewards/rejected": -8.821290016174316, + "step": 1243 + }, + { + "epoch": 1.63, + "learning_rate": 2.32815062506629e-05, + "logits/chosen": -1.353702425956726, + "logits/rejected": -1.415529727935791, + "logps/chosen": -186.42298889160156, + "logps/rejected": -300.6753845214844, + "loss": 0.0701, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.035717248916626, + "rewards/margins": 7.932292461395264, + "rewards/rejected": -8.968009948730469, + "step": 1244 + }, + { + "epoch": 1.63, + "learning_rate": 2.3245762571529667e-05, + "logits/chosen": -1.7484427690505981, + "logits/rejected": -1.813354730606079, + "logps/chosen": -138.3816680908203, + "logps/rejected": -247.28982543945312, + "loss": 0.01, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5285117626190186, + "rewards/margins": 10.118577003479004, + "rewards/rejected": -8.590065956115723, + "step": 1245 + }, + { + "epoch": 1.63, + "learning_rate": 2.3210022495750552e-05, + "logits/chosen": -1.7814013957977295, + "logits/rejected": -1.7743293046951294, + "logps/chosen": -160.9421844482422, + "logps/rejected": -270.0235595703125, + "loss": 0.0491, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5635257959365845, + "rewards/margins": 8.027871131896973, + "rewards/rejected": -8.59139633178711, + "step": 1246 + }, + { + "epoch": 1.63, + "learning_rate": 2.317428609673873e-05, + "logits/chosen": -1.6264969110488892, + "logits/rejected": -1.6207430362701416, + "logps/chosen": -183.52662658691406, + "logps/rejected": -268.860107421875, + "loss": 0.0333, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4351687431335449, + "rewards/margins": 8.183597564697266, + "rewards/rejected": -8.618766784667969, + "step": 1247 + }, + { + "epoch": 1.63, + "learning_rate": 2.3138553447899835e-05, + "logits/chosen": -1.7999589443206787, + "logits/rejected": -1.7919996976852417, + "logps/chosen": -164.28555297851562, + "logps/rejected": -226.3788604736328, + "loss": 0.0652, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5257167220115662, + "rewards/margins": 6.826259613037109, + "rewards/rejected": -7.35197639465332, + "step": 1248 + }, + { + "epoch": 1.63, + "learning_rate": 2.3102824622631803e-05, + "logits/chosen": -1.8032572269439697, + "logits/rejected": -1.8391257524490356, + "logps/chosen": -173.9495849609375, + "logps/rejected": -220.82058715820312, + "loss": 0.1733, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.717067003250122, + "rewards/margins": 4.9027791023254395, + "rewards/rejected": -6.619846343994141, + "step": 1249 + }, + { + "epoch": 1.64, + "learning_rate": 2.3067099694324686e-05, + "logits/chosen": -1.700643539428711, + "logits/rejected": -1.704951524734497, + "logps/chosen": -172.47607421875, + "logps/rejected": -280.7120056152344, + "loss": 0.0485, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.6809951066970825, + "rewards/margins": 9.853233337402344, + "rewards/rejected": -9.17223834991455, + "step": 1250 + }, + { + "epoch": 1.64, + "learning_rate": 2.3031378736360562e-05, + "logits/chosen": -1.4982144832611084, + "logits/rejected": -1.5640318393707275, + "logps/chosen": -180.1248016357422, + "logps/rejected": -284.48895263671875, + "loss": 0.0503, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.35612598061561584, + "rewards/margins": 10.218582153320312, + "rewards/rejected": -9.862455368041992, + "step": 1251 + }, + { + "epoch": 1.64, + "learning_rate": 2.299566182211333e-05, + "logits/chosen": -1.6494331359863281, + "logits/rejected": -1.7211148738861084, + "logps/chosen": -162.5277557373047, + "logps/rejected": -258.582763671875, + "loss": 0.0901, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.026039481163025, + "rewards/margins": 7.643625259399414, + "rewards/rejected": -8.669665336608887, + "step": 1252 + }, + { + "epoch": 1.64, + "learning_rate": 2.295994902494861e-05, + "logits/chosen": -1.7468377351760864, + "logits/rejected": -1.7673417329788208, + "logps/chosen": -167.36648559570312, + "logps/rejected": -246.13308715820312, + "loss": 0.0511, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3232724666595459, + "rewards/margins": 7.1906232833862305, + "rewards/rejected": -7.5138959884643555, + "step": 1253 + }, + { + "epoch": 1.64, + "learning_rate": 2.292424041822355e-05, + "logits/chosen": -1.750199794769287, + "logits/rejected": -1.7258578538894653, + "logps/chosen": -198.74107360839844, + "logps/rejected": -277.41876220703125, + "loss": 0.1352, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3927921652793884, + "rewards/margins": 7.941418170928955, + "rewards/rejected": -7.548626899719238, + "step": 1254 + }, + { + "epoch": 1.64, + "learning_rate": 2.2888536075286675e-05, + "logits/chosen": -1.472868800163269, + "logits/rejected": -1.5704387426376343, + "logps/chosen": -171.90785217285156, + "logps/rejected": -257.0007019042969, + "loss": 0.0514, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2161782681941986, + "rewards/margins": 9.074492454528809, + "rewards/rejected": -8.858314514160156, + "step": 1255 + }, + { + "epoch": 1.64, + "learning_rate": 2.2852836069477773e-05, + "logits/chosen": -1.6353071928024292, + "logits/rejected": -1.6045923233032227, + "logps/chosen": -207.18911743164062, + "logps/rejected": -332.3581848144531, + "loss": 0.0142, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04016450047492981, + "rewards/margins": 10.133883476257324, + "rewards/rejected": -10.093719482421875, + "step": 1256 + }, + { + "epoch": 1.65, + "learning_rate": 2.281714047412773e-05, + "logits/chosen": -1.6737552881240845, + "logits/rejected": -1.6502115726470947, + "logps/chosen": -200.25421142578125, + "logps/rejected": -265.3543701171875, + "loss": 0.0943, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3247901201248169, + "rewards/margins": 8.3992280960083, + "rewards/rejected": -8.724019050598145, + "step": 1257 + }, + { + "epoch": 1.65, + "learning_rate": 2.2781449362558347e-05, + "logits/chosen": -1.9014949798583984, + "logits/rejected": -1.8843990564346313, + "logps/chosen": -151.2793731689453, + "logps/rejected": -251.31907653808594, + "loss": 0.0887, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5275027751922607, + "rewards/margins": 7.974277973175049, + "rewards/rejected": -8.50178050994873, + "step": 1258 + }, + { + "epoch": 1.65, + "learning_rate": 2.2745762808082223e-05, + "logits/chosen": -1.9266729354858398, + "logits/rejected": -1.9442236423492432, + "logps/chosen": -171.84243774414062, + "logps/rejected": -272.5491027832031, + "loss": 0.0876, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.10790953785181046, + "rewards/margins": 10.097283363342285, + "rewards/rejected": -9.989373207092285, + "step": 1259 + }, + { + "epoch": 1.65, + "learning_rate": 2.2710080884002632e-05, + "logits/chosen": -1.7961838245391846, + "logits/rejected": -1.8136701583862305, + "logps/chosen": -181.12777709960938, + "logps/rejected": -242.9888153076172, + "loss": 0.1576, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7358523607254028, + "rewards/margins": 6.861660480499268, + "rewards/rejected": -7.597512722015381, + "step": 1260 + }, + { + "epoch": 1.65, + "learning_rate": 2.2674403663613267e-05, + "logits/chosen": -1.9694092273712158, + "logits/rejected": -2.0717010498046875, + "logps/chosen": -193.0343780517578, + "logps/rejected": -302.82977294921875, + "loss": 0.0748, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2841745615005493, + "rewards/margins": 7.759544849395752, + "rewards/rejected": -9.043719291687012, + "step": 1261 + }, + { + "epoch": 1.65, + "learning_rate": 2.263873122019822e-05, + "logits/chosen": -1.662811279296875, + "logits/rejected": -1.5405888557434082, + "logps/chosen": -201.64263916015625, + "logps/rejected": -251.61192321777344, + "loss": 0.1166, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.42639869451522827, + "rewards/margins": 8.240140914916992, + "rewards/rejected": -7.813741683959961, + "step": 1262 + }, + { + "epoch": 1.65, + "learning_rate": 2.2603063627031744e-05, + "logits/chosen": -1.662422776222229, + "logits/rejected": -1.686476469039917, + "logps/chosen": -190.2987060546875, + "logps/rejected": -294.76416015625, + "loss": 0.0702, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.09475106000900269, + "rewards/margins": 8.890002250671387, + "rewards/rejected": -8.79525089263916, + "step": 1263 + }, + { + "epoch": 1.65, + "learning_rate": 2.2567400957378132e-05, + "logits/chosen": -1.927976369857788, + "logits/rejected": -1.8747197389602661, + "logps/chosen": -196.0269317626953, + "logps/rejected": -266.9189758300781, + "loss": 0.1008, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.015792138874530792, + "rewards/margins": 7.972220420837402, + "rewards/rejected": -7.956428527832031, + "step": 1264 + }, + { + "epoch": 1.66, + "learning_rate": 2.253174328449158e-05, + "logits/chosen": -1.766976237297058, + "logits/rejected": -1.812227487564087, + "logps/chosen": -154.96624755859375, + "logps/rejected": -228.69532775878906, + "loss": 0.0967, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.341795951128006, + "rewards/margins": 6.919504165649414, + "rewards/rejected": -7.261300563812256, + "step": 1265 + }, + { + "epoch": 1.66, + "learning_rate": 2.2496090681615984e-05, + "logits/chosen": -1.614734411239624, + "logits/rejected": -1.6324650049209595, + "logps/chosen": -193.87318420410156, + "logps/rejected": -264.75823974609375, + "loss": 0.1718, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9917040467262268, + "rewards/margins": 5.997256278991699, + "rewards/rejected": -6.988959789276123, + "step": 1266 + }, + { + "epoch": 1.66, + "learning_rate": 2.246044322198486e-05, + "logits/chosen": -1.4767704010009766, + "logits/rejected": -1.4471330642700195, + "logps/chosen": -211.3580780029297, + "logps/rejected": -279.1626281738281, + "loss": 0.0681, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6186728477478027, + "rewards/margins": 6.812000751495361, + "rewards/rejected": -7.430673599243164, + "step": 1267 + }, + { + "epoch": 1.66, + "learning_rate": 2.2424800978821146e-05, + "logits/chosen": -1.733639121055603, + "logits/rejected": -1.7162387371063232, + "logps/chosen": -191.9793701171875, + "logps/rejected": -306.0337219238281, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19160997867584229, + "rewards/margins": 9.62875747680664, + "rewards/rejected": -9.820367813110352, + "step": 1268 + }, + { + "epoch": 1.66, + "learning_rate": 2.238916402533706e-05, + "logits/chosen": -1.7573490142822266, + "logits/rejected": -1.7638121843338013, + "logps/chosen": -159.29444885253906, + "logps/rejected": -244.30816650390625, + "loss": 0.0776, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.25248706340789795, + "rewards/margins": 7.951773643493652, + "rewards/rejected": -8.20426082611084, + "step": 1269 + }, + { + "epoch": 1.66, + "learning_rate": 2.235353243473398e-05, + "logits/chosen": -1.848198413848877, + "logits/rejected": -1.8705389499664307, + "logps/chosen": -226.47897338867188, + "logps/rejected": -309.0074462890625, + "loss": 0.1366, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5574419498443604, + "rewards/margins": 8.435647010803223, + "rewards/rejected": -8.993088722229004, + "step": 1270 + }, + { + "epoch": 1.66, + "learning_rate": 2.231790628020222e-05, + "logits/chosen": -1.7706189155578613, + "logits/rejected": -1.7382731437683105, + "logps/chosen": -211.0347442626953, + "logps/rejected": -271.48065185546875, + "loss": 0.1263, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5742791891098022, + "rewards/margins": 6.914097785949707, + "rewards/rejected": -7.488377094268799, + "step": 1271 + }, + { + "epoch": 1.66, + "learning_rate": 2.228228563492098e-05, + "logits/chosen": -1.847198724746704, + "logits/rejected": -1.8329821825027466, + "logps/chosen": -190.62921142578125, + "logps/rejected": -285.3981628417969, + "loss": 0.1488, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.47729751467704773, + "rewards/margins": 8.345022201538086, + "rewards/rejected": -8.822319984436035, + "step": 1272 + }, + { + "epoch": 1.67, + "learning_rate": 2.224667057205811e-05, + "logits/chosen": -1.880344033241272, + "logits/rejected": -1.9345133304595947, + "logps/chosen": -155.05136108398438, + "logps/rejected": -265.0672912597656, + "loss": 0.0885, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.11232158541679382, + "rewards/margins": 9.258454322814941, + "rewards/rejected": -9.14613151550293, + "step": 1273 + }, + { + "epoch": 1.67, + "learning_rate": 2.2211061164769997e-05, + "logits/chosen": -1.867252230644226, + "logits/rejected": -1.9265720844268799, + "logps/chosen": -186.79556274414062, + "logps/rejected": -281.2059020996094, + "loss": 0.0198, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4724681079387665, + "rewards/margins": 8.82156753540039, + "rewards/rejected": -9.294034957885742, + "step": 1274 + }, + { + "epoch": 1.67, + "learning_rate": 2.2175457486201435e-05, + "logits/chosen": -1.7061909437179565, + "logits/rejected": -1.7160139083862305, + "logps/chosen": -158.32867431640625, + "logps/rejected": -234.9594268798828, + "loss": 0.1965, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5954041481018066, + "rewards/margins": 7.3393049240112305, + "rewards/rejected": -7.934709548950195, + "step": 1275 + }, + { + "epoch": 1.67, + "learning_rate": 2.2139859609485426e-05, + "logits/chosen": -1.5261914730072021, + "logits/rejected": -1.5450482368469238, + "logps/chosen": -177.5061492919922, + "logps/rejected": -304.0037841796875, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3365687131881714, + "rewards/margins": 10.240978240966797, + "rewards/rejected": -10.577546119689941, + "step": 1276 + }, + { + "epoch": 1.67, + "learning_rate": 2.2104267607743057e-05, + "logits/chosen": -1.81212317943573, + "logits/rejected": -1.8661532402038574, + "logps/chosen": -167.34146118164062, + "logps/rejected": -258.1644592285156, + "loss": 0.0896, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.41514402627944946, + "rewards/margins": 8.24533748626709, + "rewards/rejected": -8.660481452941895, + "step": 1277 + }, + { + "epoch": 1.67, + "learning_rate": 2.2068681554083345e-05, + "logits/chosen": -1.492951512336731, + "logits/rejected": -1.4607274532318115, + "logps/chosen": -160.21688842773438, + "logps/rejected": -251.48419189453125, + "loss": 0.0738, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7796900272369385, + "rewards/margins": 8.634302139282227, + "rewards/rejected": -9.413991928100586, + "step": 1278 + }, + { + "epoch": 1.67, + "learning_rate": 2.2033101521603113e-05, + "logits/chosen": -1.5999889373779297, + "logits/rejected": -1.5579280853271484, + "logps/chosen": -180.76455688476562, + "logps/rejected": -230.758544921875, + "loss": 0.0448, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.033610016107559204, + "rewards/margins": 7.108610153198242, + "rewards/rejected": -7.1422200202941895, + "step": 1279 + }, + { + "epoch": 1.68, + "learning_rate": 2.199752758338679e-05, + "logits/chosen": -1.7404762506484985, + "logits/rejected": -1.8205273151397705, + "logps/chosen": -163.6934356689453, + "logps/rejected": -254.3972625732422, + "loss": 0.1054, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7429052591323853, + "rewards/margins": 7.068746089935303, + "rewards/rejected": -7.811651706695557, + "step": 1280 + }, + { + "epoch": 1.68, + "learning_rate": 2.19619598125063e-05, + "logits/chosen": -1.7438024282455444, + "logits/rejected": -1.6977444887161255, + "logps/chosen": -209.1776580810547, + "logps/rejected": -279.10986328125, + "loss": 0.2044, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6635751724243164, + "rewards/margins": 6.604758262634277, + "rewards/rejected": -7.268333911895752, + "step": 1281 + }, + { + "epoch": 1.68, + "learning_rate": 2.192639828202089e-05, + "logits/chosen": -1.7248817682266235, + "logits/rejected": -1.7331123352050781, + "logps/chosen": -179.87646484375, + "logps/rejected": -281.8304138183594, + "loss": 0.0128, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.062381476163864136, + "rewards/margins": 10.009779930114746, + "rewards/rejected": -10.072161674499512, + "step": 1282 + }, + { + "epoch": 1.68, + "learning_rate": 2.1890843064976986e-05, + "logits/chosen": -1.7990466356277466, + "logits/rejected": -1.7838562726974487, + "logps/chosen": -196.99893188476562, + "logps/rejected": -287.0201416015625, + "loss": 0.0269, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.33407461643218994, + "rewards/margins": 10.04912281036377, + "rewards/rejected": -9.715047836303711, + "step": 1283 + }, + { + "epoch": 1.68, + "learning_rate": 2.1855294234408068e-05, + "logits/chosen": -1.820509910583496, + "logits/rejected": -1.8059285879135132, + "logps/chosen": -190.8967742919922, + "logps/rejected": -268.3674011230469, + "loss": 0.0489, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.42067429423332214, + "rewards/margins": 8.252467155456543, + "rewards/rejected": -7.831792831420898, + "step": 1284 + }, + { + "epoch": 1.68, + "learning_rate": 2.181975186333448e-05, + "logits/chosen": -1.6455038785934448, + "logits/rejected": -1.7375361919403076, + "logps/chosen": -134.8241424560547, + "logps/rejected": -221.16561889648438, + "loss": 0.0433, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012232035398483276, + "rewards/margins": 8.418768882751465, + "rewards/rejected": -8.430999755859375, + "step": 1285 + }, + { + "epoch": 1.68, + "learning_rate": 2.1784216024763284e-05, + "logits/chosen": -1.7884451150894165, + "logits/rejected": -1.7565916776657104, + "logps/chosen": -195.28054809570312, + "logps/rejected": -271.68017578125, + "loss": 0.0539, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.33611348271369934, + "rewards/margins": 9.849245071411133, + "rewards/rejected": -9.513132095336914, + "step": 1286 + }, + { + "epoch": 1.68, + "learning_rate": 2.1748686791688176e-05, + "logits/chosen": -2.0476958751678467, + "logits/rejected": -2.038017749786377, + "logps/chosen": -239.9565887451172, + "logps/rejected": -326.0540771484375, + "loss": 0.174, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9971805810928345, + "rewards/margins": 7.539958477020264, + "rewards/rejected": -9.537139892578125, + "step": 1287 + }, + { + "epoch": 1.69, + "learning_rate": 2.1713164237089203e-05, + "logits/chosen": -1.604038119316101, + "logits/rejected": -1.5910694599151611, + "logps/chosen": -171.3157958984375, + "logps/rejected": -255.69631958007812, + "loss": 0.1409, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1205165386199951, + "rewards/margins": 5.1685075759887695, + "rewards/rejected": -6.289024353027344, + "step": 1288 + }, + { + "epoch": 1.69, + "learning_rate": 2.167764843393277e-05, + "logits/chosen": -1.7501810789108276, + "logits/rejected": -1.7762590646743774, + "logps/chosen": -167.06053161621094, + "logps/rejected": -273.9512939453125, + "loss": 0.0469, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.331756055355072, + "rewards/margins": 8.977778434753418, + "rewards/rejected": -8.646021842956543, + "step": 1289 + }, + { + "epoch": 1.69, + "learning_rate": 2.1642139455171366e-05, + "logits/chosen": -1.7394685745239258, + "logits/rejected": -1.7270408868789673, + "logps/chosen": -153.25633239746094, + "logps/rejected": -253.04904174804688, + "loss": 0.0534, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.20436710119247437, + "rewards/margins": 8.951108932495117, + "rewards/rejected": -8.74674129486084, + "step": 1290 + }, + { + "epoch": 1.69, + "learning_rate": 2.160663737374348e-05, + "logits/chosen": -1.7180070877075195, + "logits/rejected": -1.7248402833938599, + "logps/chosen": -152.55325317382812, + "logps/rejected": -222.6513671875, + "loss": 0.1831, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1678484678268433, + "rewards/margins": 5.055422782897949, + "rewards/rejected": -6.223270893096924, + "step": 1291 + }, + { + "epoch": 1.69, + "learning_rate": 2.1571142262573457e-05, + "logits/chosen": -1.7961055040359497, + "logits/rejected": -1.7868380546569824, + "logps/chosen": -164.25143432617188, + "logps/rejected": -228.6146697998047, + "loss": 0.0524, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.5518704652786255, + "rewards/margins": 8.316145896911621, + "rewards/rejected": -7.764275074005127, + "step": 1292 + }, + { + "epoch": 1.69, + "learning_rate": 2.153565419457126e-05, + "logits/chosen": -1.6956509351730347, + "logits/rejected": -1.6393764019012451, + "logps/chosen": -173.90548706054688, + "logps/rejected": -249.1767578125, + "loss": 0.0788, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.560000479221344, + "rewards/margins": 7.636308193206787, + "rewards/rejected": -8.196309089660645, + "step": 1293 + }, + { + "epoch": 1.69, + "learning_rate": 2.1500173242632446e-05, + "logits/chosen": -1.7347395420074463, + "logits/rejected": -1.8785431385040283, + "logps/chosen": -159.41952514648438, + "logps/rejected": -277.9622497558594, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.48645877838134766, + "rewards/margins": 8.919507026672363, + "rewards/rejected": -8.433048248291016, + "step": 1294 + }, + { + "epoch": 1.69, + "learning_rate": 2.1464699479637934e-05, + "logits/chosen": -1.7078378200531006, + "logits/rejected": -1.6574723720550537, + "logps/chosen": -175.77883911132812, + "logps/rejected": -250.67721557617188, + "loss": 0.0582, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.21977680921554565, + "rewards/margins": 8.613102912902832, + "rewards/rejected": -8.832879066467285, + "step": 1295 + }, + { + "epoch": 1.7, + "learning_rate": 2.1429232978453862e-05, + "logits/chosen": -1.6148117780685425, + "logits/rejected": -1.5872223377227783, + "logps/chosen": -201.86769104003906, + "logps/rejected": -301.83056640625, + "loss": 0.1007, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.1088818609714508, + "rewards/margins": 10.18004035949707, + "rewards/rejected": -10.288922309875488, + "step": 1296 + }, + { + "epoch": 1.7, + "learning_rate": 2.1393773811931483e-05, + "logits/chosen": -1.835828423500061, + "logits/rejected": -1.8179807662963867, + "logps/chosen": -164.35507202148438, + "logps/rejected": -210.44248962402344, + "loss": 0.1349, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8428851962089539, + "rewards/margins": 6.351404666900635, + "rewards/rejected": -7.1942901611328125, + "step": 1297 + }, + { + "epoch": 1.7, + "learning_rate": 2.135832205290696e-05, + "logits/chosen": -1.5887879133224487, + "logits/rejected": -1.6757391691207886, + "logps/chosen": -180.8358154296875, + "logps/rejected": -307.1868591308594, + "loss": 0.0572, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6378967761993408, + "rewards/margins": 9.824544906616211, + "rewards/rejected": -10.462441444396973, + "step": 1298 + }, + { + "epoch": 1.7, + "learning_rate": 2.132287777420124e-05, + "logits/chosen": -1.7509876489639282, + "logits/rejected": -1.7373014688491821, + "logps/chosen": -246.9702606201172, + "logps/rejected": -283.8993225097656, + "loss": 0.2323, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1223161220550537, + "rewards/margins": 5.97442626953125, + "rewards/rejected": -8.096742630004883, + "step": 1299 + }, + { + "epoch": 1.7, + "learning_rate": 2.128744104861991e-05, + "logits/chosen": -1.9548816680908203, + "logits/rejected": -2.009321928024292, + "logps/chosen": -157.5906982421875, + "logps/rejected": -228.1550750732422, + "loss": 0.2636, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9159742593765259, + "rewards/margins": 6.327787399291992, + "rewards/rejected": -7.243762016296387, + "step": 1300 + }, + { + "epoch": 1.7, + "learning_rate": 2.125201194895305e-05, + "logits/chosen": -1.9847517013549805, + "logits/rejected": -1.9725074768066406, + "logps/chosen": -192.99295043945312, + "logps/rejected": -294.5769958496094, + "loss": 0.0183, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010755106806755066, + "rewards/margins": 10.29660701751709, + "rewards/rejected": -10.307361602783203, + "step": 1301 + }, + { + "epoch": 1.7, + "learning_rate": 2.121659054797507e-05, + "logits/chosen": -1.7674875259399414, + "logits/rejected": -1.764140009880066, + "logps/chosen": -169.094482421875, + "logps/rejected": -274.3408508300781, + "loss": 0.0187, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15594688057899475, + "rewards/margins": 9.34589672088623, + "rewards/rejected": -9.189949989318848, + "step": 1302 + }, + { + "epoch": 1.71, + "learning_rate": 2.118117691844456e-05, + "logits/chosen": -1.745078444480896, + "logits/rejected": -1.6791242361068726, + "logps/chosen": -189.05062866210938, + "logps/rejected": -276.6964111328125, + "loss": 0.0301, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9232670068740845, + "rewards/margins": 8.875031471252441, + "rewards/rejected": -9.798297882080078, + "step": 1303 + }, + { + "epoch": 1.71, + "learning_rate": 2.1145771133104157e-05, + "logits/chosen": -1.7861813306808472, + "logits/rejected": -1.8145291805267334, + "logps/chosen": -185.99569702148438, + "logps/rejected": -277.31292724609375, + "loss": 0.0345, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2370024472475052, + "rewards/margins": 8.88377857208252, + "rewards/rejected": -9.120781898498535, + "step": 1304 + }, + { + "epoch": 1.71, + "learning_rate": 2.111037326468037e-05, + "logits/chosen": -1.559670090675354, + "logits/rejected": -1.511789083480835, + "logps/chosen": -207.8978271484375, + "logps/rejected": -273.0669250488281, + "loss": 0.2938, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.404197096824646, + "rewards/margins": 6.444756984710693, + "rewards/rejected": -7.848954200744629, + "step": 1305 + }, + { + "epoch": 1.71, + "learning_rate": 2.107498338588347e-05, + "logits/chosen": -1.8835152387619019, + "logits/rejected": -1.8800623416900635, + "logps/chosen": -185.1143798828125, + "logps/rejected": -284.5544128417969, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6539617776870728, + "rewards/margins": 9.118383407592773, + "rewards/rejected": -9.772345542907715, + "step": 1306 + }, + { + "epoch": 1.71, + "learning_rate": 2.1039601569407298e-05, + "logits/chosen": -1.621598243713379, + "logits/rejected": -1.6100132465362549, + "logps/chosen": -173.21688842773438, + "logps/rejected": -247.858642578125, + "loss": 0.1131, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4944629371166229, + "rewards/margins": 8.553914070129395, + "rewards/rejected": -8.05945110321045, + "step": 1307 + }, + { + "epoch": 1.71, + "learning_rate": 2.1004227887929133e-05, + "logits/chosen": -1.7362234592437744, + "logits/rejected": -1.7847158908843994, + "logps/chosen": -164.24559020996094, + "logps/rejected": -266.5970458984375, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7328810691833496, + "rewards/margins": 9.830469131469727, + "rewards/rejected": -9.097589492797852, + "step": 1308 + }, + { + "epoch": 1.71, + "learning_rate": 2.0968862414109567e-05, + "logits/chosen": -1.8340836763381958, + "logits/rejected": -1.8327112197875977, + "logps/chosen": -151.6320343017578, + "logps/rejected": -255.59451293945312, + "loss": 0.0601, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.7284873127937317, + "rewards/margins": 9.933165550231934, + "rewards/rejected": -9.204678535461426, + "step": 1309 + }, + { + "epoch": 1.71, + "learning_rate": 2.0933505220592295e-05, + "logits/chosen": -1.7697275876998901, + "logits/rejected": -1.719412088394165, + "logps/chosen": -195.12387084960938, + "logps/rejected": -312.44342041015625, + "loss": 0.0585, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5034978985786438, + "rewards/margins": 9.798672676086426, + "rewards/rejected": -10.302170753479004, + "step": 1310 + }, + { + "epoch": 1.72, + "learning_rate": 2.0898156380004034e-05, + "logits/chosen": -1.5935040712356567, + "logits/rejected": -1.5950825214385986, + "logps/chosen": -203.40814208984375, + "logps/rejected": -260.3504943847656, + "loss": 0.1262, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4168576002120972, + "rewards/margins": 5.559474468231201, + "rewards/rejected": -6.976332664489746, + "step": 1311 + }, + { + "epoch": 1.72, + "learning_rate": 2.086281596495434e-05, + "logits/chosen": -1.5992827415466309, + "logits/rejected": -1.6461384296417236, + "logps/chosen": -179.04098510742188, + "logps/rejected": -259.34039306640625, + "loss": 0.0613, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2309633493423462, + "rewards/margins": 8.073208808898926, + "rewards/rejected": -9.304171562194824, + "step": 1312 + }, + { + "epoch": 1.72, + "learning_rate": 2.0827484048035445e-05, + "logits/chosen": -1.8122634887695312, + "logits/rejected": -1.835162878036499, + "logps/chosen": -200.29345703125, + "logps/rejected": -304.5955810546875, + "loss": 0.068, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.44960421323776245, + "rewards/margins": 8.956765174865723, + "rewards/rejected": -9.406368255615234, + "step": 1313 + }, + { + "epoch": 1.72, + "learning_rate": 2.0792160701822157e-05, + "logits/chosen": -1.8225831985473633, + "logits/rejected": -1.8451952934265137, + "logps/chosen": -174.33929443359375, + "logps/rejected": -291.6876220703125, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1373300552368164, + "rewards/margins": 12.134490013122559, + "rewards/rejected": -10.997159004211426, + "step": 1314 + }, + { + "epoch": 1.72, + "learning_rate": 2.0756845998871623e-05, + "logits/chosen": -1.530027151107788, + "logits/rejected": -1.582059383392334, + "logps/chosen": -268.1092529296875, + "logps/rejected": -405.32147216796875, + "loss": 0.0958, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6886293888092041, + "rewards/margins": 9.853059768676758, + "rewards/rejected": -10.541688919067383, + "step": 1315 + }, + { + "epoch": 1.72, + "learning_rate": 2.07215400117233e-05, + "logits/chosen": -1.7971422672271729, + "logits/rejected": -1.8060518503189087, + "logps/chosen": -190.48155212402344, + "logps/rejected": -260.5394287109375, + "loss": 0.0841, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7050102949142456, + "rewards/margins": 8.51248550415039, + "rewards/rejected": -9.217495918273926, + "step": 1316 + }, + { + "epoch": 1.72, + "learning_rate": 2.068624281289871e-05, + "logits/chosen": -1.7738057374954224, + "logits/rejected": -1.8274152278900146, + "logps/chosen": -148.57972717285156, + "logps/rejected": -297.8290710449219, + "loss": 0.013, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23370328545570374, + "rewards/margins": 10.938631057739258, + "rewards/rejected": -10.704928398132324, + "step": 1317 + }, + { + "epoch": 1.72, + "learning_rate": 2.065095447490131e-05, + "logits/chosen": -1.6670665740966797, + "logits/rejected": -1.6584620475769043, + "logps/chosen": -178.33770751953125, + "logps/rejected": -240.07217407226562, + "loss": 0.0824, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5157214999198914, + "rewards/margins": 5.871995449066162, + "rewards/rejected": -6.387716770172119, + "step": 1318 + }, + { + "epoch": 1.73, + "learning_rate": 2.0615675070216393e-05, + "logits/chosen": -1.8823175430297852, + "logits/rejected": -1.8708575963974, + "logps/chosen": -186.8196563720703, + "logps/rejected": -257.4454345703125, + "loss": 0.1031, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1613101959228516, + "rewards/margins": 6.164236545562744, + "rewards/rejected": -7.325546741485596, + "step": 1319 + }, + { + "epoch": 1.73, + "learning_rate": 2.0580404671310878e-05, + "logits/chosen": -1.7138571739196777, + "logits/rejected": -1.7480685710906982, + "logps/chosen": -183.43792724609375, + "logps/rejected": -254.72315979003906, + "loss": 0.1456, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9653207063674927, + "rewards/margins": 7.115040302276611, + "rewards/rejected": -8.080361366271973, + "step": 1320 + }, + { + "epoch": 1.73, + "learning_rate": 2.0545143350633177e-05, + "logits/chosen": -1.815916895866394, + "logits/rejected": -1.8253110647201538, + "logps/chosen": -175.69773864746094, + "logps/rejected": -279.3526611328125, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.013257145881652832, + "rewards/margins": 9.166781425476074, + "rewards/rejected": -9.180039405822754, + "step": 1321 + }, + { + "epoch": 1.73, + "learning_rate": 2.0509891180613066e-05, + "logits/chosen": -1.8629612922668457, + "logits/rejected": -1.9563696384429932, + "logps/chosen": -227.02993774414062, + "logps/rejected": -319.5457763671875, + "loss": 0.0458, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05277013033628464, + "rewards/margins": 9.28923225402832, + "rewards/rejected": -9.342002868652344, + "step": 1322 + }, + { + "epoch": 1.73, + "learning_rate": 2.0474648233661543e-05, + "logits/chosen": -1.7979570627212524, + "logits/rejected": -1.7925100326538086, + "logps/chosen": -177.50460815429688, + "logps/rejected": -258.33087158203125, + "loss": 0.0531, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6101075410842896, + "rewards/margins": 7.355088710784912, + "rewards/rejected": -8.965195655822754, + "step": 1323 + }, + { + "epoch": 1.73, + "learning_rate": 2.0439414582170628e-05, + "logits/chosen": -1.7554646730422974, + "logits/rejected": -1.71324622631073, + "logps/chosen": -159.15635681152344, + "logps/rejected": -266.59832763671875, + "loss": 0.1343, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.06593817472457886, + "rewards/margins": 8.862637519836426, + "rewards/rejected": -8.92857551574707, + "step": 1324 + }, + { + "epoch": 1.73, + "learning_rate": 2.040419029851328e-05, + "logits/chosen": -1.4438438415527344, + "logits/rejected": -1.4386115074157715, + "logps/chosen": -183.35528564453125, + "logps/rejected": -285.39447021484375, + "loss": 0.0774, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.21853740513324738, + "rewards/margins": 9.111807823181152, + "rewards/rejected": -9.330345153808594, + "step": 1325 + }, + { + "epoch": 1.74, + "learning_rate": 2.0368975455043178e-05, + "logits/chosen": -1.765581727027893, + "logits/rejected": -1.8092378377914429, + "logps/chosen": -170.90481567382812, + "logps/rejected": -260.81414794921875, + "loss": 0.0555, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00043527036905288696, + "rewards/margins": 8.51809310913086, + "rewards/rejected": -8.518528938293457, + "step": 1326 + }, + { + "epoch": 1.74, + "learning_rate": 2.033377012409463e-05, + "logits/chosen": -1.9696861505508423, + "logits/rejected": -1.973785638809204, + "logps/chosen": -254.8365478515625, + "logps/rejected": -322.2008972167969, + "loss": 0.1639, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3494682312011719, + "rewards/margins": 7.652532577514648, + "rewards/rejected": -9.00200080871582, + "step": 1327 + }, + { + "epoch": 1.74, + "learning_rate": 2.0298574377982427e-05, + "logits/chosen": -1.6516728401184082, + "logits/rejected": -1.6783921718597412, + "logps/chosen": -191.75732421875, + "logps/rejected": -286.771484375, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9843169450759888, + "rewards/margins": 8.601726531982422, + "rewards/rejected": -9.586044311523438, + "step": 1328 + }, + { + "epoch": 1.74, + "learning_rate": 2.026338828900163e-05, + "logits/chosen": -1.4632774591445923, + "logits/rejected": -1.479552984237671, + "logps/chosen": -160.7485809326172, + "logps/rejected": -269.3157958984375, + "loss": 0.0667, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.4180791676044464, + "rewards/margins": 9.03553295135498, + "rewards/rejected": -8.617453575134277, + "step": 1329 + }, + { + "epoch": 1.74, + "learning_rate": 2.022821192942749e-05, + "logits/chosen": -1.9523584842681885, + "logits/rejected": -1.91499662399292, + "logps/chosen": -213.07022094726562, + "logps/rejected": -321.30474853515625, + "loss": 0.1019, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3731907606124878, + "rewards/margins": 6.2876296043396, + "rewards/rejected": -7.660820007324219, + "step": 1330 + }, + { + "epoch": 1.74, + "learning_rate": 2.0193045371515276e-05, + "logits/chosen": -1.7651207447052002, + "logits/rejected": -1.7377843856811523, + "logps/chosen": -202.41172790527344, + "logps/rejected": -246.73471069335938, + "loss": 0.0742, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6990616917610168, + "rewards/margins": 5.546772480010986, + "rewards/rejected": -6.2458343505859375, + "step": 1331 + }, + { + "epoch": 1.74, + "learning_rate": 2.015788868750009e-05, + "logits/chosen": -1.665324330329895, + "logits/rejected": -1.7027934789657593, + "logps/chosen": -167.6494140625, + "logps/rejected": -249.49591064453125, + "loss": 0.011, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0408804416656494, + "rewards/margins": 7.963379383087158, + "rewards/rejected": -9.00425910949707, + "step": 1332 + }, + { + "epoch": 1.74, + "learning_rate": 2.0122741949596797e-05, + "logits/chosen": -1.7280189990997314, + "logits/rejected": -1.67861008644104, + "logps/chosen": -155.95278930664062, + "logps/rejected": -251.91842651367188, + "loss": 0.0331, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05395439267158508, + "rewards/margins": 7.499855041503906, + "rewards/rejected": -7.445900917053223, + "step": 1333 + }, + { + "epoch": 1.75, + "learning_rate": 2.00876052299998e-05, + "logits/chosen": -1.8224700689315796, + "logits/rejected": -1.81111741065979, + "logps/chosen": -190.61187744140625, + "logps/rejected": -271.1971435546875, + "loss": 0.0218, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6293432116508484, + "rewards/margins": 8.750373840332031, + "rewards/rejected": -9.379716873168945, + "step": 1334 + }, + { + "epoch": 1.75, + "learning_rate": 2.0052478600882935e-05, + "logits/chosen": -1.9608765840530396, + "logits/rejected": -1.9222817420959473, + "logps/chosen": -169.9017333984375, + "logps/rejected": -252.9747314453125, + "loss": 0.0845, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1037288904190063, + "rewards/margins": 6.922942638397217, + "rewards/rejected": -8.026671409606934, + "step": 1335 + }, + { + "epoch": 1.75, + "learning_rate": 2.001736213439933e-05, + "logits/chosen": -1.8816357851028442, + "logits/rejected": -1.877584457397461, + "logps/chosen": -204.36166381835938, + "logps/rejected": -269.41058349609375, + "loss": 0.0543, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6804943084716797, + "rewards/margins": 7.047283172607422, + "rewards/rejected": -7.727778434753418, + "step": 1336 + }, + { + "epoch": 1.75, + "learning_rate": 1.9982255902681186e-05, + "logits/chosen": -1.8288568258285522, + "logits/rejected": -1.8648662567138672, + "logps/chosen": -149.5530548095703, + "logps/rejected": -229.44894409179688, + "loss": 0.0115, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1307733654975891, + "rewards/margins": 7.908245086669922, + "rewards/rejected": -8.039018630981445, + "step": 1337 + }, + { + "epoch": 1.75, + "learning_rate": 1.9947159977839736e-05, + "logits/chosen": -1.5987966060638428, + "logits/rejected": -1.6406404972076416, + "logps/chosen": -186.7511444091797, + "logps/rejected": -268.5368957519531, + "loss": 0.0921, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.33627110719680786, + "rewards/margins": 7.97405481338501, + "rewards/rejected": -8.310325622558594, + "step": 1338 + }, + { + "epoch": 1.75, + "learning_rate": 1.991207443196501e-05, + "logits/chosen": -1.9243892431259155, + "logits/rejected": -1.9510371685028076, + "logps/chosen": -181.88169860839844, + "logps/rejected": -256.4344177246094, + "loss": 0.0943, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0038692057132720947, + "rewards/margins": 7.833153247833252, + "rewards/rejected": -7.829283714294434, + "step": 1339 + }, + { + "epoch": 1.75, + "learning_rate": 1.987699933712573e-05, + "logits/chosen": -1.9363564252853394, + "logits/rejected": -1.967279076576233, + "logps/chosen": -152.5673065185547, + "logps/rejected": -211.77642822265625, + "loss": 0.1225, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9284152388572693, + "rewards/margins": 5.928248405456543, + "rewards/rejected": -6.856664657592773, + "step": 1340 + }, + { + "epoch": 1.75, + "learning_rate": 1.9841934765369153e-05, + "logits/chosen": -1.6056597232818604, + "logits/rejected": -1.6428066492080688, + "logps/chosen": -181.000732421875, + "logps/rejected": -279.1212158203125, + "loss": 0.0672, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4446088075637817, + "rewards/margins": 8.463523864746094, + "rewards/rejected": -9.908133506774902, + "step": 1341 + }, + { + "epoch": 1.76, + "learning_rate": 1.9806880788720916e-05, + "logits/chosen": -1.6229007244110107, + "logits/rejected": -1.7070560455322266, + "logps/chosen": -181.4785919189453, + "logps/rejected": -303.48126220703125, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1148478984832764, + "rewards/margins": 10.633306503295898, + "rewards/rejected": -11.748153686523438, + "step": 1342 + }, + { + "epoch": 1.76, + "learning_rate": 1.977183747918489e-05, + "logits/chosen": -1.9572162628173828, + "logits/rejected": -1.9756476879119873, + "logps/chosen": -166.25404357910156, + "logps/rejected": -246.96766662597656, + "loss": 0.0425, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6223886013031006, + "rewards/margins": 7.681982517242432, + "rewards/rejected": -8.30437183380127, + "step": 1343 + }, + { + "epoch": 1.76, + "learning_rate": 1.9736804908743033e-05, + "logits/chosen": -1.9741904735565186, + "logits/rejected": -1.9375983476638794, + "logps/chosen": -182.2660369873047, + "logps/rejected": -255.11367797851562, + "loss": 0.0681, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6400654315948486, + "rewards/margins": 6.824597358703613, + "rewards/rejected": -7.464663505554199, + "step": 1344 + }, + { + "epoch": 1.76, + "learning_rate": 1.9701783149355255e-05, + "logits/chosen": -1.6570680141448975, + "logits/rejected": -1.6639678478240967, + "logps/chosen": -181.16786193847656, + "logps/rejected": -285.6687316894531, + "loss": 0.0649, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7195628881454468, + "rewards/margins": 8.433355331420898, + "rewards/rejected": -9.152917861938477, + "step": 1345 + }, + { + "epoch": 1.76, + "learning_rate": 1.9666772272959253e-05, + "logits/chosen": -1.2811121940612793, + "logits/rejected": -1.245220422744751, + "logps/chosen": -197.99786376953125, + "logps/rejected": -253.82801818847656, + "loss": 0.1359, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9691623449325562, + "rewards/margins": 7.014854431152344, + "rewards/rejected": -7.9840168952941895, + "step": 1346 + }, + { + "epoch": 1.76, + "learning_rate": 1.9631772351470383e-05, + "logits/chosen": -1.294917345046997, + "logits/rejected": -1.3416192531585693, + "logps/chosen": -199.01275634765625, + "logps/rejected": -314.4172668457031, + "loss": 0.0454, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.23009619116783142, + "rewards/margins": 10.637429237365723, + "rewards/rejected": -10.86752700805664, + "step": 1347 + }, + { + "epoch": 1.76, + "learning_rate": 1.959678345678146e-05, + "logits/chosen": -2.0021121501922607, + "logits/rejected": -2.0186562538146973, + "logps/chosen": -199.15237426757812, + "logps/rejected": -268.7312927246094, + "loss": 0.0177, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5120872259140015, + "rewards/margins": 8.405652046203613, + "rewards/rejected": -8.917738914489746, + "step": 1348 + }, + { + "epoch": 1.77, + "learning_rate": 1.9561805660762684e-05, + "logits/chosen": -1.9980536699295044, + "logits/rejected": -2.0102148056030273, + "logps/chosen": -234.97825622558594, + "logps/rejected": -276.199462890625, + "loss": 0.1, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3256834745407104, + "rewards/margins": 6.028922080993652, + "rewards/rejected": -7.354605674743652, + "step": 1349 + }, + { + "epoch": 1.77, + "learning_rate": 1.952683903526145e-05, + "logits/chosen": -1.8489958047866821, + "logits/rejected": -1.9247429370880127, + "logps/chosen": -197.1573028564453, + "logps/rejected": -292.5380554199219, + "loss": 0.0822, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.32317784428596497, + "rewards/margins": 10.997550010681152, + "rewards/rejected": -10.674371719360352, + "step": 1350 + }, + { + "epoch": 1.77, + "learning_rate": 1.9491883652102208e-05, + "logits/chosen": -1.6819422245025635, + "logits/rejected": -1.68989098072052, + "logps/chosen": -193.82147216796875, + "logps/rejected": -293.77239990234375, + "loss": 0.0684, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7122280597686768, + "rewards/margins": 6.502114295959473, + "rewards/rejected": -8.21434211730957, + "step": 1351 + }, + { + "epoch": 1.77, + "learning_rate": 1.9456939583086303e-05, + "logits/chosen": -1.9144465923309326, + "logits/rejected": -1.8897491693496704, + "logps/chosen": -203.8362274169922, + "logps/rejected": -308.9920349121094, + "loss": 0.05, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.07996997237205505, + "rewards/margins": 11.276268005371094, + "rewards/rejected": -11.356237411499023, + "step": 1352 + }, + { + "epoch": 1.77, + "learning_rate": 1.9422006899991878e-05, + "logits/chosen": -1.8507614135742188, + "logits/rejected": -1.9148918390274048, + "logps/chosen": -245.236328125, + "logps/rejected": -358.8700256347656, + "loss": 0.1067, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.8041815757751465, + "rewards/margins": 8.082918167114258, + "rewards/rejected": -10.887099266052246, + "step": 1353 + }, + { + "epoch": 1.77, + "learning_rate": 1.9387085674573616e-05, + "logits/chosen": -1.896280288696289, + "logits/rejected": -1.906815528869629, + "logps/chosen": -185.74771118164062, + "logps/rejected": -264.2749938964844, + "loss": 0.3393, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1578582525253296, + "rewards/margins": 8.441926002502441, + "rewards/rejected": -9.599784851074219, + "step": 1354 + }, + { + "epoch": 1.77, + "learning_rate": 1.9352175978562736e-05, + "logits/chosen": -1.8297330141067505, + "logits/rejected": -1.8974789381027222, + "logps/chosen": -179.56103515625, + "logps/rejected": -294.88604736328125, + "loss": 0.0143, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3289389908313751, + "rewards/margins": 9.594542503356934, + "rewards/rejected": -9.923481941223145, + "step": 1355 + }, + { + "epoch": 1.77, + "learning_rate": 1.9317277883666745e-05, + "logits/chosen": -1.938228726387024, + "logits/rejected": -1.9337149858474731, + "logps/chosen": -144.25442504882812, + "logps/rejected": -244.10044860839844, + "loss": 0.0923, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.30864885449409485, + "rewards/margins": 9.024149894714355, + "rewards/rejected": -8.715499877929688, + "step": 1356 + }, + { + "epoch": 1.78, + "learning_rate": 1.9282391461569316e-05, + "logits/chosen": -1.976447343826294, + "logits/rejected": -1.9635674953460693, + "logps/chosen": -161.55181884765625, + "logps/rejected": -247.10130310058594, + "loss": 0.0598, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.20845425128936768, + "rewards/margins": 8.298850059509277, + "rewards/rejected": -8.507304191589355, + "step": 1357 + }, + { + "epoch": 1.78, + "learning_rate": 1.924751678393017e-05, + "logits/chosen": -1.7709836959838867, + "logits/rejected": -1.7851064205169678, + "logps/chosen": -216.12120056152344, + "logps/rejected": -279.8563232421875, + "loss": 0.169, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1158527135849, + "rewards/margins": 6.5449628829956055, + "rewards/rejected": -7.660815715789795, + "step": 1358 + }, + { + "epoch": 1.78, + "learning_rate": 1.9212653922384854e-05, + "logits/chosen": -1.929402470588684, + "logits/rejected": -1.9111336469650269, + "logps/chosen": -181.57080078125, + "logps/rejected": -284.9664001464844, + "loss": 0.081, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2838565111160278, + "rewards/margins": 8.941863059997559, + "rewards/rejected": -10.225719451904297, + "step": 1359 + }, + { + "epoch": 1.78, + "learning_rate": 1.91778029485447e-05, + "logits/chosen": -1.700954794883728, + "logits/rejected": -1.7304350137710571, + "logps/chosen": -192.46434020996094, + "logps/rejected": -282.5699157714844, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.301650881767273, + "rewards/margins": 8.414552688598633, + "rewards/rejected": -9.716202735900879, + "step": 1360 + }, + { + "epoch": 1.78, + "learning_rate": 1.914296393399659e-05, + "logits/chosen": -1.4738514423370361, + "logits/rejected": -1.5397871732711792, + "logps/chosen": -178.16534423828125, + "logps/rejected": -332.8831481933594, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.33413565158843994, + "rewards/margins": 11.764269828796387, + "rewards/rejected": -11.430133819580078, + "step": 1361 + }, + { + "epoch": 1.78, + "learning_rate": 1.910813695030284e-05, + "logits/chosen": -1.7844324111938477, + "logits/rejected": -1.8149797916412354, + "logps/chosen": -205.7171173095703, + "logps/rejected": -275.0159912109375, + "loss": 0.1173, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.670138955116272, + "rewards/margins": 7.381158828735352, + "rewards/rejected": -8.051297187805176, + "step": 1362 + }, + { + "epoch": 1.78, + "learning_rate": 1.9073322069001075e-05, + "logits/chosen": -1.9240690469741821, + "logits/rejected": -1.8996899127960205, + "logps/chosen": -178.87539672851562, + "logps/rejected": -292.92791748046875, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4243634343147278, + "rewards/margins": 10.228076934814453, + "rewards/rejected": -10.652441024780273, + "step": 1363 + }, + { + "epoch": 1.79, + "learning_rate": 1.9038519361604046e-05, + "logits/chosen": -1.7108713388442993, + "logits/rejected": -1.6708178520202637, + "logps/chosen": -174.67269897460938, + "logps/rejected": -253.38616943359375, + "loss": 0.0662, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.41629472374916077, + "rewards/margins": 8.919060707092285, + "rewards/rejected": -9.335355758666992, + "step": 1364 + }, + { + "epoch": 1.79, + "learning_rate": 1.900372889959949e-05, + "logits/chosen": -1.8205201625823975, + "logits/rejected": -1.8261682987213135, + "logps/chosen": -194.78997802734375, + "logps/rejected": -298.11138916015625, + "loss": 0.0789, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9462127685546875, + "rewards/margins": 9.011022567749023, + "rewards/rejected": -9.957235336303711, + "step": 1365 + }, + { + "epoch": 1.79, + "learning_rate": 1.896895075445e-05, + "logits/chosen": -1.7903575897216797, + "logits/rejected": -1.806698203086853, + "logps/chosen": -211.8292694091797, + "logps/rejected": -323.73870849609375, + "loss": 0.0184, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5305525064468384, + "rewards/margins": 10.764676094055176, + "rewards/rejected": -11.295228958129883, + "step": 1366 + }, + { + "epoch": 1.79, + "learning_rate": 1.8934184997592866e-05, + "logits/chosen": -1.9133182764053345, + "logits/rejected": -1.8578988313674927, + "logps/chosen": -177.38563537597656, + "logps/rejected": -257.34063720703125, + "loss": 0.0111, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2563009262084961, + "rewards/margins": 8.956238746643066, + "rewards/rejected": -9.212539672851562, + "step": 1367 + }, + { + "epoch": 1.79, + "learning_rate": 1.8899431700439946e-05, + "logits/chosen": -1.644356369972229, + "logits/rejected": -1.6879427433013916, + "logps/chosen": -156.5514373779297, + "logps/rejected": -287.8313903808594, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08272463083267212, + "rewards/margins": 11.991378784179688, + "rewards/rejected": -11.908655166625977, + "step": 1368 + }, + { + "epoch": 1.79, + "learning_rate": 1.8864690934377492e-05, + "logits/chosen": -1.8826680183410645, + "logits/rejected": -1.9287712574005127, + "logps/chosen": -178.97634887695312, + "logps/rejected": -284.9380798339844, + "loss": 0.0141, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7563552856445312, + "rewards/margins": 8.26695728302002, + "rewards/rejected": -9.023313522338867, + "step": 1369 + }, + { + "epoch": 1.79, + "learning_rate": 1.8829962770766003e-05, + "logits/chosen": -2.061532735824585, + "logits/rejected": -2.078533887863159, + "logps/chosen": -155.79396057128906, + "logps/rejected": -215.4700469970703, + "loss": 0.1606, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.4049350619316101, + "rewards/margins": 6.18373966217041, + "rewards/rejected": -6.588675498962402, + "step": 1370 + }, + { + "epoch": 1.79, + "learning_rate": 1.8795247280940108e-05, + "logits/chosen": -1.8371599912643433, + "logits/rejected": -1.7960604429244995, + "logps/chosen": -203.6299285888672, + "logps/rejected": -276.4897766113281, + "loss": 0.1217, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3884665966033936, + "rewards/margins": 7.318436622619629, + "rewards/rejected": -8.706903457641602, + "step": 1371 + }, + { + "epoch": 1.8, + "learning_rate": 1.876054453620841e-05, + "logits/chosen": -1.7582720518112183, + "logits/rejected": -1.7458560466766357, + "logps/chosen": -205.42181396484375, + "logps/rejected": -280.1778564453125, + "loss": 0.0954, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8587182760238647, + "rewards/margins": 8.185296058654785, + "rewards/rejected": -9.044014930725098, + "step": 1372 + }, + { + "epoch": 1.8, + "learning_rate": 1.872585460785332e-05, + "logits/chosen": -1.9034453630447388, + "logits/rejected": -1.937851071357727, + "logps/chosen": -183.81031799316406, + "logps/rejected": -286.0010681152344, + "loss": 0.0351, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7084670662879944, + "rewards/margins": 8.461755752563477, + "rewards/rejected": -9.170222282409668, + "step": 1373 + }, + { + "epoch": 1.8, + "learning_rate": 1.869117756713092e-05, + "logits/chosen": -1.8281465768814087, + "logits/rejected": -1.8007808923721313, + "logps/chosen": -184.818359375, + "logps/rejected": -264.1748352050781, + "loss": 0.0088, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9475806355476379, + "rewards/margins": 8.247013092041016, + "rewards/rejected": -9.194594383239746, + "step": 1374 + }, + { + "epoch": 1.8, + "learning_rate": 1.8656513485270843e-05, + "logits/chosen": -1.9184118509292603, + "logits/rejected": -1.9373579025268555, + "logps/chosen": -198.76942443847656, + "logps/rejected": -284.4110412597656, + "loss": 0.0313, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5101205110549927, + "rewards/margins": 9.549039840698242, + "rewards/rejected": -10.059161186218262, + "step": 1375 + }, + { + "epoch": 1.8, + "learning_rate": 1.8621862433476054e-05, + "logits/chosen": -1.8116027116775513, + "logits/rejected": -1.8369946479797363, + "logps/chosen": -204.1658477783203, + "logps/rejected": -294.537353515625, + "loss": 0.0116, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.43194830417633057, + "rewards/margins": 9.725103378295898, + "rewards/rejected": -10.157052040100098, + "step": 1376 + }, + { + "epoch": 1.8, + "learning_rate": 1.858722448292281e-05, + "logits/chosen": -1.839583158493042, + "logits/rejected": -1.8867522478103638, + "logps/chosen": -179.859375, + "logps/rejected": -274.320068359375, + "loss": 0.0471, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7829848527908325, + "rewards/margins": 7.695400714874268, + "rewards/rejected": -9.478385925292969, + "step": 1377 + }, + { + "epoch": 1.8, + "learning_rate": 1.8552599704760424e-05, + "logits/chosen": -1.712364673614502, + "logits/rejected": -1.7461382150650024, + "logps/chosen": -219.38858032226562, + "logps/rejected": -295.57928466796875, + "loss": 0.0948, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5989739298820496, + "rewards/margins": 7.272809028625488, + "rewards/rejected": -7.8717827796936035, + "step": 1378 + }, + { + "epoch": 1.8, + "learning_rate": 1.851798817011116e-05, + "logits/chosen": -1.5831432342529297, + "logits/rejected": -1.5590590238571167, + "logps/chosen": -161.47946166992188, + "logps/rejected": -259.676025390625, + "loss": 0.1302, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.055599115788936615, + "rewards/margins": 9.801471710205078, + "rewards/rejected": -9.857069969177246, + "step": 1379 + }, + { + "epoch": 1.81, + "learning_rate": 1.8483389950070097e-05, + "logits/chosen": -1.9289509057998657, + "logits/rejected": -1.9228482246398926, + "logps/chosen": -170.21571350097656, + "logps/rejected": -276.3854675292969, + "loss": 0.0494, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.058711886405944824, + "rewards/margins": 10.268485069274902, + "rewards/rejected": -10.32719612121582, + "step": 1380 + }, + { + "epoch": 1.81, + "learning_rate": 1.8448805115704903e-05, + "logits/chosen": -1.509027361869812, + "logits/rejected": -1.6233060359954834, + "logps/chosen": -181.89849853515625, + "logps/rejected": -309.51019287109375, + "loss": 0.0541, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7430284023284912, + "rewards/margins": 10.584050178527832, + "rewards/rejected": -11.327079772949219, + "step": 1381 + }, + { + "epoch": 1.81, + "learning_rate": 1.841423373805583e-05, + "logits/chosen": -1.9452784061431885, + "logits/rejected": -1.976248860359192, + "logps/chosen": -216.8744354248047, + "logps/rejected": -310.6162414550781, + "loss": 0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.21173894405365, + "rewards/margins": 8.369990348815918, + "rewards/rejected": -9.581729888916016, + "step": 1382 + }, + { + "epoch": 1.81, + "learning_rate": 1.837967588813544e-05, + "logits/chosen": -1.6651805639266968, + "logits/rejected": -1.640872597694397, + "logps/chosen": -225.46983337402344, + "logps/rejected": -326.9107971191406, + "loss": 0.0922, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9772020578384399, + "rewards/margins": 8.824700355529785, + "rewards/rejected": -9.801902770996094, + "step": 1383 + }, + { + "epoch": 1.81, + "learning_rate": 1.8345131636928518e-05, + "logits/chosen": -1.7195566892623901, + "logits/rejected": -1.7636768817901611, + "logps/chosen": -169.68186950683594, + "logps/rejected": -281.45281982421875, + "loss": 0.0587, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.37158307433128357, + "rewards/margins": 8.504358291625977, + "rewards/rejected": -8.875941276550293, + "step": 1384 + }, + { + "epoch": 1.81, + "learning_rate": 1.8310601055391923e-05, + "logits/chosen": -1.6704319715499878, + "logits/rejected": -1.6918952465057373, + "logps/chosen": -186.3609619140625, + "logps/rejected": -299.6885681152344, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37271231412887573, + "rewards/margins": 10.5128173828125, + "rewards/rejected": -10.885530471801758, + "step": 1385 + }, + { + "epoch": 1.81, + "learning_rate": 1.8276084214454443e-05, + "logits/chosen": -1.3156566619873047, + "logits/rejected": -1.3323159217834473, + "logps/chosen": -188.9298553466797, + "logps/rejected": -268.36395263671875, + "loss": 0.0655, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3583330512046814, + "rewards/margins": 9.058720588684082, + "rewards/rejected": -9.41705322265625, + "step": 1386 + }, + { + "epoch": 1.82, + "learning_rate": 1.8241581185016603e-05, + "logits/chosen": -1.8797374963760376, + "logits/rejected": -1.8572651147842407, + "logps/chosen": -164.59197998046875, + "logps/rejected": -247.1000213623047, + "loss": 0.1293, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4711276888847351, + "rewards/margins": 7.307163238525391, + "rewards/rejected": -7.778290748596191, + "step": 1387 + }, + { + "epoch": 1.82, + "learning_rate": 1.8207092037950602e-05, + "logits/chosen": -1.8354806900024414, + "logits/rejected": -1.8053866624832153, + "logps/chosen": -247.6166229248047, + "logps/rejected": -325.677001953125, + "loss": 0.0552, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8674428462982178, + "rewards/margins": 8.221887588500977, + "rewards/rejected": -9.089329719543457, + "step": 1388 + }, + { + "epoch": 1.82, + "learning_rate": 1.8172616844100096e-05, + "logits/chosen": -1.5970487594604492, + "logits/rejected": -1.5727487802505493, + "logps/chosen": -195.846435546875, + "logps/rejected": -241.15866088867188, + "loss": 0.1571, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5282628536224365, + "rewards/margins": 5.285512447357178, + "rewards/rejected": -6.813775062561035, + "step": 1389 + }, + { + "epoch": 1.82, + "learning_rate": 1.81381556742801e-05, + "logits/chosen": -1.8585798740386963, + "logits/rejected": -1.892126202583313, + "logps/chosen": -169.0955810546875, + "logps/rejected": -248.38232421875, + "loss": 0.0496, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2832234501838684, + "rewards/margins": 8.11409854888916, + "rewards/rejected": -8.397321701049805, + "step": 1390 + }, + { + "epoch": 1.82, + "learning_rate": 1.8103708599276812e-05, + "logits/chosen": -1.6312322616577148, + "logits/rejected": -1.5891399383544922, + "logps/chosen": -204.6843719482422, + "logps/rejected": -301.3951416015625, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006355363875627518, + "rewards/margins": 10.709888458251953, + "rewards/rejected": -10.716243743896484, + "step": 1391 + }, + { + "epoch": 1.82, + "learning_rate": 1.8069275689847466e-05, + "logits/chosen": -1.773181438446045, + "logits/rejected": -1.836763858795166, + "logps/chosen": -209.11135864257812, + "logps/rejected": -273.56329345703125, + "loss": 0.0156, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8344926238059998, + "rewards/margins": 8.107781410217285, + "rewards/rejected": -8.942273139953613, + "step": 1392 + }, + { + "epoch": 1.82, + "learning_rate": 1.803485701672022e-05, + "logits/chosen": -1.8285841941833496, + "logits/rejected": -1.8094639778137207, + "logps/chosen": -173.488525390625, + "logps/rejected": -306.4235534667969, + "loss": 0.0641, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.44278109073638916, + "rewards/margins": 10.727326393127441, + "rewards/rejected": -10.2845458984375, + "step": 1393 + }, + { + "epoch": 1.82, + "learning_rate": 1.8000452650593976e-05, + "logits/chosen": -1.928299069404602, + "logits/rejected": -1.9218006134033203, + "logps/chosen": -223.9350128173828, + "logps/rejected": -319.5177917480469, + "loss": 0.0445, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5577883720397949, + "rewards/margins": 8.837398529052734, + "rewards/rejected": -9.395186424255371, + "step": 1394 + }, + { + "epoch": 1.83, + "learning_rate": 1.7966062662138262e-05, + "logits/chosen": -2.101349353790283, + "logits/rejected": -2.0876026153564453, + "logps/chosen": -214.87281799316406, + "logps/rejected": -276.7854919433594, + "loss": 0.1861, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9199066758155823, + "rewards/margins": 6.135284423828125, + "rewards/rejected": -7.055191516876221, + "step": 1395 + }, + { + "epoch": 1.83, + "learning_rate": 1.7931687121993047e-05, + "logits/chosen": -1.524641513824463, + "logits/rejected": -1.5513595342636108, + "logps/chosen": -233.6764373779297, + "logps/rejected": -334.0563049316406, + "loss": 0.0457, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.031426191329956055, + "rewards/margins": 10.566980361938477, + "rewards/rejected": -10.598406791687012, + "step": 1396 + }, + { + "epoch": 1.83, + "learning_rate": 1.7897326100768664e-05, + "logits/chosen": -1.8758831024169922, + "logits/rejected": -1.9175246953964233, + "logps/chosen": -181.45240783691406, + "logps/rejected": -298.92578125, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3991521894931793, + "rewards/margins": 10.821096420288086, + "rewards/rejected": -10.421943664550781, + "step": 1397 + }, + { + "epoch": 1.83, + "learning_rate": 1.7862979669045566e-05, + "logits/chosen": -1.68924081325531, + "logits/rejected": -1.7541979551315308, + "logps/chosen": -160.356689453125, + "logps/rejected": -230.01263427734375, + "loss": 0.011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7978807687759399, + "rewards/margins": 7.1806488037109375, + "rewards/rejected": -7.978529453277588, + "step": 1398 + }, + { + "epoch": 1.83, + "learning_rate": 1.782864789737429e-05, + "logits/chosen": -1.8248835802078247, + "logits/rejected": -1.8319802284240723, + "logps/chosen": -188.9367218017578, + "logps/rejected": -284.5295104980469, + "loss": 0.0484, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.250162959098816, + "rewards/margins": 7.951763153076172, + "rewards/rejected": -9.201927185058594, + "step": 1399 + }, + { + "epoch": 1.83, + "learning_rate": 1.779433085627523e-05, + "logits/chosen": -1.745599389076233, + "logits/rejected": -1.7931177616119385, + "logps/chosen": -162.2366943359375, + "logps/rejected": -265.8896484375, + "loss": 0.0184, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7721738815307617, + "rewards/margins": 8.443265914916992, + "rewards/rejected": -9.215439796447754, + "step": 1400 + }, + { + "epoch": 1.83, + "learning_rate": 1.7760028616238535e-05, + "logits/chosen": -1.7449626922607422, + "logits/rejected": -1.7469149827957153, + "logps/chosen": -162.74609375, + "logps/rejected": -255.55038452148438, + "loss": 0.0714, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2001023143529892, + "rewards/margins": 8.88007926940918, + "rewards/rejected": -8.679977416992188, + "step": 1401 + }, + { + "epoch": 1.83, + "learning_rate": 1.7725741247723965e-05, + "logits/chosen": -1.805445671081543, + "logits/rejected": -1.8266456127166748, + "logps/chosen": -211.0671844482422, + "logps/rejected": -345.4092102050781, + "loss": 0.0882, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.35397952795028687, + "rewards/margins": 11.979537963867188, + "rewards/rejected": -12.333518028259277, + "step": 1402 + }, + { + "epoch": 1.84, + "learning_rate": 1.769146882116068e-05, + "logits/chosen": -1.960497260093689, + "logits/rejected": -1.9323253631591797, + "logps/chosen": -215.3045654296875, + "logps/rejected": -299.6772766113281, + "loss": 0.0528, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.32492709159851074, + "rewards/margins": 8.393838882446289, + "rewards/rejected": -8.718765258789062, + "step": 1403 + }, + { + "epoch": 1.84, + "learning_rate": 1.7657211406947206e-05, + "logits/chosen": -2.0137336254119873, + "logits/rejected": -2.0196399688720703, + "logps/chosen": -226.07571411132812, + "logps/rejected": -273.3217468261719, + "loss": 0.0933, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.495697945356369, + "rewards/margins": 6.676199913024902, + "rewards/rejected": -7.171897888183594, + "step": 1404 + }, + { + "epoch": 1.84, + "learning_rate": 1.7622969075451204e-05, + "logits/chosen": -1.8271023035049438, + "logits/rejected": -1.871528148651123, + "logps/chosen": -216.00689697265625, + "logps/rejected": -294.8734130859375, + "loss": 0.055, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9575166702270508, + "rewards/margins": 7.486212730407715, + "rewards/rejected": -8.443729400634766, + "step": 1405 + }, + { + "epoch": 1.84, + "learning_rate": 1.758874189700936e-05, + "logits/chosen": -1.7370017766952515, + "logits/rejected": -1.8035414218902588, + "logps/chosen": -167.0490264892578, + "logps/rejected": -287.7859802246094, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.567207932472229, + "rewards/margins": 11.964707374572754, + "rewards/rejected": -11.397499084472656, + "step": 1406 + }, + { + "epoch": 1.84, + "learning_rate": 1.7554529941927243e-05, + "logits/chosen": -1.7446564435958862, + "logits/rejected": -1.783481240272522, + "logps/chosen": -184.27919006347656, + "logps/rejected": -270.3516845703125, + "loss": 0.0176, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19579777121543884, + "rewards/margins": 10.0877103805542, + "rewards/rejected": -10.28350830078125, + "step": 1407 + }, + { + "epoch": 1.84, + "learning_rate": 1.7520333280479124e-05, + "logits/chosen": -1.692033052444458, + "logits/rejected": -1.7589759826660156, + "logps/chosen": -141.3279571533203, + "logps/rejected": -233.7997589111328, + "loss": 0.0936, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08037114143371582, + "rewards/margins": 8.160627365112305, + "rewards/rejected": -8.080256462097168, + "step": 1408 + }, + { + "epoch": 1.84, + "learning_rate": 1.7486151982907896e-05, + "logits/chosen": -1.8861494064331055, + "logits/rejected": -1.8336795568466187, + "logps/chosen": -168.50802612304688, + "logps/rejected": -235.15963745117188, + "loss": 0.1331, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.08947964012622833, + "rewards/margins": 8.115923881530762, + "rewards/rejected": -8.205403327941895, + "step": 1409 + }, + { + "epoch": 1.85, + "learning_rate": 1.7451986119424863e-05, + "logits/chosen": -1.6849112510681152, + "logits/rejected": -1.7732785940170288, + "logps/chosen": -165.73822021484375, + "logps/rejected": -282.4337158203125, + "loss": 0.0238, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5409442782402039, + "rewards/margins": 9.329056739807129, + "rewards/rejected": -9.870000839233398, + "step": 1410 + }, + { + "epoch": 1.85, + "learning_rate": 1.7417835760209638e-05, + "logits/chosen": -1.8535761833190918, + "logits/rejected": -1.8732385635375977, + "logps/chosen": -159.54151916503906, + "logps/rejected": -226.65286254882812, + "loss": 0.0375, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6963263750076294, + "rewards/margins": 7.246090888977051, + "rewards/rejected": -7.942417144775391, + "step": 1411 + }, + { + "epoch": 1.85, + "learning_rate": 1.738370097541e-05, + "logits/chosen": -1.7530170679092407, + "logits/rejected": -1.7979145050048828, + "logps/chosen": -249.77392578125, + "logps/rejected": -293.19195556640625, + "loss": 0.1742, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9893063306808472, + "rewards/margins": 5.181007385253906, + "rewards/rejected": -6.170313835144043, + "step": 1412 + }, + { + "epoch": 1.85, + "learning_rate": 1.7349581835141725e-05, + "logits/chosen": -1.7407127618789673, + "logits/rejected": -1.7402825355529785, + "logps/chosen": -175.80198669433594, + "logps/rejected": -283.78924560546875, + "loss": 0.0149, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.36441129446029663, + "rewards/margins": 10.075911521911621, + "rewards/rejected": -9.71150016784668, + "step": 1413 + }, + { + "epoch": 1.85, + "learning_rate": 1.7315478409488436e-05, + "logits/chosen": -1.8360697031021118, + "logits/rejected": -1.8635212182998657, + "logps/chosen": -141.52931213378906, + "logps/rejected": -230.49456787109375, + "loss": 0.1156, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.5649850964546204, + "rewards/margins": 8.313777923583984, + "rewards/rejected": -7.748793125152588, + "step": 1414 + }, + { + "epoch": 1.85, + "learning_rate": 1.72813907685015e-05, + "logits/chosen": -1.8100566864013672, + "logits/rejected": -1.8562887907028198, + "logps/chosen": -167.28477478027344, + "logps/rejected": -251.83139038085938, + "loss": 0.0554, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.25724613666534424, + "rewards/margins": 7.770247459411621, + "rewards/rejected": -7.513001441955566, + "step": 1415 + }, + { + "epoch": 1.85, + "learning_rate": 1.7247318982199862e-05, + "logits/chosen": -1.647947072982788, + "logits/rejected": -1.689767837524414, + "logps/chosen": -203.71484375, + "logps/rejected": -253.16221618652344, + "loss": 0.2334, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5052939653396606, + "rewards/margins": 4.874670028686523, + "rewards/rejected": -6.379963397979736, + "step": 1416 + }, + { + "epoch": 1.85, + "learning_rate": 1.721326312056989e-05, + "logits/chosen": -1.6353328227996826, + "logits/rejected": -1.6430772542953491, + "logps/chosen": -183.1324920654297, + "logps/rejected": -267.33148193359375, + "loss": 0.0506, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2285293340682983, + "rewards/margins": 7.72669792175293, + "rewards/rejected": -8.95522689819336, + "step": 1417 + }, + { + "epoch": 1.86, + "learning_rate": 1.717922325356525e-05, + "logits/chosen": -1.9711103439331055, + "logits/rejected": -1.959160566329956, + "logps/chosen": -164.99798583984375, + "logps/rejected": -247.86984252929688, + "loss": 0.066, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.04904147982597351, + "rewards/margins": 9.147300720214844, + "rewards/rejected": -9.098258972167969, + "step": 1418 + }, + { + "epoch": 1.86, + "learning_rate": 1.7145199451106736e-05, + "logits/chosen": -1.6895122528076172, + "logits/rejected": -1.6815532445907593, + "logps/chosen": -197.25914001464844, + "logps/rejected": -286.860107421875, + "loss": 0.0398, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5482984781265259, + "rewards/margins": 8.841209411621094, + "rewards/rejected": -9.389508247375488, + "step": 1419 + }, + { + "epoch": 1.86, + "learning_rate": 1.7111191783082155e-05, + "logits/chosen": -1.6723945140838623, + "logits/rejected": -1.6625168323516846, + "logps/chosen": -193.51145935058594, + "logps/rejected": -324.81427001953125, + "loss": 0.0914, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.44996216893196106, + "rewards/margins": 10.34982681274414, + "rewards/rejected": -10.799789428710938, + "step": 1420 + }, + { + "epoch": 1.86, + "learning_rate": 1.7077200319346186e-05, + "logits/chosen": -1.7067145109176636, + "logits/rejected": -1.7199944257736206, + "logps/chosen": -198.54412841796875, + "logps/rejected": -259.4043273925781, + "loss": 0.0337, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7479231357574463, + "rewards/margins": 6.968247890472412, + "rewards/rejected": -7.716170787811279, + "step": 1421 + }, + { + "epoch": 1.86, + "learning_rate": 1.7043225129720207e-05, + "logits/chosen": -2.024294137954712, + "logits/rejected": -1.990172266960144, + "logps/chosen": -257.2493896484375, + "logps/rejected": -334.7914733886719, + "loss": 0.1273, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7677092552185059, + "rewards/margins": 7.4466681480407715, + "rewards/rejected": -9.214378356933594, + "step": 1422 + }, + { + "epoch": 1.86, + "learning_rate": 1.7009266283992163e-05, + "logits/chosen": -1.7532830238342285, + "logits/rejected": -1.8029271364212036, + "logps/chosen": -155.24771118164062, + "logps/rejected": -244.72857666015625, + "loss": 0.0715, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4729558825492859, + "rewards/margins": 8.136865615844727, + "rewards/rejected": -8.609821319580078, + "step": 1423 + }, + { + "epoch": 1.86, + "learning_rate": 1.6975323851916454e-05, + "logits/chosen": -1.5991289615631104, + "logits/rejected": -1.6035428047180176, + "logps/chosen": -341.51025390625, + "logps/rejected": -419.82818603515625, + "loss": 0.0501, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3995158076286316, + "rewards/margins": 8.358810424804688, + "rewards/rejected": -8.75832748413086, + "step": 1424 + }, + { + "epoch": 1.86, + "learning_rate": 1.6941397903213717e-05, + "logits/chosen": -1.6337162256240845, + "logits/rejected": -1.6523741483688354, + "logps/chosen": -202.48057556152344, + "logps/rejected": -304.3955993652344, + "loss": 0.0273, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7980514168739319, + "rewards/margins": 8.779748916625977, + "rewards/rejected": -9.577801704406738, + "step": 1425 + }, + { + "epoch": 1.87, + "learning_rate": 1.6907488507570786e-05, + "logits/chosen": -1.598244309425354, + "logits/rejected": -1.5849950313568115, + "logps/chosen": -223.5976104736328, + "logps/rejected": -338.7594299316406, + "loss": 0.1712, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.19535928964614868, + "rewards/margins": 11.259513854980469, + "rewards/rejected": -11.454873085021973, + "step": 1426 + }, + { + "epoch": 1.87, + "learning_rate": 1.6873595734640457e-05, + "logits/chosen": -1.8326441049575806, + "logits/rejected": -1.850005865097046, + "logps/chosen": -171.56468200683594, + "logps/rejected": -282.44244384765625, + "loss": 0.0198, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7462323307991028, + "rewards/margins": 9.731050491333008, + "rewards/rejected": -10.477283477783203, + "step": 1427 + }, + { + "epoch": 1.87, + "learning_rate": 1.683971965404139e-05, + "logits/chosen": -1.6898667812347412, + "logits/rejected": -1.7379153966903687, + "logps/chosen": -170.7454833984375, + "logps/rejected": -235.50010681152344, + "loss": 0.0914, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.09900404512882233, + "rewards/margins": 7.605112075805664, + "rewards/rejected": -7.704116344451904, + "step": 1428 + }, + { + "epoch": 1.87, + "learning_rate": 1.6805860335357977e-05, + "logits/chosen": -1.8170245885849, + "logits/rejected": -1.8656136989593506, + "logps/chosen": -160.50030517578125, + "logps/rejected": -232.7546844482422, + "loss": 0.1129, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8033601641654968, + "rewards/margins": 7.687594413757324, + "rewards/rejected": -8.490955352783203, + "step": 1429 + }, + { + "epoch": 1.87, + "learning_rate": 1.6772017848140132e-05, + "logits/chosen": -1.8729625940322876, + "logits/rejected": -1.9616113901138306, + "logps/chosen": -179.74957275390625, + "logps/rejected": -267.2135009765625, + "loss": 0.0987, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2719106674194336, + "rewards/margins": 6.83138370513916, + "rewards/rejected": -8.103294372558594, + "step": 1430 + }, + { + "epoch": 1.87, + "learning_rate": 1.6738192261903248e-05, + "logits/chosen": -1.4119092226028442, + "logits/rejected": -1.446468472480774, + "logps/chosen": -203.5169677734375, + "logps/rejected": -294.53717041015625, + "loss": 0.0126, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.988545835018158, + "rewards/margins": 8.418998718261719, + "rewards/rejected": -9.40754508972168, + "step": 1431 + }, + { + "epoch": 1.87, + "learning_rate": 1.6704383646127973e-05, + "logits/chosen": -1.5493141412734985, + "logits/rejected": -1.5280919075012207, + "logps/chosen": -231.03411865234375, + "logps/rejected": -308.0693054199219, + "loss": 0.0619, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3308624029159546, + "rewards/margins": 8.97515869140625, + "rewards/rejected": -9.306020736694336, + "step": 1432 + }, + { + "epoch": 1.88, + "learning_rate": 1.6670592070260106e-05, + "logits/chosen": -1.3048310279846191, + "logits/rejected": -1.305355191230774, + "logps/chosen": -179.85061645507812, + "logps/rejected": -278.1815490722656, + "loss": 0.1091, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6974951028823853, + "rewards/margins": 6.949095249176025, + "rewards/rejected": -8.646591186523438, + "step": 1433 + }, + { + "epoch": 1.88, + "learning_rate": 1.6636817603710437e-05, + "logits/chosen": -1.7542773485183716, + "logits/rejected": -1.769920825958252, + "logps/chosen": -162.82044982910156, + "logps/rejected": -246.6299591064453, + "loss": 0.0549, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5554962158203125, + "rewards/margins": 7.376394271850586, + "rewards/rejected": -7.931890487670898, + "step": 1434 + }, + { + "epoch": 1.88, + "learning_rate": 1.660306031585463e-05, + "logits/chosen": -1.8431326150894165, + "logits/rejected": -1.8779798746109009, + "logps/chosen": -222.20489501953125, + "logps/rejected": -273.60198974609375, + "loss": 0.0789, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2964153289794922, + "rewards/margins": 5.91016149520874, + "rewards/rejected": -7.206576347351074, + "step": 1435 + }, + { + "epoch": 1.88, + "learning_rate": 1.6569320276033034e-05, + "logits/chosen": -1.5375458002090454, + "logits/rejected": -1.6511176824569702, + "logps/chosen": -189.60899353027344, + "logps/rejected": -232.22998046875, + "loss": 0.1195, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7196157574653625, + "rewards/margins": 6.550069808959961, + "rewards/rejected": -7.269685745239258, + "step": 1436 + }, + { + "epoch": 1.88, + "learning_rate": 1.653559755355058e-05, + "logits/chosen": -1.7177815437316895, + "logits/rejected": -1.7361950874328613, + "logps/chosen": -184.82997131347656, + "logps/rejected": -247.07594299316406, + "loss": 0.1365, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.186899185180664, + "rewards/margins": 7.325465202331543, + "rewards/rejected": -8.512364387512207, + "step": 1437 + }, + { + "epoch": 1.88, + "learning_rate": 1.6501892217676653e-05, + "logits/chosen": -1.7835614681243896, + "logits/rejected": -1.8810856342315674, + "logps/chosen": -193.78964233398438, + "logps/rejected": -299.892333984375, + "loss": 0.0187, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6743454933166504, + "rewards/margins": 9.307034492492676, + "rewards/rejected": -9.981379508972168, + "step": 1438 + }, + { + "epoch": 1.88, + "learning_rate": 1.6468204337644887e-05, + "logits/chosen": -1.708690881729126, + "logits/rejected": -1.7387505769729614, + "logps/chosen": -181.4947052001953, + "logps/rejected": -282.2154235839844, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1101932525634766, + "rewards/margins": 9.125398635864258, + "rewards/rejected": -10.235591888427734, + "step": 1439 + }, + { + "epoch": 1.88, + "learning_rate": 1.643453398265309e-05, + "logits/chosen": -1.6996794939041138, + "logits/rejected": -1.7012697458267212, + "logps/chosen": -172.50746154785156, + "logps/rejected": -251.802734375, + "loss": 0.0429, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.31164246797561646, + "rewards/margins": 8.410935401916504, + "rewards/rejected": -8.722577095031738, + "step": 1440 + }, + { + "epoch": 1.89, + "learning_rate": 1.6400881221863044e-05, + "logits/chosen": -1.46378755569458, + "logits/rejected": -1.509097933769226, + "logps/chosen": -151.7143096923828, + "logps/rejected": -251.1295623779297, + "loss": 0.1597, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6235194802284241, + "rewards/margins": 7.173708438873291, + "rewards/rejected": -7.79722785949707, + "step": 1441 + }, + { + "epoch": 1.89, + "learning_rate": 1.6367246124400402e-05, + "logits/chosen": -1.8047696352005005, + "logits/rejected": -1.8058642148971558, + "logps/chosen": -221.96542358398438, + "logps/rejected": -291.5032958984375, + "loss": 0.0586, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0140197277069092, + "rewards/margins": 6.861208438873291, + "rewards/rejected": -7.875227928161621, + "step": 1442 + }, + { + "epoch": 1.89, + "learning_rate": 1.633362875935456e-05, + "logits/chosen": -1.8516772985458374, + "logits/rejected": -1.8654757738113403, + "logps/chosen": -161.64749145507812, + "logps/rejected": -233.4096221923828, + "loss": 0.1365, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3239396810531616, + "rewards/margins": 6.728903770446777, + "rewards/rejected": -7.05284309387207, + "step": 1443 + }, + { + "epoch": 1.89, + "learning_rate": 1.6300029195778455e-05, + "logits/chosen": -1.8846726417541504, + "logits/rejected": -1.900369644165039, + "logps/chosen": -168.9714813232422, + "logps/rejected": -238.44334411621094, + "loss": 0.1403, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1037547588348389, + "rewards/margins": 6.01359748840332, + "rewards/rejected": -7.117352485656738, + "step": 1444 + }, + { + "epoch": 1.89, + "learning_rate": 1.626644750268847e-05, + "logits/chosen": -1.6737010478973389, + "logits/rejected": -1.738445520401001, + "logps/chosen": -175.99514770507812, + "logps/rejected": -296.8955383300781, + "loss": 0.0419, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1116701364517212, + "rewards/margins": 7.740163326263428, + "rewards/rejected": -8.85183334350586, + "step": 1445 + }, + { + "epoch": 1.89, + "learning_rate": 1.62328837490643e-05, + "logits/chosen": -1.785094976425171, + "logits/rejected": -1.7127764225006104, + "logps/chosen": -158.10035705566406, + "logps/rejected": -248.91912841796875, + "loss": 0.0665, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.401688575744629, + "rewards/margins": 10.656965255737305, + "rewards/rejected": -9.255276679992676, + "step": 1446 + }, + { + "epoch": 1.89, + "learning_rate": 1.6199338003848745e-05, + "logits/chosen": -1.808382511138916, + "logits/rejected": -1.9022395610809326, + "logps/chosen": -177.55540466308594, + "logps/rejected": -260.3826904296875, + "loss": 0.0938, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.39686864614486694, + "rewards/margins": 7.598231792449951, + "rewards/rejected": -7.995101451873779, + "step": 1447 + }, + { + "epoch": 1.89, + "learning_rate": 1.6165810335947664e-05, + "logits/chosen": -1.5101224184036255, + "logits/rejected": -1.6306438446044922, + "logps/chosen": -166.27972412109375, + "logps/rejected": -225.48458862304688, + "loss": 0.193, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8641462326049805, + "rewards/margins": 5.643004417419434, + "rewards/rejected": -7.507151126861572, + "step": 1448 + }, + { + "epoch": 1.9, + "learning_rate": 1.6132300814229755e-05, + "logits/chosen": -1.7956031560897827, + "logits/rejected": -1.8231531381607056, + "logps/chosen": -231.86920166015625, + "logps/rejected": -300.3326416015625, + "loss": 0.0904, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6177788376808167, + "rewards/margins": 8.395776748657227, + "rewards/rejected": -9.013557434082031, + "step": 1449 + }, + { + "epoch": 1.9, + "learning_rate": 1.6098809507526445e-05, + "logits/chosen": -1.9588472843170166, + "logits/rejected": -1.9325575828552246, + "logps/chosen": -185.962158203125, + "logps/rejected": -300.9217529296875, + "loss": 0.0446, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.24454191327095032, + "rewards/margins": 11.526311874389648, + "rewards/rejected": -11.770853042602539, + "step": 1450 + }, + { + "epoch": 1.9, + "learning_rate": 1.606533648463177e-05, + "logits/chosen": -1.8264312744140625, + "logits/rejected": -1.7577422857284546, + "logps/chosen": -168.47378540039062, + "logps/rejected": -251.87905883789062, + "loss": 0.0237, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5017776489257812, + "rewards/margins": 9.688447952270508, + "rewards/rejected": -9.18666934967041, + "step": 1451 + }, + { + "epoch": 1.9, + "learning_rate": 1.603188181430216e-05, + "logits/chosen": -1.7816956043243408, + "logits/rejected": -1.775305151939392, + "logps/chosen": -228.3436279296875, + "logps/rejected": -303.0387268066406, + "loss": 0.1777, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15321630239486694, + "rewards/margins": 8.819178581237793, + "rewards/rejected": -8.972394943237305, + "step": 1452 + }, + { + "epoch": 1.9, + "learning_rate": 1.5998445565256398e-05, + "logits/chosen": -1.89665949344635, + "logits/rejected": -1.960755467414856, + "logps/chosen": -180.8733367919922, + "logps/rejected": -309.20184326171875, + "loss": 0.0779, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.04430273175239563, + "rewards/margins": 10.437679290771484, + "rewards/rejected": -10.393375396728516, + "step": 1453 + }, + { + "epoch": 1.9, + "learning_rate": 1.59650278061754e-05, + "logits/chosen": -1.8072558641433716, + "logits/rejected": -1.8131906986236572, + "logps/chosen": -159.24815368652344, + "logps/rejected": -232.19415283203125, + "loss": 0.0385, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3838723599910736, + "rewards/margins": 7.252429962158203, + "rewards/rejected": -7.636301517486572, + "step": 1454 + }, + { + "epoch": 1.9, + "learning_rate": 1.5931628605702102e-05, + "logits/chosen": -1.7675809860229492, + "logits/rejected": -1.7806410789489746, + "logps/chosen": -184.3780975341797, + "logps/rejected": -283.67974853515625, + "loss": 0.0958, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.1194177195429802, + "rewards/margins": 9.795171737670898, + "rewards/rejected": -9.914588928222656, + "step": 1455 + }, + { + "epoch": 1.91, + "learning_rate": 1.5898248032441336e-05, + "logits/chosen": -1.6262151002883911, + "logits/rejected": -1.6245311498641968, + "logps/chosen": -188.3709259033203, + "logps/rejected": -293.62213134765625, + "loss": 0.0094, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9368066787719727, + "rewards/margins": 11.172704696655273, + "rewards/rejected": -10.235898971557617, + "step": 1456 + }, + { + "epoch": 1.91, + "learning_rate": 1.5864886154959673e-05, + "logits/chosen": -1.719357967376709, + "logits/rejected": -1.6896544694900513, + "logps/chosen": -192.75010681152344, + "logps/rejected": -294.51593017578125, + "loss": 0.0512, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4151019752025604, + "rewards/margins": 8.284332275390625, + "rewards/rejected": -8.699434280395508, + "step": 1457 + }, + { + "epoch": 1.91, + "learning_rate": 1.5831543041785247e-05, + "logits/chosen": -1.8007892370224, + "logits/rejected": -1.8957583904266357, + "logps/chosen": -200.95521545410156, + "logps/rejected": -322.8530578613281, + "loss": 0.0598, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.6054823398590088, + "rewards/margins": 11.572709083557129, + "rewards/rejected": -10.9672269821167, + "step": 1458 + }, + { + "epoch": 1.91, + "learning_rate": 1.579821876140768e-05, + "logits/chosen": -1.627040982246399, + "logits/rejected": -1.6189711093902588, + "logps/chosen": -190.68942260742188, + "logps/rejected": -262.9361877441406, + "loss": 0.0149, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0393062829971313, + "rewards/margins": 7.976235866546631, + "rewards/rejected": -9.015542030334473, + "step": 1459 + }, + { + "epoch": 1.91, + "learning_rate": 1.5764913382277903e-05, + "logits/chosen": -1.7371068000793457, + "logits/rejected": -1.742724895477295, + "logps/chosen": -159.46339416503906, + "logps/rejected": -263.73260498046875, + "loss": 0.0574, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.079403281211853, + "rewards/margins": 11.19236946105957, + "rewards/rejected": -10.112967491149902, + "step": 1460 + }, + { + "epoch": 1.91, + "learning_rate": 1.5731626972808027e-05, + "logits/chosen": -1.913144826889038, + "logits/rejected": -1.9095001220703125, + "logps/chosen": -168.39039611816406, + "logps/rejected": -222.14102172851562, + "loss": 0.1781, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0907138586044312, + "rewards/margins": 6.8234429359436035, + "rewards/rejected": -7.914155960083008, + "step": 1461 + }, + { + "epoch": 1.91, + "learning_rate": 1.5698359601371187e-05, + "logits/chosen": -1.7367479801177979, + "logits/rejected": -1.7759535312652588, + "logps/chosen": -166.7388153076172, + "logps/rejected": -244.82525634765625, + "loss": 0.061, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7588629126548767, + "rewards/margins": 7.508255958557129, + "rewards/rejected": -8.267118453979492, + "step": 1462 + }, + { + "epoch": 1.91, + "learning_rate": 1.5665111336301415e-05, + "logits/chosen": -1.9168567657470703, + "logits/rejected": -1.882684588432312, + "logps/chosen": -161.9492950439453, + "logps/rejected": -234.33885192871094, + "loss": 0.0975, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14162234961986542, + "rewards/margins": 7.159221172332764, + "rewards/rejected": -7.300844192504883, + "step": 1463 + }, + { + "epoch": 1.92, + "learning_rate": 1.563188224589349e-05, + "logits/chosen": -1.579738974571228, + "logits/rejected": -1.6257154941558838, + "logps/chosen": -151.87002563476562, + "logps/rejected": -229.0591583251953, + "loss": 0.1396, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18479037284851074, + "rewards/margins": 7.169018268585205, + "rewards/rejected": -7.353808403015137, + "step": 1464 + }, + { + "epoch": 1.92, + "learning_rate": 1.5598672398402835e-05, + "logits/chosen": -1.6260305643081665, + "logits/rejected": -1.8175828456878662, + "logps/chosen": -122.39543151855469, + "logps/rejected": -211.99745178222656, + "loss": 0.139, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4261925220489502, + "rewards/margins": 7.208876132965088, + "rewards/rejected": -6.782683372497559, + "step": 1465 + }, + { + "epoch": 1.92, + "learning_rate": 1.5565481862045312e-05, + "logits/chosen": -1.7441940307617188, + "logits/rejected": -1.7418287992477417, + "logps/chosen": -177.23765563964844, + "logps/rejected": -260.0029296875, + "loss": 0.0874, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.0813055038452148, + "rewards/margins": 9.815895080566406, + "rewards/rejected": -8.734588623046875, + "step": 1466 + }, + { + "epoch": 1.92, + "learning_rate": 1.553231070499712e-05, + "logits/chosen": -1.91140615940094, + "logits/rejected": -1.9313936233520508, + "logps/chosen": -182.1000213623047, + "logps/rejected": -232.25469970703125, + "loss": 0.1326, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1702615022659302, + "rewards/margins": 5.33015251159668, + "rewards/rejected": -6.50041389465332, + "step": 1467 + }, + { + "epoch": 1.92, + "learning_rate": 1.549915899539469e-05, + "logits/chosen": -1.5131639242172241, + "logits/rejected": -1.5436100959777832, + "logps/chosen": -183.39100646972656, + "logps/rejected": -292.4904479980469, + "loss": 0.0895, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.04510200023651123, + "rewards/margins": 8.856659889221191, + "rewards/rejected": -8.81155776977539, + "step": 1468 + }, + { + "epoch": 1.92, + "learning_rate": 1.5466026801334437e-05, + "logits/chosen": -1.7742986679077148, + "logits/rejected": -1.744247317314148, + "logps/chosen": -189.68417358398438, + "logps/rejected": -300.5751647949219, + "loss": 0.0174, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.898592472076416, + "rewards/margins": 8.97984790802002, + "rewards/rejected": -9.878440856933594, + "step": 1469 + }, + { + "epoch": 1.92, + "learning_rate": 1.5432914190872757e-05, + "logits/chosen": -1.9060922861099243, + "logits/rejected": -1.788303017616272, + "logps/chosen": -181.42251586914062, + "logps/rejected": -258.27716064453125, + "loss": 0.0952, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0517306327819824, + "rewards/margins": 7.567696571350098, + "rewards/rejected": -8.619428634643555, + "step": 1470 + }, + { + "epoch": 1.93, + "learning_rate": 1.5399821232025786e-05, + "logits/chosen": -1.7203636169433594, + "logits/rejected": -1.7400050163269043, + "logps/chosen": -201.70571899414062, + "logps/rejected": -314.5263977050781, + "loss": 0.2199, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5860781073570251, + "rewards/margins": 9.277680397033691, + "rewards/rejected": -9.863757133483887, + "step": 1471 + }, + { + "epoch": 1.93, + "learning_rate": 1.5366747992769287e-05, + "logits/chosen": -1.7324007749557495, + "logits/rejected": -1.7265377044677734, + "logps/chosen": -158.85125732421875, + "logps/rejected": -261.3233947753906, + "loss": 0.0912, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.612006425857544, + "rewards/margins": 9.145644187927246, + "rewards/rejected": -9.757649421691895, + "step": 1472 + }, + { + "epoch": 1.93, + "learning_rate": 1.5333694541038557e-05, + "logits/chosen": -1.8380104303359985, + "logits/rejected": -1.8475761413574219, + "logps/chosen": -164.52256774902344, + "logps/rejected": -281.95654296875, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9050693511962891, + "rewards/margins": 10.95753288269043, + "rewards/rejected": -10.052464485168457, + "step": 1473 + }, + { + "epoch": 1.93, + "learning_rate": 1.5300660944728187e-05, + "logits/chosen": -1.5354928970336914, + "logits/rejected": -1.555688738822937, + "logps/chosen": -206.12307739257812, + "logps/rejected": -265.6907043457031, + "loss": 0.0963, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8549438714981079, + "rewards/margins": 6.163863182067871, + "rewards/rejected": -7.018807411193848, + "step": 1474 + }, + { + "epoch": 1.93, + "learning_rate": 1.5267647271692036e-05, + "logits/chosen": -1.7825320959091187, + "logits/rejected": -1.8690727949142456, + "logps/chosen": -167.7135009765625, + "logps/rejected": -260.2290344238281, + "loss": 0.0487, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.49028992652893066, + "rewards/margins": 8.007672309875488, + "rewards/rejected": -8.49796199798584, + "step": 1475 + }, + { + "epoch": 1.93, + "learning_rate": 1.523465358974302e-05, + "logits/chosen": -1.7764742374420166, + "logits/rejected": -1.782279133796692, + "logps/chosen": -195.43894958496094, + "logps/rejected": -258.77740478515625, + "loss": 0.0321, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2825368642807007, + "rewards/margins": 8.464844703674316, + "rewards/rejected": -8.182307243347168, + "step": 1476 + }, + { + "epoch": 1.93, + "learning_rate": 1.5201679966652981e-05, + "logits/chosen": -1.8976256847381592, + "logits/rejected": -1.854222059249878, + "logps/chosen": -192.22607421875, + "logps/rejected": -282.2470703125, + "loss": 0.0486, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.32534149289131165, + "rewards/margins": 9.524739265441895, + "rewards/rejected": -9.199397087097168, + "step": 1477 + }, + { + "epoch": 1.93, + "learning_rate": 1.5168726470152583e-05, + "logits/chosen": -1.8875632286071777, + "logits/rejected": -1.9206953048706055, + "logps/chosen": -175.67955017089844, + "logps/rejected": -268.0423889160156, + "loss": 0.0782, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.06109175086021423, + "rewards/margins": 8.891679763793945, + "rewards/rejected": -8.952771186828613, + "step": 1478 + }, + { + "epoch": 1.94, + "learning_rate": 1.5135793167931128e-05, + "logits/chosen": -1.8069802522659302, + "logits/rejected": -1.795629620552063, + "logps/chosen": -176.66763305664062, + "logps/rejected": -233.2548370361328, + "loss": 0.0738, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7341656684875488, + "rewards/margins": 6.5890116691589355, + "rewards/rejected": -7.323177337646484, + "step": 1479 + }, + { + "epoch": 1.94, + "learning_rate": 1.5102880127636438e-05, + "logits/chosen": -1.7576944828033447, + "logits/rejected": -1.7891267538070679, + "logps/chosen": -167.64984130859375, + "logps/rejected": -250.33139038085938, + "loss": 0.0713, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.41087377071380615, + "rewards/margins": 7.205945014953613, + "rewards/rejected": -7.616818428039551, + "step": 1480 + }, + { + "epoch": 1.94, + "learning_rate": 1.506998741687472e-05, + "logits/chosen": -1.684385895729065, + "logits/rejected": -1.723787546157837, + "logps/chosen": -172.01609802246094, + "logps/rejected": -287.2373352050781, + "loss": 0.0561, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0953454002737999, + "rewards/margins": 9.943408012390137, + "rewards/rejected": -10.038753509521484, + "step": 1481 + }, + { + "epoch": 1.94, + "learning_rate": 1.5037115103210419e-05, + "logits/chosen": -1.8009709119796753, + "logits/rejected": -1.7996408939361572, + "logps/chosen": -215.1142578125, + "logps/rejected": -290.99591064453125, + "loss": 0.0908, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2569479942321777, + "rewards/margins": 7.822461128234863, + "rewards/rejected": -9.079408645629883, + "step": 1482 + }, + { + "epoch": 1.94, + "learning_rate": 1.5004263254166107e-05, + "logits/chosen": -1.9387784004211426, + "logits/rejected": -1.856516718864441, + "logps/chosen": -194.5041046142578, + "logps/rejected": -295.2081604003906, + "loss": 0.0266, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7324516773223877, + "rewards/margins": 11.716482162475586, + "rewards/rejected": -10.984031677246094, + "step": 1483 + }, + { + "epoch": 1.94, + "learning_rate": 1.4971431937222283e-05, + "logits/chosen": -1.7553457021713257, + "logits/rejected": -1.7600181102752686, + "logps/chosen": -164.54461669921875, + "logps/rejected": -244.7216033935547, + "loss": 0.0093, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3915133774280548, + "rewards/margins": 8.210360527038574, + "rewards/rejected": -8.601874351501465, + "step": 1484 + }, + { + "epoch": 1.94, + "learning_rate": 1.493862121981729e-05, + "logits/chosen": -1.8042484521865845, + "logits/rejected": -1.834628939628601, + "logps/chosen": -171.3224639892578, + "logps/rejected": -280.1764831542969, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7882771492004395, + "rewards/margins": 9.947287559509277, + "rewards/rejected": -10.735565185546875, + "step": 1485 + }, + { + "epoch": 1.94, + "learning_rate": 1.4905831169347145e-05, + "logits/chosen": -1.9212846755981445, + "logits/rejected": -1.979791283607483, + "logps/chosen": -161.81036376953125, + "logps/rejected": -255.50523376464844, + "loss": 0.053, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.41199663281440735, + "rewards/margins": 8.696393966674805, + "rewards/rejected": -8.284398078918457, + "step": 1486 + }, + { + "epoch": 1.95, + "learning_rate": 1.4873061853165444e-05, + "logits/chosen": -1.6558067798614502, + "logits/rejected": -1.5977141857147217, + "logps/chosen": -179.38882446289062, + "logps/rejected": -263.31414794921875, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.825944185256958, + "rewards/margins": 8.320928573608398, + "rewards/rejected": -9.146871566772461, + "step": 1487 + }, + { + "epoch": 1.95, + "learning_rate": 1.4840313338583162e-05, + "logits/chosen": -1.956737995147705, + "logits/rejected": -1.9808233976364136, + "logps/chosen": -182.4755859375, + "logps/rejected": -285.7007751464844, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23502108454704285, + "rewards/margins": 9.257952690124512, + "rewards/rejected": -9.492974281311035, + "step": 1488 + }, + { + "epoch": 1.95, + "learning_rate": 1.4807585692868552e-05, + "logits/chosen": -1.8370660543441772, + "logits/rejected": -1.8656373023986816, + "logps/chosen": -167.7559814453125, + "logps/rejected": -240.41885375976562, + "loss": 0.1026, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04569551348686218, + "rewards/margins": 6.923917293548584, + "rewards/rejected": -6.8782219886779785, + "step": 1489 + }, + { + "epoch": 1.95, + "learning_rate": 1.4774878983247026e-05, + "logits/chosen": -1.6924645900726318, + "logits/rejected": -1.6619470119476318, + "logps/chosen": -176.3340301513672, + "logps/rejected": -241.27755737304688, + "loss": 0.0591, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6439440250396729, + "rewards/margins": 7.184011459350586, + "rewards/rejected": -7.82795524597168, + "step": 1490 + }, + { + "epoch": 1.95, + "learning_rate": 1.4742193276900937e-05, + "logits/chosen": -1.448428988456726, + "logits/rejected": -1.5834200382232666, + "logps/chosen": -171.53662109375, + "logps/rejected": -282.0088195800781, + "loss": 0.0315, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09488153457641602, + "rewards/margins": 9.073232650756836, + "rewards/rejected": -9.168113708496094, + "step": 1491 + }, + { + "epoch": 1.95, + "learning_rate": 1.4709528640969552e-05, + "logits/chosen": -1.8020468950271606, + "logits/rejected": -1.7656517028808594, + "logps/chosen": -179.18386840820312, + "logps/rejected": -267.0850524902344, + "loss": 0.0657, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4516657590866089, + "rewards/margins": 8.470237731933594, + "rewards/rejected": -8.921903610229492, + "step": 1492 + }, + { + "epoch": 1.95, + "learning_rate": 1.4676885142548829e-05, + "logits/chosen": -1.7168338298797607, + "logits/rejected": -1.848124623298645, + "logps/chosen": -156.4276885986328, + "logps/rejected": -247.7043914794922, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.46370768547058105, + "rewards/margins": 7.316370487213135, + "rewards/rejected": -7.780077934265137, + "step": 1493 + }, + { + "epoch": 1.96, + "learning_rate": 1.4644262848691311e-05, + "logits/chosen": -1.8042938709259033, + "logits/rejected": -1.8076025247573853, + "logps/chosen": -175.3911895751953, + "logps/rejected": -276.2196960449219, + "loss": 0.0164, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7568331956863403, + "rewards/margins": 10.083118438720703, + "rewards/rejected": -9.326284408569336, + "step": 1494 + }, + { + "epoch": 1.96, + "learning_rate": 1.4611661826406004e-05, + "logits/chosen": -1.5365004539489746, + "logits/rejected": -1.6221803426742554, + "logps/chosen": -168.2038116455078, + "logps/rejected": -237.28738403320312, + "loss": 0.0593, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.04265642166137695, + "rewards/margins": 7.6321702003479, + "rewards/rejected": -7.589514255523682, + "step": 1495 + }, + { + "epoch": 1.96, + "learning_rate": 1.4579082142658176e-05, + "logits/chosen": -1.7976524829864502, + "logits/rejected": -1.8538012504577637, + "logps/chosen": -193.39288330078125, + "logps/rejected": -317.5895080566406, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2991272807121277, + "rewards/margins": 11.215351104736328, + "rewards/rejected": -11.514477729797363, + "step": 1496 + }, + { + "epoch": 1.96, + "learning_rate": 1.4546523864369303e-05, + "logits/chosen": -1.8310483694076538, + "logits/rejected": -1.863168716430664, + "logps/chosen": -204.5284423828125, + "logps/rejected": -298.26629638671875, + "loss": 0.1011, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.34320068359375, + "rewards/margins": 6.96917724609375, + "rewards/rejected": -8.3123779296875, + "step": 1497 + }, + { + "epoch": 1.96, + "learning_rate": 1.4513987058416879e-05, + "logits/chosen": -1.8905072212219238, + "logits/rejected": -1.9070124626159668, + "logps/chosen": -210.54266357421875, + "logps/rejected": -288.186767578125, + "loss": 0.1244, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6764855980873108, + "rewards/margins": 7.883575439453125, + "rewards/rejected": -8.560060501098633, + "step": 1498 + }, + { + "epoch": 1.96, + "learning_rate": 1.448147179163431e-05, + "logits/chosen": -1.950812816619873, + "logits/rejected": -1.8666086196899414, + "logps/chosen": -211.47979736328125, + "logps/rejected": -275.7916564941406, + "loss": 0.0246, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07499128580093384, + "rewards/margins": 9.091803550720215, + "rewards/rejected": -9.166794776916504, + "step": 1499 + }, + { + "epoch": 1.96, + "learning_rate": 1.4448978130810715e-05, + "logits/chosen": -1.760766625404358, + "logits/rejected": -1.788704752922058, + "logps/chosen": -183.333740234375, + "logps/rejected": -304.87359619140625, + "loss": 0.0226, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4795857071876526, + "rewards/margins": 9.696410179138184, + "rewards/rejected": -10.175996780395508, + "step": 1500 + }, + { + "epoch": 1.96, + "learning_rate": 1.4416506142690889e-05, + "logits/chosen": -1.6347984075546265, + "logits/rejected": -1.7342019081115723, + "logps/chosen": -168.65963745117188, + "logps/rejected": -290.02252197265625, + "loss": 0.0454, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0552358627319336, + "rewards/margins": 8.927092552185059, + "rewards/rejected": -9.982328414916992, + "step": 1501 + }, + { + "epoch": 1.97, + "learning_rate": 1.4384055893975051e-05, + "logits/chosen": -1.884681224822998, + "logits/rejected": -1.9328612089157104, + "logps/chosen": -180.03875732421875, + "logps/rejected": -277.0013427734375, + "loss": 0.0882, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2654374837875366, + "rewards/margins": 9.350257873535156, + "rewards/rejected": -9.084820747375488, + "step": 1502 + }, + { + "epoch": 1.97, + "learning_rate": 1.4351627451318821e-05, + "logits/chosen": -1.8112989664077759, + "logits/rejected": -1.9305856227874756, + "logps/chosen": -184.3560333251953, + "logps/rejected": -299.67413330078125, + "loss": 0.0147, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5721713900566101, + "rewards/margins": 10.21081256866455, + "rewards/rejected": -10.782983779907227, + "step": 1503 + }, + { + "epoch": 1.97, + "learning_rate": 1.4319220881332979e-05, + "logits/chosen": -1.546270489692688, + "logits/rejected": -1.586949110031128, + "logps/chosen": -190.1439971923828, + "logps/rejected": -259.6518249511719, + "loss": 0.1048, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.05506080389022827, + "rewards/margins": 8.182172775268555, + "rewards/rejected": -8.12711238861084, + "step": 1504 + }, + { + "epoch": 1.97, + "learning_rate": 1.428683625058341e-05, + "logits/chosen": -1.820664882659912, + "logits/rejected": -1.7996609210968018, + "logps/chosen": -216.82296752929688, + "logps/rejected": -288.84417724609375, + "loss": 0.0886, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22843483090400696, + "rewards/margins": 9.70625114440918, + "rewards/rejected": -9.477816581726074, + "step": 1505 + }, + { + "epoch": 1.97, + "learning_rate": 1.4254473625590942e-05, + "logits/chosen": -1.7553366422653198, + "logits/rejected": -1.796731948852539, + "logps/chosen": -193.16140747070312, + "logps/rejected": -292.4389343261719, + "loss": 0.1373, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7500739693641663, + "rewards/margins": 8.625079154968262, + "rewards/rejected": -9.375153541564941, + "step": 1506 + }, + { + "epoch": 1.97, + "learning_rate": 1.4222133072831143e-05, + "logits/chosen": -1.941459059715271, + "logits/rejected": -2.0197572708129883, + "logps/chosen": -172.37442016601562, + "logps/rejected": -275.12939453125, + "loss": 0.066, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6190582513809204, + "rewards/margins": 8.045225143432617, + "rewards/rejected": -8.66428279876709, + "step": 1507 + }, + { + "epoch": 1.97, + "learning_rate": 1.4189814658734302e-05, + "logits/chosen": -1.935334324836731, + "logits/rejected": -1.8946447372436523, + "logps/chosen": -180.55690002441406, + "logps/rejected": -239.8633575439453, + "loss": 0.1302, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9962777495384216, + "rewards/margins": 6.127375602722168, + "rewards/rejected": -7.123652935028076, + "step": 1508 + }, + { + "epoch": 1.97, + "learning_rate": 1.415751844968522e-05, + "logits/chosen": -1.7952340841293335, + "logits/rejected": -1.7781485319137573, + "logps/chosen": -156.3070068359375, + "logps/rejected": -236.4821014404297, + "loss": 0.1022, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.295614242553711, + "rewards/margins": 6.489151954650879, + "rewards/rejected": -7.78476619720459, + "step": 1509 + }, + { + "epoch": 1.98, + "learning_rate": 1.4125244512023062e-05, + "logits/chosen": -1.6392542123794556, + "logits/rejected": -1.6742371320724487, + "logps/chosen": -172.890380859375, + "logps/rejected": -293.1167297363281, + "loss": 0.0561, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.006844118237495422, + "rewards/margins": 10.987504005432129, + "rewards/rejected": -10.980659484863281, + "step": 1510 + }, + { + "epoch": 1.98, + "learning_rate": 1.4092992912041274e-05, + "logits/chosen": -2.1413776874542236, + "logits/rejected": -2.145364284515381, + "logps/chosen": -161.0505828857422, + "logps/rejected": -217.4890899658203, + "loss": 0.1535, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5856319069862366, + "rewards/margins": 5.729649066925049, + "rewards/rejected": -6.315280914306641, + "step": 1511 + }, + { + "epoch": 1.98, + "learning_rate": 1.4060763715987418e-05, + "logits/chosen": -1.6865848302841187, + "logits/rejected": -1.7697378396987915, + "logps/chosen": -197.43948364257812, + "logps/rejected": -312.4697570800781, + "loss": 0.0168, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9023823738098145, + "rewards/margins": 10.419868469238281, + "rewards/rejected": -11.322250366210938, + "step": 1512 + }, + { + "epoch": 1.98, + "learning_rate": 1.4028556990063018e-05, + "logits/chosen": -1.9619946479797363, + "logits/rejected": -2.0120935440063477, + "logps/chosen": -208.51266479492188, + "logps/rejected": -281.7726745605469, + "loss": 0.1385, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.272727131843567, + "rewards/margins": 6.440592288970947, + "rewards/rejected": -7.713319778442383, + "step": 1513 + }, + { + "epoch": 1.98, + "learning_rate": 1.399637280042344e-05, + "logits/chosen": -1.6263587474822998, + "logits/rejected": -1.6693737506866455, + "logps/chosen": -196.84165954589844, + "logps/rejected": -305.8514099121094, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6264234185218811, + "rewards/margins": 10.212072372436523, + "rewards/rejected": -10.838496208190918, + "step": 1514 + }, + { + "epoch": 1.98, + "learning_rate": 1.3964211213177777e-05, + "logits/chosen": -1.8542428016662598, + "logits/rejected": -1.941563367843628, + "logps/chosen": -177.66531372070312, + "logps/rejected": -283.9886779785156, + "loss": 0.0178, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6152518391609192, + "rewards/margins": 10.98995590209961, + "rewards/rejected": -10.374704360961914, + "step": 1515 + }, + { + "epoch": 1.98, + "learning_rate": 1.3932072294388701e-05, + "logits/chosen": -1.9051244258880615, + "logits/rejected": -1.9027007818222046, + "logps/chosen": -190.46878051757812, + "logps/rejected": -239.47027587890625, + "loss": 0.1163, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4306082725524902, + "rewards/margins": 6.50973653793335, + "rewards/rejected": -7.94034481048584, + "step": 1516 + }, + { + "epoch": 1.99, + "learning_rate": 1.3899956110072296e-05, + "logits/chosen": -1.8895916938781738, + "logits/rejected": -1.8552770614624023, + "logps/chosen": -217.69955444335938, + "logps/rejected": -283.12786865234375, + "loss": 0.0961, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1030155420303345, + "rewards/margins": 6.898214817047119, + "rewards/rejected": -8.00123119354248, + "step": 1517 + }, + { + "epoch": 1.99, + "learning_rate": 1.386786272619795e-05, + "logits/chosen": -1.9179669618606567, + "logits/rejected": -1.891516089439392, + "logps/chosen": -218.85401916503906, + "logps/rejected": -309.1119384765625, + "loss": 0.0551, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.18665099143981934, + "rewards/margins": 10.043283462524414, + "rewards/rejected": -9.856632232666016, + "step": 1518 + }, + { + "epoch": 1.99, + "learning_rate": 1.383579220868823e-05, + "logits/chosen": -1.6797354221343994, + "logits/rejected": -1.5011869668960571, + "logps/chosen": -199.94345092773438, + "logps/rejected": -283.186767578125, + "loss": 0.0736, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8336053490638733, + "rewards/margins": 8.69570541381836, + "rewards/rejected": -9.529309272766113, + "step": 1519 + }, + { + "epoch": 1.99, + "learning_rate": 1.3803744623418751e-05, + "logits/chosen": -1.940439224243164, + "logits/rejected": -1.9891163110733032, + "logps/chosen": -174.73284912109375, + "logps/rejected": -277.4080810546875, + "loss": 0.0582, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.14671999216079712, + "rewards/margins": 9.259468078613281, + "rewards/rejected": -9.112749099731445, + "step": 1520 + }, + { + "epoch": 1.99, + "learning_rate": 1.3771720036217969e-05, + "logits/chosen": -1.8065035343170166, + "logits/rejected": -1.801666021347046, + "logps/chosen": -195.01231384277344, + "logps/rejected": -303.60565185546875, + "loss": 0.0696, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2088940143585205, + "rewards/margins": 10.52672004699707, + "rewards/rejected": -10.317826271057129, + "step": 1521 + }, + { + "epoch": 1.99, + "learning_rate": 1.3739718512867151e-05, + "logits/chosen": -1.6864298582077026, + "logits/rejected": -1.7546144723892212, + "logps/chosen": -192.529052734375, + "logps/rejected": -298.2187805175781, + "loss": 0.0487, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9811297655105591, + "rewards/margins": 10.510316848754883, + "rewards/rejected": -11.491447448730469, + "step": 1522 + }, + { + "epoch": 1.99, + "learning_rate": 1.3707740119100185e-05, + "logits/chosen": -1.9570181369781494, + "logits/rejected": -1.9724302291870117, + "logps/chosen": -203.83419799804688, + "logps/rejected": -289.02734375, + "loss": 0.1304, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.30844154953956604, + "rewards/margins": 9.108614921569824, + "rewards/rejected": -8.80017375946045, + "step": 1523 + }, + { + "epoch": 1.99, + "learning_rate": 1.3675784920603397e-05, + "logits/chosen": -1.8031600713729858, + "logits/rejected": -1.8364602327346802, + "logps/chosen": -181.02267456054688, + "logps/rejected": -271.9961853027344, + "loss": 0.0597, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04141731560230255, + "rewards/margins": 8.970625877380371, + "rewards/rejected": -9.012043952941895, + "step": 1524 + }, + { + "epoch": 2.0, + "learning_rate": 1.3643852983015524e-05, + "logits/chosen": -1.4848906993865967, + "logits/rejected": -1.4887772798538208, + "logps/chosen": -165.5267791748047, + "logps/rejected": -229.33474731445312, + "loss": 0.2686, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8461334705352783, + "rewards/margins": 7.155306816101074, + "rewards/rejected": -8.001440048217773, + "step": 1525 + }, + { + "epoch": 2.0, + "learning_rate": 1.3611944371927515e-05, + "logits/chosen": -1.8655469417572021, + "logits/rejected": -1.839209794998169, + "logps/chosen": -181.52239990234375, + "logps/rejected": -271.8775634765625, + "loss": 0.0458, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3275052309036255, + "rewards/margins": 9.104433059692383, + "rewards/rejected": -9.431938171386719, + "step": 1526 + }, + { + "epoch": 2.0, + "learning_rate": 1.3580059152882374e-05, + "logits/chosen": -1.9023470878601074, + "logits/rejected": -1.8748483657836914, + "logps/chosen": -198.76577758789062, + "logps/rejected": -269.1929626464844, + "loss": 0.0622, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.10333386063575745, + "rewards/margins": 8.20978832244873, + "rewards/rejected": -8.313121795654297, + "step": 1527 + }, + { + "epoch": 2.0, + "learning_rate": 1.3548197391375092e-05, + "logits/chosen": -1.4875879287719727, + "logits/rejected": -1.4259605407714844, + "logps/chosen": -185.81918334960938, + "logps/rejected": -240.16160583496094, + "loss": 0.1162, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8114877939224243, + "rewards/margins": 6.684213638305664, + "rewards/rejected": -7.495701313018799, + "step": 1528 + }, + { + "epoch": 2.0, + "learning_rate": 1.3516359152852443e-05, + "logits/chosen": -1.8391220569610596, + "logits/rejected": -1.8453034162521362, + "logps/chosen": -148.43589782714844, + "logps/rejected": -229.90371704101562, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.39271843433380127, + "rewards/margins": 7.75892448425293, + "rewards/rejected": -8.151642799377441, + "step": 1529 + }, + { + "epoch": 2.0, + "learning_rate": 1.348454450271292e-05, + "logits/chosen": -1.8431575298309326, + "logits/rejected": -1.8993892669677734, + "logps/chosen": -143.37904357910156, + "logps/rejected": -274.07861328125, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.551169753074646, + "rewards/margins": 12.577217102050781, + "rewards/rejected": -11.026046752929688, + "step": 1530 + }, + { + "epoch": 2.0, + "learning_rate": 1.345275350630652e-05, + "logits/chosen": -1.7807674407958984, + "logits/rejected": -1.8132703304290771, + "logps/chosen": -147.68115234375, + "logps/rejected": -255.60055541992188, + "loss": 0.0453, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0213656425476074, + "rewards/margins": 11.28833293914795, + "rewards/rejected": -10.2669677734375, + "step": 1531 + }, + { + "epoch": 2.0, + "learning_rate": 1.342098622893469e-05, + "logits/chosen": -1.7203935384750366, + "logits/rejected": -1.7435144186019897, + "logps/chosen": -170.30160522460938, + "logps/rejected": -299.8740539550781, + "loss": 0.0448, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3704142570495605, + "rewards/margins": 8.759329795837402, + "rewards/rejected": -10.129744529724121, + "step": 1532 + }, + { + "epoch": 2.01, + "learning_rate": 1.3389242735850146e-05, + "logits/chosen": -1.9918012619018555, + "logits/rejected": -1.980901837348938, + "logps/chosen": -155.14599609375, + "logps/rejected": -270.18017578125, + "loss": 0.0868, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.1512441635131836, + "rewards/margins": 11.62071704864502, + "rewards/rejected": -10.469473838806152, + "step": 1533 + }, + { + "epoch": 2.01, + "learning_rate": 1.3357523092256742e-05, + "logits/chosen": -1.527714729309082, + "logits/rejected": -1.5021893978118896, + "logps/chosen": -206.78240966796875, + "logps/rejected": -321.47137451171875, + "loss": 0.0579, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.22152179479599, + "rewards/margins": 11.234718322753906, + "rewards/rejected": -11.01319694519043, + "step": 1534 + }, + { + "epoch": 2.01, + "learning_rate": 1.3325827363309329e-05, + "logits/chosen": -1.9469248056411743, + "logits/rejected": -1.9597361087799072, + "logps/chosen": -191.61795043945312, + "logps/rejected": -256.5360107421875, + "loss": 0.0438, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.015395380556583405, + "rewards/margins": 9.715763092041016, + "rewards/rejected": -9.70036792755127, + "step": 1535 + }, + { + "epoch": 2.01, + "learning_rate": 1.3294155614113673e-05, + "logits/chosen": -2.037672758102417, + "logits/rejected": -2.0547268390655518, + "logps/chosen": -176.68716430664062, + "logps/rejected": -274.5751953125, + "loss": 0.0444, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.01030568778514862, + "rewards/margins": 9.373575210571289, + "rewards/rejected": -9.383879661560059, + "step": 1536 + }, + { + "epoch": 2.01, + "learning_rate": 1.3262507909726251e-05, + "logits/chosen": -1.5088722705841064, + "logits/rejected": -1.554348111152649, + "logps/chosen": -167.13824462890625, + "logps/rejected": -263.2229309082031, + "loss": 0.0868, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.318091481924057, + "rewards/margins": 8.720268249511719, + "rewards/rejected": -9.038360595703125, + "step": 1537 + }, + { + "epoch": 2.01, + "learning_rate": 1.3230884315154163e-05, + "logits/chosen": -1.869170904159546, + "logits/rejected": -1.913615107536316, + "logps/chosen": -165.81358337402344, + "logps/rejected": -293.13629150390625, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2461726665496826, + "rewards/margins": 12.157745361328125, + "rewards/rejected": -10.911572456359863, + "step": 1538 + }, + { + "epoch": 2.01, + "learning_rate": 1.3199284895355002e-05, + "logits/chosen": -1.9871957302093506, + "logits/rejected": -1.999311089515686, + "logps/chosen": -162.21644592285156, + "logps/rejected": -217.64495849609375, + "loss": 0.0982, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.05965733900666237, + "rewards/margins": 6.789799213409424, + "rewards/rejected": -6.730141639709473, + "step": 1539 + }, + { + "epoch": 2.02, + "learning_rate": 1.316770971523667e-05, + "logits/chosen": -1.63510262966156, + "logits/rejected": -1.63307523727417, + "logps/chosen": -160.36801147460938, + "logps/rejected": -294.63671875, + "loss": 0.0442, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.9374539852142334, + "rewards/margins": 11.629518508911133, + "rewards/rejected": -10.69206428527832, + "step": 1540 + }, + { + "epoch": 2.02, + "learning_rate": 1.3136158839657287e-05, + "logits/chosen": -1.5396658182144165, + "logits/rejected": -1.5948903560638428, + "logps/chosen": -174.12709045410156, + "logps/rejected": -318.6853332519531, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.054802656173706, + "rewards/margins": 13.222975730895996, + "rewards/rejected": -12.168172836303711, + "step": 1541 + }, + { + "epoch": 2.02, + "learning_rate": 1.3104632333425066e-05, + "logits/chosen": -1.5707889795303345, + "logits/rejected": -1.5340653657913208, + "logps/chosen": -213.45535278320312, + "logps/rejected": -308.8717346191406, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.197975516319275, + "rewards/margins": 8.70287799835205, + "rewards/rejected": -9.900854110717773, + "step": 1542 + }, + { + "epoch": 2.02, + "learning_rate": 1.3073130261298167e-05, + "logits/chosen": -1.7302970886230469, + "logits/rejected": -1.8362350463867188, + "logps/chosen": -190.625732421875, + "logps/rejected": -259.4525146484375, + "loss": 0.1451, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7216106653213501, + "rewards/margins": 6.380286693572998, + "rewards/rejected": -7.101897239685059, + "step": 1543 + }, + { + "epoch": 2.02, + "learning_rate": 1.3041652687984535e-05, + "logits/chosen": -1.7102713584899902, + "logits/rejected": -1.6913330554962158, + "logps/chosen": -198.17388916015625, + "logps/rejected": -296.4353942871094, + "loss": 0.1306, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.157545804977417, + "rewards/margins": 9.085430145263672, + "rewards/rejected": -9.242976188659668, + "step": 1544 + }, + { + "epoch": 2.02, + "learning_rate": 1.3010199678141793e-05, + "logits/chosen": -1.8154162168502808, + "logits/rejected": -1.806509256362915, + "logps/chosen": -196.5818634033203, + "logps/rejected": -343.44024658203125, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1952199935913086, + "rewards/margins": 14.458597183227539, + "rewards/rejected": -13.263376235961914, + "step": 1545 + }, + { + "epoch": 2.02, + "learning_rate": 1.297877129637714e-05, + "logits/chosen": -1.5137269496917725, + "logits/rejected": -1.5175082683563232, + "logps/chosen": -217.48431396484375, + "logps/rejected": -333.30291748046875, + "loss": 0.13, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.333642840385437, + "rewards/margins": 10.544397354125977, + "rewards/rejected": -10.878040313720703, + "step": 1546 + }, + { + "epoch": 2.02, + "learning_rate": 1.2947367607247168e-05, + "logits/chosen": -1.9852371215820312, + "logits/rejected": -1.9342000484466553, + "logps/chosen": -176.0267333984375, + "logps/rejected": -307.2276306152344, + "loss": 0.0449, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.1807262897491455, + "rewards/margins": 10.857745170593262, + "rewards/rejected": -9.677019119262695, + "step": 1547 + }, + { + "epoch": 2.03, + "learning_rate": 1.2915988675257729e-05, + "logits/chosen": -1.8818638324737549, + "logits/rejected": -1.957972764968872, + "logps/chosen": -149.12771606445312, + "logps/rejected": -285.6009521484375, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2972172498703003, + "rewards/margins": 11.586394309997559, + "rewards/rejected": -11.289176940917969, + "step": 1548 + }, + { + "epoch": 2.03, + "learning_rate": 1.2884634564863853e-05, + "logits/chosen": -1.7779502868652344, + "logits/rejected": -1.7382148504257202, + "logps/chosen": -191.65310668945312, + "logps/rejected": -301.51947021484375, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.245689034461975, + "rewards/margins": 10.16487979888916, + "rewards/rejected": -11.410568237304688, + "step": 1549 + }, + { + "epoch": 2.03, + "learning_rate": 1.2853305340469592e-05, + "logits/chosen": -1.712690830230713, + "logits/rejected": -1.675184965133667, + "logps/chosen": -176.84695434570312, + "logps/rejected": -274.3833312988281, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07334829866886139, + "rewards/margins": 9.532466888427734, + "rewards/rejected": -9.459117889404297, + "step": 1550 + }, + { + "epoch": 2.03, + "learning_rate": 1.2822001066427818e-05, + "logits/chosen": -1.6040308475494385, + "logits/rejected": -1.63424813747406, + "logps/chosen": -205.90966796875, + "logps/rejected": -326.33575439453125, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7295277118682861, + "rewards/margins": 11.217487335205078, + "rewards/rejected": -10.487959861755371, + "step": 1551 + }, + { + "epoch": 2.03, + "learning_rate": 1.2790721807040216e-05, + "logits/chosen": -1.9036675691604614, + "logits/rejected": -1.8898967504501343, + "logps/chosen": -178.674560546875, + "logps/rejected": -297.104248046875, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09359598159790039, + "rewards/margins": 11.31398868560791, + "rewards/rejected": -11.407584190368652, + "step": 1552 + }, + { + "epoch": 2.03, + "learning_rate": 1.2759467626557076e-05, + "logits/chosen": -1.575809359550476, + "logits/rejected": -1.6393325328826904, + "logps/chosen": -139.43023681640625, + "logps/rejected": -237.76670837402344, + "loss": 0.0444, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.12280965596437454, + "rewards/margins": 9.348191261291504, + "rewards/rejected": -9.225381851196289, + "step": 1553 + }, + { + "epoch": 2.03, + "learning_rate": 1.2728238589177141e-05, + "logits/chosen": -1.9256058931350708, + "logits/rejected": -1.9834691286087036, + "logps/chosen": -158.78602600097656, + "logps/rejected": -246.46710205078125, + "loss": 0.0437, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.4418870806694031, + "rewards/margins": 10.31446647644043, + "rewards/rejected": -9.872579574584961, + "step": 1554 + }, + { + "epoch": 2.04, + "learning_rate": 1.2697034759047561e-05, + "logits/chosen": -1.7548719644546509, + "logits/rejected": -1.8134605884552002, + "logps/chosen": -180.61431884765625, + "logps/rejected": -298.4842529296875, + "loss": 0.0449, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.028109461069107056, + "rewards/margins": 11.055364608764648, + "rewards/rejected": -11.083474159240723, + "step": 1555 + }, + { + "epoch": 2.04, + "learning_rate": 1.2665856200263649e-05, + "logits/chosen": -1.738784670829773, + "logits/rejected": -1.7976702451705933, + "logps/chosen": -175.2208709716797, + "logps/rejected": -299.0406494140625, + "loss": 0.0455, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.17027875781059265, + "rewards/margins": 11.51250171661377, + "rewards/rejected": -11.34222412109375, + "step": 1556 + }, + { + "epoch": 2.04, + "learning_rate": 1.2634702976868868e-05, + "logits/chosen": -1.8731894493103027, + "logits/rejected": -1.9213989973068237, + "logps/chosen": -186.83779907226562, + "logps/rejected": -281.11663818359375, + "loss": 0.0436, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6948243975639343, + "rewards/margins": 9.840022087097168, + "rewards/rejected": -10.534846305847168, + "step": 1557 + }, + { + "epoch": 2.04, + "learning_rate": 1.2603575152854582e-05, + "logits/chosen": -1.876339077949524, + "logits/rejected": -1.8815693855285645, + "logps/chosen": -176.52540588378906, + "logps/rejected": -269.7199401855469, + "loss": 0.0637, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7384051084518433, + "rewards/margins": 8.616518020629883, + "rewards/rejected": -9.354923248291016, + "step": 1558 + }, + { + "epoch": 2.04, + "learning_rate": 1.2572472792160029e-05, + "logits/chosen": -1.940361499786377, + "logits/rejected": -1.8838485479354858, + "logps/chosen": -193.45838928222656, + "logps/rejected": -272.8813171386719, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8808805346488953, + "rewards/margins": 10.543073654174805, + "rewards/rejected": -9.662193298339844, + "step": 1559 + }, + { + "epoch": 2.04, + "learning_rate": 1.2541395958672128e-05, + "logits/chosen": -1.799181342124939, + "logits/rejected": -1.7694520950317383, + "logps/chosen": -165.37045288085938, + "logps/rejected": -275.53570556640625, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8924998044967651, + "rewards/margins": 11.558836936950684, + "rewards/rejected": -10.666337013244629, + "step": 1560 + }, + { + "epoch": 2.04, + "learning_rate": 1.2510344716225353e-05, + "logits/chosen": -1.8289241790771484, + "logits/rejected": -1.8177402019500732, + "logps/chosen": -164.09075927734375, + "logps/rejected": -271.72601318359375, + "loss": 0.0441, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.11308735609054565, + "rewards/margins": 9.504997253417969, + "rewards/rejected": -9.3919095993042, + "step": 1561 + }, + { + "epoch": 2.04, + "learning_rate": 1.247931912860161e-05, + "logits/chosen": -1.8938099145889282, + "logits/rejected": -1.8855090141296387, + "logps/chosen": -174.61190795898438, + "logps/rejected": -258.94482421875, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6930776834487915, + "rewards/margins": 8.374161720275879, + "rewards/rejected": -9.067239761352539, + "step": 1562 + }, + { + "epoch": 2.05, + "learning_rate": 1.2448319259530129e-05, + "logits/chosen": -1.826311469078064, + "logits/rejected": -1.8732446432113647, + "logps/chosen": -190.18638610839844, + "logps/rejected": -283.4232177734375, + "loss": 0.1302, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24883827567100525, + "rewards/margins": 9.094849586486816, + "rewards/rejected": -9.343688011169434, + "step": 1563 + }, + { + "epoch": 2.05, + "learning_rate": 1.2417345172687303e-05, + "logits/chosen": -1.6359118223190308, + "logits/rejected": -1.6801053285598755, + "logps/chosen": -169.96446228027344, + "logps/rejected": -259.3656921386719, + "loss": 0.0869, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6407762765884399, + "rewards/margins": 10.16246509552002, + "rewards/rejected": -9.521688461303711, + "step": 1564 + }, + { + "epoch": 2.05, + "learning_rate": 1.2386396931696545e-05, + "logits/chosen": -1.9608029127120972, + "logits/rejected": -1.9681981801986694, + "logps/chosen": -173.88815307617188, + "logps/rejected": -262.5113525390625, + "loss": 0.0189, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3761565387248993, + "rewards/margins": 8.115509986877441, + "rewards/rejected": -8.491666793823242, + "step": 1565 + }, + { + "epoch": 2.05, + "learning_rate": 1.235547460012822e-05, + "logits/chosen": -1.9599274396896362, + "logits/rejected": -1.915431022644043, + "logps/chosen": -169.6197967529297, + "logps/rejected": -255.58401489257812, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3725338578224182, + "rewards/margins": 8.86599349975586, + "rewards/rejected": -9.238527297973633, + "step": 1566 + }, + { + "epoch": 2.05, + "learning_rate": 1.2324578241499434e-05, + "logits/chosen": -1.4780423641204834, + "logits/rejected": -1.4643298387527466, + "logps/chosen": -172.65921020507812, + "logps/rejected": -290.62542724609375, + "loss": 0.0458, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5282078385353088, + "rewards/margins": 10.207393646240234, + "rewards/rejected": -10.735601425170898, + "step": 1567 + }, + { + "epoch": 2.05, + "learning_rate": 1.2293707919273951e-05, + "logits/chosen": -1.9044053554534912, + "logits/rejected": -1.9357309341430664, + "logps/chosen": -172.81166076660156, + "logps/rejected": -314.439453125, + "loss": 0.044, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.2622718811035156, + "rewards/margins": 12.844025611877441, + "rewards/rejected": -11.58175277709961, + "step": 1568 + }, + { + "epoch": 2.05, + "learning_rate": 1.2262863696862067e-05, + "logits/chosen": -1.8907290697097778, + "logits/rejected": -1.932104229927063, + "logps/chosen": -169.04861450195312, + "logps/rejected": -234.15304565429688, + "loss": 0.1754, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4177451729774475, + "rewards/margins": 6.046030044555664, + "rewards/rejected": -6.463775634765625, + "step": 1569 + }, + { + "epoch": 2.05, + "learning_rate": 1.223204563762047e-05, + "logits/chosen": -1.6060715913772583, + "logits/rejected": -1.6463336944580078, + "logps/chosen": -153.34140014648438, + "logps/rejected": -263.79669189453125, + "loss": 0.0438, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.2805473804473877, + "rewards/margins": 10.844926834106445, + "rewards/rejected": -9.564379692077637, + "step": 1570 + }, + { + "epoch": 2.06, + "learning_rate": 1.2201253804852081e-05, + "logits/chosen": -1.9487419128417969, + "logits/rejected": -2.008817672729492, + "logps/chosen": -178.88906860351562, + "logps/rejected": -303.64898681640625, + "loss": 0.0437, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1704249083995819, + "rewards/margins": 10.515235900878906, + "rewards/rejected": -10.34481143951416, + "step": 1571 + }, + { + "epoch": 2.06, + "learning_rate": 1.2170488261805978e-05, + "logits/chosen": -1.8077938556671143, + "logits/rejected": -1.8451355695724487, + "logps/chosen": -156.46859741210938, + "logps/rejected": -310.28692626953125, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.536043643951416, + "rewards/margins": 13.092052459716797, + "rewards/rejected": -11.556008338928223, + "step": 1572 + }, + { + "epoch": 2.06, + "learning_rate": 1.2139749071677215e-05, + "logits/chosen": -2.0997793674468994, + "logits/rejected": -2.1545159816741943, + "logps/chosen": -164.85052490234375, + "logps/rejected": -275.2083740234375, + "loss": 0.1305, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.3790677785873413, + "rewards/margins": 10.304277420043945, + "rewards/rejected": -9.925209045410156, + "step": 1573 + }, + { + "epoch": 2.06, + "learning_rate": 1.2109036297606733e-05, + "logits/chosen": -1.8115272521972656, + "logits/rejected": -1.8162680864334106, + "logps/chosen": -206.42071533203125, + "logps/rejected": -321.3243713378906, + "loss": 0.0457, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.006946034729480743, + "rewards/margins": 10.70283317565918, + "rewards/rejected": -10.695886611938477, + "step": 1574 + }, + { + "epoch": 2.06, + "learning_rate": 1.207835000268119e-05, + "logits/chosen": -1.6967673301696777, + "logits/rejected": -1.777079701423645, + "logps/chosen": -153.98355102539062, + "logps/rejected": -268.57708740234375, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2870631515979767, + "rewards/margins": 10.082841873168945, + "rewards/rejected": -9.795778274536133, + "step": 1575 + }, + { + "epoch": 2.06, + "learning_rate": 1.2047690249932881e-05, + "logits/chosen": -1.743074893951416, + "logits/rejected": -1.7276415824890137, + "logps/chosen": -154.81675720214844, + "logps/rejected": -257.8333435058594, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3405160903930664, + "rewards/margins": 9.382745742797852, + "rewards/rejected": -9.723262786865234, + "step": 1576 + }, + { + "epoch": 2.06, + "learning_rate": 1.2017057102339579e-05, + "logits/chosen": -2.004493236541748, + "logits/rejected": -2.0227136611938477, + "logps/chosen": -203.96725463867188, + "logps/rejected": -323.3419189453125, + "loss": 0.096, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.48205581307411194, + "rewards/margins": 11.126084327697754, + "rewards/rejected": -10.644027709960938, + "step": 1577 + }, + { + "epoch": 2.07, + "learning_rate": 1.198645062282436e-05, + "logits/chosen": -1.8242206573486328, + "logits/rejected": -1.811288833618164, + "logps/chosen": -166.9787139892578, + "logps/rejected": -282.35833740234375, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6162128448486328, + "rewards/margins": 10.101678848266602, + "rewards/rejected": -10.717890739440918, + "step": 1578 + }, + { + "epoch": 2.07, + "learning_rate": 1.1955870874255581e-05, + "logits/chosen": -1.8448400497436523, + "logits/rejected": -1.8529481887817383, + "logps/chosen": -222.9049835205078, + "logps/rejected": -289.4002685546875, + "loss": 0.0441, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0007186774164438248, + "rewards/margins": 9.15060806274414, + "rewards/rejected": -9.151325225830078, + "step": 1579 + }, + { + "epoch": 2.07, + "learning_rate": 1.1925317919446674e-05, + "logits/chosen": -1.707364797592163, + "logits/rejected": -1.6845389604568481, + "logps/chosen": -167.9447021484375, + "logps/rejected": -284.65289306640625, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09448209404945374, + "rewards/margins": 11.150354385375977, + "rewards/rejected": -11.244836807250977, + "step": 1580 + }, + { + "epoch": 2.07, + "learning_rate": 1.189479182115601e-05, + "logits/chosen": -1.7035338878631592, + "logits/rejected": -1.6998414993286133, + "logps/chosen": -189.14300537109375, + "logps/rejected": -302.6960754394531, + "loss": 0.0435, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.058510057628154755, + "rewards/margins": 10.953461647033691, + "rewards/rejected": -11.011971473693848, + "step": 1581 + }, + { + "epoch": 2.07, + "learning_rate": 1.1864292642086821e-05, + "logits/chosen": -1.9866160154342651, + "logits/rejected": -1.9344441890716553, + "logps/chosen": -198.30035400390625, + "logps/rejected": -278.4330139160156, + "loss": 0.0892, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.35245728492736816, + "rewards/margins": 8.703463554382324, + "rewards/rejected": -9.055920600891113, + "step": 1582 + }, + { + "epoch": 2.07, + "learning_rate": 1.1833820444887047e-05, + "logits/chosen": -2.044182300567627, + "logits/rejected": -2.031755208969116, + "logps/chosen": -193.3849334716797, + "logps/rejected": -301.7840576171875, + "loss": 0.0867, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.1335229873657227, + "rewards/margins": 10.915968894958496, + "rewards/rejected": -9.782445907592773, + "step": 1583 + }, + { + "epoch": 2.07, + "learning_rate": 1.1803375292149188e-05, + "logits/chosen": -1.7033708095550537, + "logits/rejected": -1.7231299877166748, + "logps/chosen": -153.70217895507812, + "logps/rejected": -304.10284423828125, + "loss": 0.0887, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.18422439694404602, + "rewards/margins": 10.87944507598877, + "rewards/rejected": -10.695219993591309, + "step": 1584 + }, + { + "epoch": 2.07, + "learning_rate": 1.1772957246410182e-05, + "logits/chosen": -1.9338066577911377, + "logits/rejected": -2.00685715675354, + "logps/chosen": -178.58847045898438, + "logps/rejected": -298.1968078613281, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9479953050613403, + "rewards/margins": 10.410967826843262, + "rewards/rejected": -9.462972640991211, + "step": 1585 + }, + { + "epoch": 2.08, + "learning_rate": 1.174256637015132e-05, + "logits/chosen": -1.946398138999939, + "logits/rejected": -1.9503860473632812, + "logps/chosen": -148.6555938720703, + "logps/rejected": -240.55503845214844, + "loss": 0.046, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.38876765966415405, + "rewards/margins": 9.896851539611816, + "rewards/rejected": -9.508084297180176, + "step": 1586 + }, + { + "epoch": 2.08, + "learning_rate": 1.1712202725798072e-05, + "logits/chosen": -1.674027442932129, + "logits/rejected": -1.6356555223464966, + "logps/chosen": -149.5840301513672, + "logps/rejected": -232.08218383789062, + "loss": 0.1303, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.5469903945922852, + "rewards/margins": 8.101824760437012, + "rewards/rejected": -7.554834365844727, + "step": 1587 + }, + { + "epoch": 2.08, + "learning_rate": 1.1681866375719962e-05, + "logits/chosen": -1.8917112350463867, + "logits/rejected": -1.889296054840088, + "logps/chosen": -192.59689331054688, + "logps/rejected": -270.35418701171875, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6063494086265564, + "rewards/margins": 8.95035171508789, + "rewards/rejected": -9.556700706481934, + "step": 1588 + }, + { + "epoch": 2.08, + "learning_rate": 1.1651557382230444e-05, + "logits/chosen": -1.722730040550232, + "logits/rejected": -1.7723509073257446, + "logps/chosen": -151.0909423828125, + "logps/rejected": -300.1007995605469, + "loss": 0.0447, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.04151998460292816, + "rewards/margins": 11.873591423034668, + "rewards/rejected": -11.832071304321289, + "step": 1589 + }, + { + "epoch": 2.08, + "learning_rate": 1.1621275807586799e-05, + "logits/chosen": -1.8380475044250488, + "logits/rejected": -1.8655589818954468, + "logps/chosen": -179.2967529296875, + "logps/rejected": -260.4557800292969, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4083634614944458, + "rewards/margins": 8.877779960632324, + "rewards/rejected": -9.286144256591797, + "step": 1590 + }, + { + "epoch": 2.08, + "learning_rate": 1.1591021713989986e-05, + "logits/chosen": -2.031733989715576, + "logits/rejected": -2.025977611541748, + "logps/chosen": -237.14959716796875, + "logps/rejected": -316.32666015625, + "loss": 0.0447, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9425259232521057, + "rewards/margins": 8.197513580322266, + "rewards/rejected": -9.140039443969727, + "step": 1591 + }, + { + "epoch": 2.08, + "learning_rate": 1.1560795163584492e-05, + "logits/chosen": -1.7341399192810059, + "logits/rejected": -1.805586576461792, + "logps/chosen": -164.51083374023438, + "logps/rejected": -280.4045104980469, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9063906669616699, + "rewards/margins": 12.112311363220215, + "rewards/rejected": -11.205921173095703, + "step": 1592 + }, + { + "epoch": 2.08, + "learning_rate": 1.153059621845825e-05, + "logits/chosen": -1.7905223369598389, + "logits/rejected": -1.9217365980148315, + "logps/chosen": -180.30960083007812, + "logps/rejected": -299.43023681640625, + "loss": 0.0437, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2291586548089981, + "rewards/margins": 10.62186050415039, + "rewards/rejected": -10.392702102661133, + "step": 1593 + }, + { + "epoch": 2.09, + "learning_rate": 1.1500424940642507e-05, + "logits/chosen": -1.7739769220352173, + "logits/rejected": -1.8366410732269287, + "logps/chosen": -160.60951232910156, + "logps/rejected": -240.15560913085938, + "loss": 0.0877, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.45145949721336365, + "rewards/margins": 8.882786750793457, + "rewards/rejected": -8.431326866149902, + "step": 1594 + }, + { + "epoch": 2.09, + "learning_rate": 1.1470281392111611e-05, + "logits/chosen": -1.3909509181976318, + "logits/rejected": -1.4538073539733887, + "logps/chosen": -189.73439025878906, + "logps/rejected": -312.4124450683594, + "loss": 0.0437, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.026380524039268494, + "rewards/margins": 11.454268455505371, + "rewards/rejected": -11.4806489944458, + "step": 1595 + }, + { + "epoch": 2.09, + "learning_rate": 1.144016563478302e-05, + "logits/chosen": -1.7928729057312012, + "logits/rejected": -1.7446744441986084, + "logps/chosen": -200.19595336914062, + "logps/rejected": -280.40399169921875, + "loss": 0.0439, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.16370874643325806, + "rewards/margins": 8.618330955505371, + "rewards/rejected": -8.454622268676758, + "step": 1596 + }, + { + "epoch": 2.09, + "learning_rate": 1.1410077730517089e-05, + "logits/chosen": -1.7523415088653564, + "logits/rejected": -1.7710070610046387, + "logps/chosen": -279.132080078125, + "logps/rejected": -401.2290344238281, + "loss": 0.087, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.1064532995224, + "rewards/margins": 11.154422760009766, + "rewards/rejected": -10.047968864440918, + "step": 1597 + }, + { + "epoch": 2.09, + "learning_rate": 1.1380017741116933e-05, + "logits/chosen": -1.8794658184051514, + "logits/rejected": -1.8197338581085205, + "logps/chosen": -198.9808349609375, + "logps/rejected": -304.1878356933594, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6200318336486816, + "rewards/margins": 12.087699890136719, + "rewards/rejected": -11.467667579650879, + "step": 1598 + }, + { + "epoch": 2.09, + "learning_rate": 1.134998572832837e-05, + "logits/chosen": -1.746663212776184, + "logits/rejected": -1.820358395576477, + "logps/chosen": -174.73336791992188, + "logps/rejected": -325.8305358886719, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11171448230743408, + "rewards/margins": 12.630899429321289, + "rewards/rejected": -12.519185066223145, + "step": 1599 + }, + { + "epoch": 2.09, + "learning_rate": 1.1319981753839709e-05, + "logits/chosen": -1.9953845739364624, + "logits/rejected": -2.0003836154937744, + "logps/chosen": -156.08651733398438, + "logps/rejected": -258.3033142089844, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8491846919059753, + "rewards/margins": 10.991496086120605, + "rewards/rejected": -10.142311096191406, + "step": 1600 + }, + { + "epoch": 2.1, + "learning_rate": 1.129000587928171e-05, + "logits/chosen": -1.4941459894180298, + "logits/rejected": -1.5289289951324463, + "logps/chosen": -168.3614959716797, + "logps/rejected": -305.6832275390625, + "loss": 0.0437, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.19267244637012482, + "rewards/margins": 11.463333129882812, + "rewards/rejected": -11.656005859375, + "step": 1601 + }, + { + "epoch": 2.1, + "learning_rate": 1.1260058166227364e-05, + "logits/chosen": -1.6332361698150635, + "logits/rejected": -1.646107792854309, + "logps/chosen": -178.55096435546875, + "logps/rejected": -266.9166564941406, + "loss": 0.0452, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.12308045476675034, + "rewards/margins": 9.401651382446289, + "rewards/rejected": -9.524730682373047, + "step": 1602 + }, + { + "epoch": 2.1, + "learning_rate": 1.1230138676191857e-05, + "logits/chosen": -1.7331981658935547, + "logits/rejected": -1.7423471212387085, + "logps/chosen": -181.67837524414062, + "logps/rejected": -294.7712707519531, + "loss": 0.0435, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.27864792943000793, + "rewards/margins": 10.941581726074219, + "rewards/rejected": -11.220229148864746, + "step": 1603 + }, + { + "epoch": 2.1, + "learning_rate": 1.1200247470632393e-05, + "logits/chosen": -1.6798791885375977, + "logits/rejected": -1.7434027194976807, + "logps/chosen": -182.3284912109375, + "logps/rejected": -318.9844970703125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3225843608379364, + "rewards/margins": 11.364851951599121, + "rewards/rejected": -11.042265892028809, + "step": 1604 + }, + { + "epoch": 2.1, + "learning_rate": 1.1170384610948065e-05, + "logits/chosen": -1.779942274093628, + "logits/rejected": -1.6583943367004395, + "logps/chosen": -177.961669921875, + "logps/rejected": -264.97259521484375, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2990310788154602, + "rewards/margins": 9.055750846862793, + "rewards/rejected": -9.354782104492188, + "step": 1605 + }, + { + "epoch": 2.1, + "learning_rate": 1.1140550158479737e-05, + "logits/chosen": -1.7814364433288574, + "logits/rejected": -1.8094960451126099, + "logps/chosen": -191.44149780273438, + "logps/rejected": -298.1101989746094, + "loss": 0.0436, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07657552510499954, + "rewards/margins": 10.405778884887695, + "rewards/rejected": -10.482353210449219, + "step": 1606 + }, + { + "epoch": 2.1, + "learning_rate": 1.1110744174509952e-05, + "logits/chosen": -1.8888778686523438, + "logits/rejected": -1.97819185256958, + "logps/chosen": -150.6721954345703, + "logps/rejected": -275.5218811035156, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7233006954193115, + "rewards/margins": 12.13279914855957, + "rewards/rejected": -11.40949821472168, + "step": 1607 + }, + { + "epoch": 2.1, + "learning_rate": 1.1080966720262737e-05, + "logits/chosen": -1.7064411640167236, + "logits/rejected": -1.6912921667099, + "logps/chosen": -167.0463104248047, + "logps/rejected": -272.361328125, + "loss": 0.0444, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.11471748352050781, + "rewards/margins": 9.870677947998047, + "rewards/rejected": -9.985396385192871, + "step": 1608 + }, + { + "epoch": 2.11, + "learning_rate": 1.1051217856903551e-05, + "logits/chosen": -1.893139123916626, + "logits/rejected": -1.9162890911102295, + "logps/chosen": -169.37657165527344, + "logps/rejected": -276.80767822265625, + "loss": 0.0435, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9231244921684265, + "rewards/margins": 11.909534454345703, + "rewards/rejected": -10.986411094665527, + "step": 1609 + }, + { + "epoch": 2.11, + "learning_rate": 1.1021497645539115e-05, + "logits/chosen": -1.8868383169174194, + "logits/rejected": -1.9275648593902588, + "logps/chosen": -200.1304473876953, + "logps/rejected": -295.7847900390625, + "loss": 0.044, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.11123043298721313, + "rewards/margins": 9.088184356689453, + "rewards/rejected": -9.19941520690918, + "step": 1610 + }, + { + "epoch": 2.11, + "learning_rate": 1.0991806147217282e-05, + "logits/chosen": -1.7267825603485107, + "logits/rejected": -1.760359287261963, + "logps/chosen": -208.52496337890625, + "logps/rejected": -324.5615234375, + "loss": 0.0868, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20633268356323242, + "rewards/margins": 11.242762565612793, + "rewards/rejected": -11.449095726013184, + "step": 1611 + }, + { + "epoch": 2.11, + "learning_rate": 1.0962143422926929e-05, + "logits/chosen": -2.08742356300354, + "logits/rejected": -2.1348607540130615, + "logps/chosen": -166.95785522460938, + "logps/rejected": -253.4603729248047, + "loss": 0.0903, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.49011409282684326, + "rewards/margins": 7.9456048011779785, + "rewards/rejected": -8.435718536376953, + "step": 1612 + }, + { + "epoch": 2.11, + "learning_rate": 1.0932509533597843e-05, + "logits/chosen": -1.7041451930999756, + "logits/rejected": -1.7591880559921265, + "logps/chosen": -164.47991943359375, + "logps/rejected": -270.92425537109375, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8775690197944641, + "rewards/margins": 10.522651672363281, + "rewards/rejected": -9.6450834274292, + "step": 1613 + }, + { + "epoch": 2.11, + "learning_rate": 1.0902904540100587e-05, + "logits/chosen": -1.719311237335205, + "logits/rejected": -1.7044110298156738, + "logps/chosen": -180.66668701171875, + "logps/rejected": -307.71392822265625, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.039889901876449585, + "rewards/margins": 11.706808090209961, + "rewards/rejected": -11.66691780090332, + "step": 1614 + }, + { + "epoch": 2.11, + "learning_rate": 1.0873328503246336e-05, + "logits/chosen": -1.874279260635376, + "logits/rejected": -1.9009246826171875, + "logps/chosen": -174.0518798828125, + "logps/rejected": -256.1578063964844, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06289811432361603, + "rewards/margins": 8.905080795288086, + "rewards/rejected": -8.967979431152344, + "step": 1615 + }, + { + "epoch": 2.11, + "learning_rate": 1.0843781483786823e-05, + "logits/chosen": -1.636064887046814, + "logits/rejected": -1.6881489753723145, + "logps/chosen": -161.75390625, + "logps/rejected": -256.316650390625, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05496731400489807, + "rewards/margins": 9.652389526367188, + "rewards/rejected": -9.59742259979248, + "step": 1616 + }, + { + "epoch": 2.12, + "learning_rate": 1.081426354241414e-05, + "logits/chosen": -1.7782402038574219, + "logits/rejected": -1.8061838150024414, + "logps/chosen": -155.12046813964844, + "logps/rejected": -254.95724487304688, + "loss": 0.0453, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.40571296215057373, + "rewards/margins": 8.516573905944824, + "rewards/rejected": -8.922286987304688, + "step": 1617 + }, + { + "epoch": 2.12, + "learning_rate": 1.0784774739760694e-05, + "logits/chosen": -1.6029926538467407, + "logits/rejected": -1.5955368280410767, + "logps/chosen": -164.0563201904297, + "logps/rejected": -274.7829284667969, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9035606980323792, + "rewards/margins": 11.742399215698242, + "rewards/rejected": -10.838838577270508, + "step": 1618 + }, + { + "epoch": 2.12, + "learning_rate": 1.075531513639899e-05, + "logits/chosen": -1.6708571910858154, + "logits/rejected": -1.682619571685791, + "logps/chosen": -190.71385192871094, + "logps/rejected": -274.246337890625, + "loss": 0.0497, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8636829853057861, + "rewards/margins": 8.821520805358887, + "rewards/rejected": -9.685203552246094, + "step": 1619 + }, + { + "epoch": 2.12, + "learning_rate": 1.0725884792841598e-05, + "logits/chosen": -1.6778228282928467, + "logits/rejected": -1.7020595073699951, + "logps/chosen": -173.37705993652344, + "logps/rejected": -278.74420166015625, + "loss": 0.0884, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6656838655471802, + "rewards/margins": 9.953472137451172, + "rewards/rejected": -10.61915397644043, + "step": 1620 + }, + { + "epoch": 2.12, + "learning_rate": 1.0696483769540974e-05, + "logits/chosen": -1.7680715322494507, + "logits/rejected": -1.7570364475250244, + "logps/chosen": -171.22381591796875, + "logps/rejected": -318.7603759765625, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3030441403388977, + "rewards/margins": 11.448980331420898, + "rewards/rejected": -11.145936012268066, + "step": 1621 + }, + { + "epoch": 2.12, + "learning_rate": 1.0667112126889314e-05, + "logits/chosen": -1.735276699066162, + "logits/rejected": -1.700201153755188, + "logps/chosen": -201.6254119873047, + "logps/rejected": -305.27105712890625, + "loss": 0.0876, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5756285190582275, + "rewards/margins": 12.081156730651855, + "rewards/rejected": -11.505528450012207, + "step": 1622 + }, + { + "epoch": 2.12, + "learning_rate": 1.0637769925218502e-05, + "logits/chosen": -1.9475431442260742, + "logits/rejected": -1.9481316804885864, + "logps/chosen": -181.50909423828125, + "logps/rejected": -293.2982482910156, + "loss": 0.0088, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8867416381835938, + "rewards/margins": 10.009493827819824, + "rewards/rejected": -10.896235466003418, + "step": 1623 + }, + { + "epoch": 2.13, + "learning_rate": 1.0608457224799953e-05, + "logits/chosen": -1.5991860628128052, + "logits/rejected": -1.636041283607483, + "logps/chosen": -169.8463897705078, + "logps/rejected": -262.57867431640625, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6114577651023865, + "rewards/margins": 8.335268020629883, + "rewards/rejected": -8.946724891662598, + "step": 1624 + }, + { + "epoch": 2.13, + "learning_rate": 1.0579174085844442e-05, + "logits/chosen": -1.8283710479736328, + "logits/rejected": -1.8787097930908203, + "logps/chosen": -181.96414184570312, + "logps/rejected": -299.1936340332031, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5312442779541016, + "rewards/margins": 10.204803466796875, + "rewards/rejected": -10.73604679107666, + "step": 1625 + }, + { + "epoch": 2.13, + "learning_rate": 1.0549920568502065e-05, + "logits/chosen": -1.8167983293533325, + "logits/rejected": -1.747694969177246, + "logps/chosen": -167.78123474121094, + "logps/rejected": -301.7006530761719, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3517674207687378, + "rewards/margins": 13.407731056213379, + "rewards/rejected": -12.055963516235352, + "step": 1626 + }, + { + "epoch": 2.13, + "learning_rate": 1.0520696732862057e-05, + "logits/chosen": -1.9232838153839111, + "logits/rejected": -1.9250985383987427, + "logps/chosen": -187.1770782470703, + "logps/rejected": -290.00518798828125, + "loss": 0.044, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23508702218532562, + "rewards/margins": 10.48847484588623, + "rewards/rejected": -10.253388404846191, + "step": 1627 + }, + { + "epoch": 2.13, + "learning_rate": 1.0491502638952675e-05, + "logits/chosen": -1.7883388996124268, + "logits/rejected": -1.7719625234603882, + "logps/chosen": -180.1321258544922, + "logps/rejected": -275.4041442871094, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1425415575504303, + "rewards/margins": 10.312170028686523, + "rewards/rejected": -10.169628143310547, + "step": 1628 + }, + { + "epoch": 2.13, + "learning_rate": 1.0462338346741086e-05, + "logits/chosen": -1.9197849035263062, + "logits/rejected": -1.8951356410980225, + "logps/chosen": -201.99610900878906, + "logps/rejected": -292.5700988769531, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6948319673538208, + "rewards/margins": 10.412498474121094, + "rewards/rejected": -9.717665672302246, + "step": 1629 + }, + { + "epoch": 2.13, + "learning_rate": 1.0433203916133252e-05, + "logits/chosen": -1.49874746799469, + "logits/rejected": -1.470816969871521, + "logps/chosen": -171.98373413085938, + "logps/rejected": -251.63925170898438, + "loss": 0.0873, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.011922568082809448, + "rewards/margins": 8.181827545166016, + "rewards/rejected": -8.169904708862305, + "step": 1630 + }, + { + "epoch": 2.13, + "learning_rate": 1.0404099406973803e-05, + "logits/chosen": -1.7821449041366577, + "logits/rejected": -1.8011589050292969, + "logps/chosen": -169.43299865722656, + "logps/rejected": -288.68353271484375, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5487989783287048, + "rewards/margins": 11.3939208984375, + "rewards/rejected": -10.845123291015625, + "step": 1631 + }, + { + "epoch": 2.14, + "learning_rate": 1.0375024879045889e-05, + "logits/chosen": -1.8686878681182861, + "logits/rejected": -1.92938232421875, + "logps/chosen": -161.27911376953125, + "logps/rejected": -272.1463928222656, + "loss": 0.0436, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.24484024941921234, + "rewards/margins": 10.078117370605469, + "rewards/rejected": -9.833276748657227, + "step": 1632 + }, + { + "epoch": 2.14, + "learning_rate": 1.0345980392071073e-05, + "logits/chosen": -1.7522367238998413, + "logits/rejected": -1.7652907371520996, + "logps/chosen": -180.3960418701172, + "logps/rejected": -337.2406921386719, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4226654767990112, + "rewards/margins": 14.044825553894043, + "rewards/rejected": -12.622160911560059, + "step": 1633 + }, + { + "epoch": 2.14, + "learning_rate": 1.031696600570923e-05, + "logits/chosen": -1.9498507976531982, + "logits/rejected": -1.9650359153747559, + "logps/chosen": -177.22329711914062, + "logps/rejected": -325.6481628417969, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3029983043670654, + "rewards/margins": 14.440342903137207, + "rewards/rejected": -13.137345314025879, + "step": 1634 + }, + { + "epoch": 2.14, + "learning_rate": 1.0287981779558411e-05, + "logits/chosen": -1.7514562606811523, + "logits/rejected": -1.7056151628494263, + "logps/chosen": -200.11724853515625, + "logps/rejected": -277.3222351074219, + "loss": 0.1308, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3846664428710938, + "rewards/margins": 8.497995376586914, + "rewards/rejected": -9.882662773132324, + "step": 1635 + }, + { + "epoch": 2.14, + "learning_rate": 1.0259027773154681e-05, + "logits/chosen": -1.7622318267822266, + "logits/rejected": -1.7579509019851685, + "logps/chosen": -159.00167846679688, + "logps/rejected": -244.04287719726562, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0079766511917114, + "rewards/margins": 9.59066390991211, + "rewards/rejected": -8.582687377929688, + "step": 1636 + }, + { + "epoch": 2.14, + "learning_rate": 1.023010404597206e-05, + "logits/chosen": -1.8814631700515747, + "logits/rejected": -1.9322527647018433, + "logps/chosen": -183.9650421142578, + "logps/rejected": -273.3780517578125, + "loss": 0.0873, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.1322219967842102, + "rewards/margins": 9.583847999572754, + "rewards/rejected": -9.45162582397461, + "step": 1637 + }, + { + "epoch": 2.14, + "learning_rate": 1.0201210657422386e-05, + "logits/chosen": -1.9555423259735107, + "logits/rejected": -2.01981258392334, + "logps/chosen": -172.41693115234375, + "logps/rejected": -288.9444580078125, + "loss": 0.0439, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6541494131088257, + "rewards/margins": 10.550215721130371, + "rewards/rejected": -11.204366683959961, + "step": 1638 + }, + { + "epoch": 2.14, + "learning_rate": 1.0172347666855117e-05, + "logits/chosen": -1.9514449834823608, + "logits/rejected": -1.9467823505401611, + "logps/chosen": -162.5089569091797, + "logps/rejected": -282.4434814453125, + "loss": 0.0872, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.23492062091827393, + "rewards/margins": 11.506002426147461, + "rewards/rejected": -11.271082878112793, + "step": 1639 + }, + { + "epoch": 2.15, + "learning_rate": 1.0143515133557333e-05, + "logits/chosen": -1.878548502922058, + "logits/rejected": -1.9108283519744873, + "logps/chosen": -173.166748046875, + "logps/rejected": -297.6573791503906, + "loss": 0.0445, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.5305010080337524, + "rewards/margins": 12.044855117797852, + "rewards/rejected": -11.51435375213623, + "step": 1640 + }, + { + "epoch": 2.15, + "learning_rate": 1.0114713116753533e-05, + "logits/chosen": -1.5864242315292358, + "logits/rejected": -1.6092569828033447, + "logps/chosen": -159.9029998779297, + "logps/rejected": -268.3252868652344, + "loss": 0.0437, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.4869553744792938, + "rewards/margins": 10.347358703613281, + "rewards/rejected": -9.860403060913086, + "step": 1641 + }, + { + "epoch": 2.15, + "learning_rate": 1.0085941675605517e-05, + "logits/chosen": -1.6476333141326904, + "logits/rejected": -1.6244561672210693, + "logps/chosen": -220.84066772460938, + "logps/rejected": -316.5938720703125, + "loss": 0.0448, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14580899477005005, + "rewards/margins": 10.031061172485352, + "rewards/rejected": -10.17686939239502, + "step": 1642 + }, + { + "epoch": 2.15, + "learning_rate": 1.0057200869212308e-05, + "logits/chosen": -1.857738733291626, + "logits/rejected": -1.8484587669372559, + "logps/chosen": -172.53736877441406, + "logps/rejected": -274.78131103515625, + "loss": 0.0444, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3535841107368469, + "rewards/margins": 11.54519271850586, + "rewards/rejected": -11.191608428955078, + "step": 1643 + }, + { + "epoch": 2.15, + "learning_rate": 1.0028490756609971e-05, + "logits/chosen": -1.7425236701965332, + "logits/rejected": -1.7153847217559814, + "logps/chosen": -196.6745147705078, + "logps/rejected": -310.6172790527344, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.48441994190216064, + "rewards/margins": 9.840300559997559, + "rewards/rejected": -10.324722290039062, + "step": 1644 + }, + { + "epoch": 2.15, + "learning_rate": 9.999811396771554e-06, + "logits/chosen": -1.859032392501831, + "logits/rejected": -1.810266375541687, + "logps/chosen": -158.87892150878906, + "logps/rejected": -230.06251525878906, + "loss": 0.174, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7558112740516663, + "rewards/margins": 7.201855182647705, + "rewards/rejected": -7.957665920257568, + "step": 1645 + }, + { + "epoch": 2.15, + "learning_rate": 9.971162848606907e-06, + "logits/chosen": -1.7121883630752563, + "logits/rejected": -1.7422066926956177, + "logps/chosen": -200.49237060546875, + "logps/rejected": -311.9027099609375, + "loss": 0.0874, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.027033932507038116, + "rewards/margins": 9.665481567382812, + "rewards/rejected": -9.692516326904297, + "step": 1646 + }, + { + "epoch": 2.16, + "learning_rate": 9.942545170962611e-06, + "logits/chosen": -1.9748835563659668, + "logits/rejected": -2.0193512439727783, + "logps/chosen": -153.66432189941406, + "logps/rejected": -247.5177459716797, + "loss": 0.0446, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.23768003284931183, + "rewards/margins": 8.148002624511719, + "rewards/rejected": -8.385683059692383, + "step": 1647 + }, + { + "epoch": 2.16, + "learning_rate": 9.913958422621845e-06, + "logits/chosen": -1.963097333908081, + "logits/rejected": -1.918212652206421, + "logps/chosen": -189.08810424804688, + "logps/rejected": -272.96563720703125, + "loss": 0.0868, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.928251326084137, + "rewards/margins": 10.042634963989258, + "rewards/rejected": -10.97088623046875, + "step": 1648 + }, + { + "epoch": 2.16, + "learning_rate": 9.885402662304222e-06, + "logits/chosen": -1.8511769771575928, + "logits/rejected": -1.9369834661483765, + "logps/chosen": -161.8604736328125, + "logps/rejected": -245.03482055664062, + "loss": 0.0441, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.44078436493873596, + "rewards/margins": 9.245979309082031, + "rewards/rejected": -8.805194854736328, + "step": 1649 + }, + { + "epoch": 2.16, + "learning_rate": 9.856877948665724e-06, + "logits/chosen": -1.903306484222412, + "logits/rejected": -1.9450178146362305, + "logps/chosen": -185.69610595703125, + "logps/rejected": -272.4430236816406, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6749640107154846, + "rewards/margins": 9.954863548278809, + "rewards/rejected": -9.279899597167969, + "step": 1650 + }, + { + "epoch": 2.16, + "learning_rate": 9.828384340298572e-06, + "logits/chosen": -1.9700182676315308, + "logits/rejected": -1.9676039218902588, + "logps/chosen": -212.72140502929688, + "logps/rejected": -308.92926025390625, + "loss": 0.045, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.30744415521621704, + "rewards/margins": 8.809019088745117, + "rewards/rejected": -9.116462707519531, + "step": 1651 + }, + { + "epoch": 2.16, + "learning_rate": 9.799921895731062e-06, + "logits/chosen": -1.8452091217041016, + "logits/rejected": -1.8436799049377441, + "logps/chosen": -165.15667724609375, + "logps/rejected": -277.76092529296875, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2959306836128235, + "rewards/margins": 10.06021499633789, + "rewards/rejected": -9.764284133911133, + "step": 1652 + }, + { + "epoch": 2.16, + "learning_rate": 9.771490673427508e-06, + "logits/chosen": -1.667567253112793, + "logits/rejected": -1.6532378196716309, + "logps/chosen": -188.00926208496094, + "logps/rejected": -300.84136962890625, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3765113949775696, + "rewards/margins": 9.473361015319824, + "rewards/rejected": -9.849871635437012, + "step": 1653 + }, + { + "epoch": 2.16, + "learning_rate": 9.743090731788088e-06, + "logits/chosen": -1.9611507654190063, + "logits/rejected": -1.9925371408462524, + "logps/chosen": -151.85243225097656, + "logps/rejected": -248.31158447265625, + "loss": 0.0876, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.31397300958633423, + "rewards/margins": 8.316049575805664, + "rewards/rejected": -8.630022048950195, + "step": 1654 + }, + { + "epoch": 2.17, + "learning_rate": 9.714722129148705e-06, + "logits/chosen": -1.5252611637115479, + "logits/rejected": -1.457181453704834, + "logps/chosen": -159.80308532714844, + "logps/rejected": -304.29669189453125, + "loss": 0.1312, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0435137748718262, + "rewards/margins": 9.943965911865234, + "rewards/rejected": -10.987480163574219, + "step": 1655 + }, + { + "epoch": 2.17, + "learning_rate": 9.686384923780894e-06, + "logits/chosen": -1.7786623239517212, + "logits/rejected": -1.73944091796875, + "logps/chosen": -205.1579132080078, + "logps/rejected": -323.33599853515625, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21578331291675568, + "rewards/margins": 14.134706497192383, + "rewards/rejected": -13.918923377990723, + "step": 1656 + }, + { + "epoch": 2.17, + "learning_rate": 9.658079173891718e-06, + "logits/chosen": -1.7368509769439697, + "logits/rejected": -1.695062518119812, + "logps/chosen": -173.6504669189453, + "logps/rejected": -273.2975769042969, + "loss": 0.0439, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8823786973953247, + "rewards/margins": 9.59858512878418, + "rewards/rejected": -10.480963706970215, + "step": 1657 + }, + { + "epoch": 2.17, + "learning_rate": 9.62980493762362e-06, + "logits/chosen": -1.6737737655639648, + "logits/rejected": -1.6783125400543213, + "logps/chosen": -172.31503295898438, + "logps/rejected": -254.68673706054688, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16359907388687134, + "rewards/margins": 9.567679405212402, + "rewards/rejected": -9.731277465820312, + "step": 1658 + }, + { + "epoch": 2.17, + "learning_rate": 9.60156227305429e-06, + "logits/chosen": -1.7691925764083862, + "logits/rejected": -1.895477294921875, + "logps/chosen": -198.03857421875, + "logps/rejected": -314.5537414550781, + "loss": 0.0872, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3405799865722656, + "rewards/margins": 7.990486145019531, + "rewards/rejected": -9.331066131591797, + "step": 1659 + }, + { + "epoch": 2.17, + "learning_rate": 9.573351238196598e-06, + "logits/chosen": -1.8263225555419922, + "logits/rejected": -1.896690845489502, + "logps/chosen": -169.5816192626953, + "logps/rejected": -315.9022521972656, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16010499000549316, + "rewards/margins": 12.193278312683105, + "rewards/rejected": -12.35338306427002, + "step": 1660 + }, + { + "epoch": 2.17, + "learning_rate": 9.545171890998415e-06, + "logits/chosen": -1.8200551271438599, + "logits/rejected": -1.754014492034912, + "logps/chosen": -178.17071533203125, + "logps/rejected": -268.7278747558594, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09589181840419769, + "rewards/margins": 10.046781539916992, + "rewards/rejected": -10.14267349243164, + "step": 1661 + }, + { + "epoch": 2.18, + "learning_rate": 9.51702428934255e-06, + "logits/chosen": -1.8058146238327026, + "logits/rejected": -1.900023102760315, + "logps/chosen": -162.40560913085938, + "logps/rejected": -273.80963134765625, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5310177206993103, + "rewards/margins": 11.774530410766602, + "rewards/rejected": -11.243513107299805, + "step": 1662 + }, + { + "epoch": 2.18, + "learning_rate": 9.488908491046575e-06, + "logits/chosen": -1.815151333808899, + "logits/rejected": -1.829606294631958, + "logps/chosen": -170.41360473632812, + "logps/rejected": -260.3449401855469, + "loss": 0.0435, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.8860507011413574, + "rewards/margins": 9.887506484985352, + "rewards/rejected": -9.001455307006836, + "step": 1663 + }, + { + "epoch": 2.18, + "learning_rate": 9.460824553862762e-06, + "logits/chosen": -1.6950938701629639, + "logits/rejected": -1.8226205110549927, + "logps/chosen": -217.45001220703125, + "logps/rejected": -298.23956298828125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.49591341614723206, + "rewards/margins": 11.643468856811523, + "rewards/rejected": -11.147554397583008, + "step": 1664 + }, + { + "epoch": 2.18, + "learning_rate": 9.432772535477941e-06, + "logits/chosen": -1.9122917652130127, + "logits/rejected": -1.888707160949707, + "logps/chosen": -161.55978393554688, + "logps/rejected": -242.57106018066406, + "loss": 0.0877, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.017862819135189056, + "rewards/margins": 8.404541015625, + "rewards/rejected": -8.386678695678711, + "step": 1665 + }, + { + "epoch": 2.18, + "learning_rate": 9.40475249351333e-06, + "logits/chosen": -1.5493508577346802, + "logits/rejected": -1.4833691120147705, + "logps/chosen": -216.32479858398438, + "logps/rejected": -297.0689392089844, + "loss": 0.1738, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6247285008430481, + "rewards/margins": 8.476842880249023, + "rewards/rejected": -9.101572036743164, + "step": 1666 + }, + { + "epoch": 2.18, + "learning_rate": 9.376764485524515e-06, + "logits/chosen": -1.4133046865463257, + "logits/rejected": -1.3796892166137695, + "logps/chosen": -157.8040008544922, + "logps/rejected": -263.4658203125, + "loss": 0.0875, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4148402214050293, + "rewards/margins": 8.737409591674805, + "rewards/rejected": -8.322569847106934, + "step": 1667 + }, + { + "epoch": 2.18, + "learning_rate": 9.348808569001272e-06, + "logits/chosen": -1.8207170963287354, + "logits/rejected": -1.8168244361877441, + "logps/chosen": -174.46641540527344, + "logps/rejected": -268.09661865234375, + "loss": 0.0445, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4276840686798096, + "rewards/margins": 8.959912300109863, + "rewards/rejected": -10.387595176696777, + "step": 1668 + }, + { + "epoch": 2.18, + "learning_rate": 9.320884801367435e-06, + "logits/chosen": -1.7735166549682617, + "logits/rejected": -1.8737962245941162, + "logps/chosen": -179.9202423095703, + "logps/rejected": -303.98883056640625, + "loss": 0.0438, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4187115430831909, + "rewards/margins": 11.284400939941406, + "rewards/rejected": -10.86568832397461, + "step": 1669 + }, + { + "epoch": 2.19, + "learning_rate": 9.292993239980827e-06, + "logits/chosen": -1.82965087890625, + "logits/rejected": -1.769222378730774, + "logps/chosen": -175.60897827148438, + "logps/rejected": -294.7171325683594, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.214296817779541, + "rewards/margins": 12.029009819030762, + "rewards/rejected": -10.814712524414062, + "step": 1670 + }, + { + "epoch": 2.19, + "learning_rate": 9.265133942133115e-06, + "logits/chosen": -1.496509075164795, + "logits/rejected": -1.521586537361145, + "logps/chosen": -161.1988983154297, + "logps/rejected": -298.87799072265625, + "loss": 0.0438, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.1889244019985199, + "rewards/margins": 10.518080711364746, + "rewards/rejected": -10.70700454711914, + "step": 1671 + }, + { + "epoch": 2.19, + "learning_rate": 9.237306965049677e-06, + "logits/chosen": -1.8703449964523315, + "logits/rejected": -1.9033639430999756, + "logps/chosen": -146.85728454589844, + "logps/rejected": -249.86672973632812, + "loss": 0.0488, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8050918579101562, + "rewards/margins": 10.219182014465332, + "rewards/rejected": -9.414091110229492, + "step": 1672 + }, + { + "epoch": 2.19, + "learning_rate": 9.2095123658895e-06, + "logits/chosen": -1.6411526203155518, + "logits/rejected": -1.7091474533081055, + "logps/chosen": -201.58026123046875, + "logps/rejected": -301.3131408691406, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6549285054206848, + "rewards/margins": 9.490497589111328, + "rewards/rejected": -10.145425796508789, + "step": 1673 + }, + { + "epoch": 2.19, + "learning_rate": 9.181750201745087e-06, + "logits/chosen": -1.9388796091079712, + "logits/rejected": -1.8985590934753418, + "logps/chosen": -193.84022521972656, + "logps/rejected": -309.42718505859375, + "loss": 0.0451, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6327271461486816, + "rewards/margins": 11.08639144897461, + "rewards/rejected": -11.71911907196045, + "step": 1674 + }, + { + "epoch": 2.19, + "learning_rate": 9.15402052964231e-06, + "logits/chosen": -1.6782294511795044, + "logits/rejected": -1.6884478330612183, + "logps/chosen": -173.98757934570312, + "logps/rejected": -294.1504211425781, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3069442808628082, + "rewards/margins": 12.193976402282715, + "rewards/rejected": -11.887031555175781, + "step": 1675 + }, + { + "epoch": 2.19, + "learning_rate": 9.126323406540282e-06, + "logits/chosen": -1.8137794733047485, + "logits/rejected": -1.780582308769226, + "logps/chosen": -176.708251953125, + "logps/rejected": -283.90447998046875, + "loss": 0.0468, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.030196547508239746, + "rewards/margins": 10.235297203063965, + "rewards/rejected": -10.205100059509277, + "step": 1676 + }, + { + "epoch": 2.19, + "learning_rate": 9.098658889331265e-06, + "logits/chosen": -1.856440544128418, + "logits/rejected": -1.8947744369506836, + "logps/chosen": -160.95370483398438, + "logps/rejected": -240.45040893554688, + "loss": 0.1749, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.008556969463825226, + "rewards/margins": 7.522327423095703, + "rewards/rejected": -7.513769626617432, + "step": 1677 + }, + { + "epoch": 2.2, + "learning_rate": 9.07102703484056e-06, + "logits/chosen": -1.7309825420379639, + "logits/rejected": -1.690861463546753, + "logps/chosen": -201.16575622558594, + "logps/rejected": -290.79205322265625, + "loss": 0.0871, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2205784022808075, + "rewards/margins": 9.602574348449707, + "rewards/rejected": -9.823152542114258, + "step": 1678 + }, + { + "epoch": 2.2, + "learning_rate": 9.043427899826367e-06, + "logits/chosen": -1.895434856414795, + "logits/rejected": -1.8420089483261108, + "logps/chosen": -197.84713745117188, + "logps/rejected": -307.4045715332031, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1810436248779297, + "rewards/margins": 10.660295486450195, + "rewards/rejected": -11.841340065002441, + "step": 1679 + }, + { + "epoch": 2.2, + "learning_rate": 9.015861540979667e-06, + "logits/chosen": -1.9898107051849365, + "logits/rejected": -1.9642372131347656, + "logps/chosen": -239.70726013183594, + "logps/rejected": -334.6875305175781, + "loss": 0.1307, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0683082342147827, + "rewards/margins": 8.380480766296387, + "rewards/rejected": -9.4487886428833, + "step": 1680 + }, + { + "epoch": 2.2, + "learning_rate": 8.988328014924136e-06, + "logits/chosen": -1.664607048034668, + "logits/rejected": -1.6641795635223389, + "logps/chosen": -191.84580993652344, + "logps/rejected": -270.3985595703125, + "loss": 0.0617, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.5724802017211914, + "rewards/margins": 10.088390350341797, + "rewards/rejected": -9.515910148620605, + "step": 1681 + }, + { + "epoch": 2.2, + "learning_rate": 8.960827378215994e-06, + "logits/chosen": -1.7210701704025269, + "logits/rejected": -1.7156659364700317, + "logps/chosen": -149.3922119140625, + "logps/rejected": -271.9853515625, + "loss": 0.0439, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.6715322136878967, + "rewards/margins": 11.536502838134766, + "rewards/rejected": -10.864970207214355, + "step": 1682 + }, + { + "epoch": 2.2, + "learning_rate": 8.933359687343895e-06, + "logits/chosen": -1.5690048933029175, + "logits/rejected": -1.6352813243865967, + "logps/chosen": -196.51898193359375, + "logps/rejected": -304.961181640625, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8091062307357788, + "rewards/margins": 9.671791076660156, + "rewards/rejected": -10.48089599609375, + "step": 1683 + }, + { + "epoch": 2.2, + "learning_rate": 8.90592499872884e-06, + "logits/chosen": -1.979506254196167, + "logits/rejected": -2.003925323486328, + "logps/chosen": -182.73460388183594, + "logps/rejected": -268.9897155761719, + "loss": 0.1314, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7393634915351868, + "rewards/margins": 7.028038501739502, + "rewards/rejected": -7.767402172088623, + "step": 1684 + }, + { + "epoch": 2.21, + "learning_rate": 8.878523368724046e-06, + "logits/chosen": -2.036076068878174, + "logits/rejected": -2.095663547515869, + "logps/chosen": -162.7556610107422, + "logps/rejected": -268.5704345703125, + "loss": 0.1303, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2916828691959381, + "rewards/margins": 10.175681114196777, + "rewards/rejected": -9.883997917175293, + "step": 1685 + }, + { + "epoch": 2.21, + "learning_rate": 8.851154853614788e-06, + "logits/chosen": -1.6768990755081177, + "logits/rejected": -1.490555763244629, + "logps/chosen": -155.71044921875, + "logps/rejected": -264.0386047363281, + "loss": 0.1312, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.37653812766075134, + "rewards/margins": 7.281563758850098, + "rewards/rejected": -7.6581010818481445, + "step": 1686 + }, + { + "epoch": 2.21, + "learning_rate": 8.823819509618364e-06, + "logits/chosen": -1.6825820207595825, + "logits/rejected": -1.7071927785873413, + "logps/chosen": -152.19607543945312, + "logps/rejected": -269.927734375, + "loss": 0.0876, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6712758541107178, + "rewards/margins": 9.459580421447754, + "rewards/rejected": -10.13085651397705, + "step": 1687 + }, + { + "epoch": 2.21, + "learning_rate": 8.796517392883894e-06, + "logits/chosen": -1.8068063259124756, + "logits/rejected": -1.694048285484314, + "logps/chosen": -169.33920288085938, + "logps/rejected": -254.17953491210938, + "loss": 0.0442, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5995453000068665, + "rewards/margins": 9.813188552856445, + "rewards/rejected": -10.412734031677246, + "step": 1688 + }, + { + "epoch": 2.21, + "learning_rate": 8.769248559492286e-06, + "logits/chosen": -1.8590977191925049, + "logits/rejected": -1.924188256263733, + "logps/chosen": -171.3964080810547, + "logps/rejected": -298.5698547363281, + "loss": 0.0436, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006125234067440033, + "rewards/margins": 10.698637962341309, + "rewards/rejected": -10.704763412475586, + "step": 1689 + }, + { + "epoch": 2.21, + "learning_rate": 8.742013065456047e-06, + "logits/chosen": -1.9317048788070679, + "logits/rejected": -1.922799825668335, + "logps/chosen": -174.6070556640625, + "logps/rejected": -268.29376220703125, + "loss": 0.0869, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.514312207698822, + "rewards/margins": 10.548632621765137, + "rewards/rejected": -10.034319877624512, + "step": 1690 + }, + { + "epoch": 2.21, + "learning_rate": 8.714810966719225e-06, + "logits/chosen": -1.6925863027572632, + "logits/rejected": -1.7239949703216553, + "logps/chosen": -211.32351684570312, + "logps/rejected": -308.89752197265625, + "loss": 0.0442, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5241937637329102, + "rewards/margins": 10.372793197631836, + "rewards/rejected": -10.896986961364746, + "step": 1691 + }, + { + "epoch": 2.21, + "learning_rate": 8.687642319157279e-06, + "logits/chosen": -1.6300621032714844, + "logits/rejected": -1.6681506633758545, + "logps/chosen": -192.51858520507812, + "logps/rejected": -305.91741943359375, + "loss": 0.0447, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.20705091953277588, + "rewards/margins": 12.050918579101562, + "rewards/rejected": -12.25796890258789, + "step": 1692 + }, + { + "epoch": 2.22, + "learning_rate": 8.660507178576907e-06, + "logits/chosen": -1.5620609521865845, + "logits/rejected": -1.5364537239074707, + "logps/chosen": -197.2242889404297, + "logps/rejected": -260.6908264160156, + "loss": 0.0472, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1111658811569214, + "rewards/margins": 8.088720321655273, + "rewards/rejected": -9.199886322021484, + "step": 1693 + }, + { + "epoch": 2.22, + "learning_rate": 8.633405600716035e-06, + "logits/chosen": -1.7832496166229248, + "logits/rejected": -1.813164234161377, + "logps/chosen": -194.8793487548828, + "logps/rejected": -279.18377685546875, + "loss": 0.0463, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.29322922229766846, + "rewards/margins": 9.493721008300781, + "rewards/rejected": -9.786949157714844, + "step": 1694 + }, + { + "epoch": 2.22, + "learning_rate": 8.606337641243634e-06, + "logits/chosen": -1.7315824031829834, + "logits/rejected": -1.763061285018921, + "logps/chosen": -188.6962432861328, + "logps/rejected": -276.6380615234375, + "loss": 0.0444, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.13001757860183716, + "rewards/margins": 10.258501052856445, + "rewards/rejected": -10.388519287109375, + "step": 1695 + }, + { + "epoch": 2.22, + "learning_rate": 8.579303355759597e-06, + "logits/chosen": -1.6586931943893433, + "logits/rejected": -1.6834213733673096, + "logps/chosen": -128.07907104492188, + "logps/rejected": -248.05532836914062, + "loss": 0.0442, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.5915146470069885, + "rewards/margins": 10.520118713378906, + "rewards/rejected": -9.928605079650879, + "step": 1696 + }, + { + "epoch": 2.22, + "learning_rate": 8.552302799794675e-06, + "logits/chosen": -1.7995065450668335, + "logits/rejected": -1.8001186847686768, + "logps/chosen": -179.8526153564453, + "logps/rejected": -281.18304443359375, + "loss": 0.0453, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08163979649543762, + "rewards/margins": 11.314369201660156, + "rewards/rejected": -11.39600944519043, + "step": 1697 + }, + { + "epoch": 2.22, + "learning_rate": 8.525336028810333e-06, + "logits/chosen": -1.7883713245391846, + "logits/rejected": -1.8603564500808716, + "logps/chosen": -294.2748718261719, + "logps/rejected": -383.8592224121094, + "loss": 0.0883, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3609399795532227, + "rewards/margins": 8.238929748535156, + "rewards/rejected": -9.599870681762695, + "step": 1698 + }, + { + "epoch": 2.22, + "learning_rate": 8.498403098198621e-06, + "logits/chosen": -1.576483964920044, + "logits/rejected": -1.5710241794586182, + "logps/chosen": -169.9357147216797, + "logps/rejected": -252.6559295654297, + "loss": 0.0874, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2965281009674072, + "rewards/margins": 7.970451354980469, + "rewards/rejected": -8.266979217529297, + "step": 1699 + }, + { + "epoch": 2.22, + "learning_rate": 8.471504063282082e-06, + "logits/chosen": -1.9117447137832642, + "logits/rejected": -1.9516785144805908, + "logps/chosen": -151.51559448242188, + "logps/rejected": -229.78457641601562, + "loss": 0.133, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.21838048100471497, + "rewards/margins": 8.731021881103516, + "rewards/rejected": -8.512640953063965, + "step": 1700 + }, + { + "epoch": 2.23, + "learning_rate": 8.444638979313647e-06, + "logits/chosen": -1.5704425573349, + "logits/rejected": -1.6318943500518799, + "logps/chosen": -179.54750061035156, + "logps/rejected": -320.55023193359375, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2907328605651855, + "rewards/margins": 9.840455055236816, + "rewards/rejected": -11.131187438964844, + "step": 1701 + }, + { + "epoch": 2.23, + "learning_rate": 8.417807901476513e-06, + "logits/chosen": -1.6483116149902344, + "logits/rejected": -1.619083046913147, + "logps/chosen": -158.54934692382812, + "logps/rejected": -280.7205810546875, + "loss": 0.0887, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3883090317249298, + "rewards/margins": 8.795272827148438, + "rewards/rejected": -9.183581352233887, + "step": 1702 + }, + { + "epoch": 2.23, + "learning_rate": 8.391010884884008e-06, + "logits/chosen": -1.9159705638885498, + "logits/rejected": -1.9450147151947021, + "logps/chosen": -169.84194946289062, + "logps/rejected": -249.08335876464844, + "loss": 0.0438, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5200881361961365, + "rewards/margins": 8.488304138183594, + "rewards/rejected": -9.008392333984375, + "step": 1703 + }, + { + "epoch": 2.23, + "learning_rate": 8.364247984579487e-06, + "logits/chosen": -1.8001549243927002, + "logits/rejected": -1.7809131145477295, + "logps/chosen": -158.47268676757812, + "logps/rejected": -263.05780029296875, + "loss": 0.0873, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.189047709107399, + "rewards/margins": 10.581604957580566, + "rewards/rejected": -10.392557144165039, + "step": 1704 + }, + { + "epoch": 2.23, + "learning_rate": 8.337519255536259e-06, + "logits/chosen": -1.6612846851348877, + "logits/rejected": -1.6448615789413452, + "logps/chosen": -189.70001220703125, + "logps/rejected": -309.1474914550781, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3494005799293518, + "rewards/margins": 11.393888473510742, + "rewards/rejected": -11.743289947509766, + "step": 1705 + }, + { + "epoch": 2.23, + "learning_rate": 8.310824752657426e-06, + "logits/chosen": -1.7485321760177612, + "logits/rejected": -1.744619607925415, + "logps/chosen": -169.79127502441406, + "logps/rejected": -262.014404296875, + "loss": 0.0452, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.163727879524231, + "rewards/margins": 8.498297691345215, + "rewards/rejected": -9.662026405334473, + "step": 1706 + }, + { + "epoch": 2.23, + "learning_rate": 8.284164530775776e-06, + "logits/chosen": -1.9259155988693237, + "logits/rejected": -1.9073561429977417, + "logps/chosen": -183.88482666015625, + "logps/rejected": -301.5204772949219, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8612270951271057, + "rewards/margins": 10.315326690673828, + "rewards/rejected": -11.176551818847656, + "step": 1707 + }, + { + "epoch": 2.24, + "learning_rate": 8.257538644653695e-06, + "logits/chosen": -1.7669718265533447, + "logits/rejected": -1.7617294788360596, + "logps/chosen": -180.52041625976562, + "logps/rejected": -282.12646484375, + "loss": 0.0478, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.29249075055122375, + "rewards/margins": 8.902650833129883, + "rewards/rejected": -9.195141792297363, + "step": 1708 + }, + { + "epoch": 2.24, + "learning_rate": 8.230947148983056e-06, + "logits/chosen": -1.6603291034698486, + "logits/rejected": -1.626970648765564, + "logps/chosen": -207.06224060058594, + "logps/rejected": -284.30712890625, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6507579684257507, + "rewards/margins": 9.356623649597168, + "rewards/rejected": -10.007380485534668, + "step": 1709 + }, + { + "epoch": 2.24, + "learning_rate": 8.20439009838504e-06, + "logits/chosen": -1.9861862659454346, + "logits/rejected": -1.9869616031646729, + "logps/chosen": -164.0230255126953, + "logps/rejected": -270.929443359375, + "loss": 0.0449, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7103303670883179, + "rewards/margins": 9.149612426757812, + "rewards/rejected": -9.859943389892578, + "step": 1710 + }, + { + "epoch": 2.24, + "learning_rate": 8.177867547410117e-06, + "logits/chosen": -1.9062087535858154, + "logits/rejected": -1.925144910812378, + "logps/chosen": -184.590087890625, + "logps/rejected": -311.4590759277344, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11384262144565582, + "rewards/margins": 12.008268356323242, + "rewards/rejected": -12.122112274169922, + "step": 1711 + }, + { + "epoch": 2.24, + "learning_rate": 8.151379550537894e-06, + "logits/chosen": -1.6894547939300537, + "logits/rejected": -1.6387817859649658, + "logps/chosen": -177.83206176757812, + "logps/rejected": -307.7388610839844, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10131040215492249, + "rewards/margins": 11.217378616333008, + "rewards/rejected": -11.31868839263916, + "step": 1712 + }, + { + "epoch": 2.24, + "learning_rate": 8.124926162176972e-06, + "logits/chosen": -1.9544663429260254, + "logits/rejected": -1.9023258686065674, + "logps/chosen": -184.77833557128906, + "logps/rejected": -285.4997253417969, + "loss": 0.0455, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4059910178184509, + "rewards/margins": 9.919851303100586, + "rewards/rejected": -10.32584285736084, + "step": 1713 + }, + { + "epoch": 2.24, + "learning_rate": 8.09850743666489e-06, + "logits/chosen": -1.814618468284607, + "logits/rejected": -1.8521473407745361, + "logps/chosen": -170.41326904296875, + "logps/rejected": -269.4269714355469, + "loss": 0.0437, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.1938045620918274, + "rewards/margins": 10.348040580749512, + "rewards/rejected": -10.541845321655273, + "step": 1714 + }, + { + "epoch": 2.24, + "learning_rate": 8.072123428267966e-06, + "logits/chosen": -1.841110110282898, + "logits/rejected": -1.8072327375411987, + "logps/chosen": -184.53738403320312, + "logps/rejected": -290.59417724609375, + "loss": 0.1307, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.4841134548187256, + "rewards/margins": 9.09646987915039, + "rewards/rejected": -9.580583572387695, + "step": 1715 + }, + { + "epoch": 2.25, + "learning_rate": 8.045774191181229e-06, + "logits/chosen": -1.8143200874328613, + "logits/rejected": -1.8037315607070923, + "logps/chosen": -171.4906005859375, + "logps/rejected": -260.1253356933594, + "loss": 0.0436, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.40983882546424866, + "rewards/margins": 10.020313262939453, + "rewards/rejected": -9.6104736328125, + "step": 1716 + }, + { + "epoch": 2.25, + "learning_rate": 8.01945977952826e-06, + "logits/chosen": -1.8991285562515259, + "logits/rejected": -1.8985545635223389, + "logps/chosen": -179.48533630371094, + "logps/rejected": -282.4693908691406, + "loss": 0.0434, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.035931214690208435, + "rewards/margins": 11.239422798156738, + "rewards/rejected": -11.2034912109375, + "step": 1717 + }, + { + "epoch": 2.25, + "learning_rate": 7.993180247361117e-06, + "logits/chosen": -1.9856749773025513, + "logits/rejected": -2.0255978107452393, + "logps/chosen": -169.13197326660156, + "logps/rejected": -288.31292724609375, + "loss": 0.0439, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2914577126502991, + "rewards/margins": 10.44228458404541, + "rewards/rejected": -10.733741760253906, + "step": 1718 + }, + { + "epoch": 2.25, + "learning_rate": 7.966935648660229e-06, + "logits/chosen": -1.8952863216400146, + "logits/rejected": -1.936596393585205, + "logps/chosen": -152.17034912109375, + "logps/rejected": -211.1765899658203, + "loss": 0.1324, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.30124765634536743, + "rewards/margins": 6.913272857666016, + "rewards/rejected": -7.214520454406738, + "step": 1719 + }, + { + "epoch": 2.25, + "learning_rate": 7.940726037334237e-06, + "logits/chosen": -1.4483975172042847, + "logits/rejected": -1.5490933656692505, + "logps/chosen": -190.86947631835938, + "logps/rejected": -308.2823486328125, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.748171329498291, + "rewards/margins": 11.27152156829834, + "rewards/rejected": -10.52334976196289, + "step": 1720 + }, + { + "epoch": 2.25, + "learning_rate": 7.914551467219928e-06, + "logits/chosen": -1.8453223705291748, + "logits/rejected": -1.8781405687332153, + "logps/chosen": -160.87864685058594, + "logps/rejected": -267.6905822753906, + "loss": 0.0444, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.37843769788742065, + "rewards/margins": 9.748764991760254, + "rewards/rejected": -10.127202033996582, + "step": 1721 + }, + { + "epoch": 2.25, + "learning_rate": 7.88841199208212e-06, + "logits/chosen": -1.6350255012512207, + "logits/rejected": -1.6655409336090088, + "logps/chosen": -216.103271484375, + "logps/rejected": -321.94915771484375, + "loss": 0.1302, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.36814403533935547, + "rewards/margins": 11.262405395507812, + "rewards/rejected": -11.630549430847168, + "step": 1722 + }, + { + "epoch": 2.25, + "learning_rate": 7.862307665613543e-06, + "logits/chosen": -1.7220410108566284, + "logits/rejected": -1.7594200372695923, + "logps/chosen": -169.74510192871094, + "logps/rejected": -287.31219482421875, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15666982531547546, + "rewards/margins": 11.298591613769531, + "rewards/rejected": -11.141921997070312, + "step": 1723 + }, + { + "epoch": 2.26, + "learning_rate": 7.836238541434709e-06, + "logits/chosen": -1.8581788539886475, + "logits/rejected": -1.8930964469909668, + "logps/chosen": -190.91131591796875, + "logps/rejected": -287.133544921875, + "loss": 0.0871, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.06962475180625916, + "rewards/margins": 9.650343894958496, + "rewards/rejected": -9.580718994140625, + "step": 1724 + }, + { + "epoch": 2.26, + "learning_rate": 7.810204673093848e-06, + "logits/chosen": -1.6504888534545898, + "logits/rejected": -1.650030493736267, + "logps/chosen": -183.9004364013672, + "logps/rejected": -294.3666076660156, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2765714228153229, + "rewards/margins": 11.313508033752441, + "rewards/rejected": -11.590078353881836, + "step": 1725 + }, + { + "epoch": 2.26, + "learning_rate": 7.784206114066753e-06, + "logits/chosen": -1.9586927890777588, + "logits/rejected": -2.0147244930267334, + "logps/chosen": -176.6748504638672, + "logps/rejected": -254.2784881591797, + "loss": 0.0893, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2919812202453613, + "rewards/margins": 7.025051593780518, + "rewards/rejected": -8.317032814025879, + "step": 1726 + }, + { + "epoch": 2.26, + "learning_rate": 7.758242917756683e-06, + "logits/chosen": -1.538002848625183, + "logits/rejected": -1.594235897064209, + "logps/chosen": -165.9473419189453, + "logps/rejected": -299.7716979980469, + "loss": 0.044, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.5748831629753113, + "rewards/margins": 12.214863777160645, + "rewards/rejected": -11.63998031616211, + "step": 1727 + }, + { + "epoch": 2.26, + "learning_rate": 7.732315137494277e-06, + "logits/chosen": -1.3138413429260254, + "logits/rejected": -1.344238042831421, + "logps/chosen": -168.0712127685547, + "logps/rejected": -266.783447265625, + "loss": 0.087, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.37343519926071167, + "rewards/margins": 10.039738655090332, + "rewards/rejected": -10.41317367553711, + "step": 1728 + }, + { + "epoch": 2.26, + "learning_rate": 7.706422826537435e-06, + "logits/chosen": -1.853121280670166, + "logits/rejected": -1.8434929847717285, + "logps/chosen": -172.1444854736328, + "logps/rejected": -275.5237121582031, + "loss": 0.087, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.7644771933555603, + "rewards/margins": 10.53807544708252, + "rewards/rejected": -9.773597717285156, + "step": 1729 + }, + { + "epoch": 2.26, + "learning_rate": 7.680566038071157e-06, + "logits/chosen": -1.9105231761932373, + "logits/rejected": -1.8776659965515137, + "logps/chosen": -175.51573181152344, + "logps/rejected": -271.3371276855469, + "loss": 0.0188, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3626537322998047, + "rewards/margins": 9.205049514770508, + "rewards/rejected": -9.567703247070312, + "step": 1730 + }, + { + "epoch": 2.27, + "learning_rate": 7.654744825207527e-06, + "logits/chosen": -1.8213955163955688, + "logits/rejected": -1.9022295475006104, + "logps/chosen": -174.43328857421875, + "logps/rejected": -287.72198486328125, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3019050359725952, + "rewards/margins": 10.817704200744629, + "rewards/rejected": -10.515799522399902, + "step": 1731 + }, + { + "epoch": 2.27, + "learning_rate": 7.628959240985514e-06, + "logits/chosen": -1.7960586547851562, + "logits/rejected": -1.7198560237884521, + "logps/chosen": -156.78213500976562, + "logps/rejected": -270.96875, + "loss": 0.0439, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.0103964805603027, + "rewards/margins": 11.763899803161621, + "rewards/rejected": -10.753503799438477, + "step": 1732 + }, + { + "epoch": 2.27, + "learning_rate": 7.6032093383709345e-06, + "logits/chosen": -2.052406072616577, + "logits/rejected": -1.983301043510437, + "logps/chosen": -318.0826110839844, + "logps/rejected": -400.65264892578125, + "loss": 0.1307, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.044349193572998, + "rewards/margins": 8.950064659118652, + "rewards/rejected": -9.994413375854492, + "step": 1733 + }, + { + "epoch": 2.27, + "learning_rate": 7.57749517025628e-06, + "logits/chosen": -1.8126393556594849, + "logits/rejected": -1.8063534498214722, + "logps/chosen": -180.15164184570312, + "logps/rejected": -288.88739013671875, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1004091203212738, + "rewards/margins": 11.325434684753418, + "rewards/rejected": -11.425844192504883, + "step": 1734 + }, + { + "epoch": 2.27, + "learning_rate": 7.551816789460664e-06, + "logits/chosen": -1.5149426460266113, + "logits/rejected": -1.5536918640136719, + "logps/chosen": -190.0808868408203, + "logps/rejected": -299.5590515136719, + "loss": 0.0439, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4202817678451538, + "rewards/margins": 10.151581764221191, + "rewards/rejected": -10.571863174438477, + "step": 1735 + }, + { + "epoch": 2.27, + "learning_rate": 7.5261742487297e-06, + "logits/chosen": -1.7174972295761108, + "logits/rejected": -1.680656909942627, + "logps/chosen": -155.5364990234375, + "logps/rejected": -272.58392333984375, + "loss": 0.0888, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6095947027206421, + "rewards/margins": 11.343057632446289, + "rewards/rejected": -10.733463287353516, + "step": 1736 + }, + { + "epoch": 2.27, + "learning_rate": 7.5005676007353364e-06, + "logits/chosen": -1.7456557750701904, + "logits/rejected": -1.8724675178527832, + "logps/chosen": -194.61590576171875, + "logps/rejected": -293.4609375, + "loss": 0.0462, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.06910091638565063, + "rewards/margins": 10.825394630432129, + "rewards/rejected": -10.894495010375977, + "step": 1737 + }, + { + "epoch": 2.27, + "learning_rate": 7.4749968980758365e-06, + "logits/chosen": -2.0038504600524902, + "logits/rejected": -2.0212323665618896, + "logps/chosen": -177.0474853515625, + "logps/rejected": -269.4918212890625, + "loss": 0.0439, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.31083160638809204, + "rewards/margins": 8.82983684539795, + "rewards/rejected": -9.140668869018555, + "step": 1738 + }, + { + "epoch": 2.28, + "learning_rate": 7.449462193275628e-06, + "logits/chosen": -1.8676152229309082, + "logits/rejected": -1.8716466426849365, + "logps/chosen": -169.9856414794922, + "logps/rejected": -287.6925354003906, + "loss": 0.0872, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.22346314787864685, + "rewards/margins": 10.747478485107422, + "rewards/rejected": -10.524015426635742, + "step": 1739 + }, + { + "epoch": 2.28, + "learning_rate": 7.4239635387851615e-06, + "logits/chosen": -1.406266689300537, + "logits/rejected": -1.3973854780197144, + "logps/chosen": -179.2802276611328, + "logps/rejected": -284.8497314453125, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7682864665985107, + "rewards/margins": 9.718920707702637, + "rewards/rejected": -10.48720645904541, + "step": 1740 + }, + { + "epoch": 2.28, + "learning_rate": 7.398500986980877e-06, + "logits/chosen": -1.6960973739624023, + "logits/rejected": -1.7026457786560059, + "logps/chosen": -199.8760528564453, + "logps/rejected": -306.4073791503906, + "loss": 0.0138, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0006264448165894, + "rewards/margins": 10.567926406860352, + "rewards/rejected": -11.568553924560547, + "step": 1741 + }, + { + "epoch": 2.28, + "learning_rate": 7.373074590165041e-06, + "logits/chosen": -1.9691028594970703, + "logits/rejected": -1.9850671291351318, + "logps/chosen": -139.7640380859375, + "logps/rejected": -236.82205200195312, + "loss": 0.0873, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.37834060192108154, + "rewards/margins": 8.221779823303223, + "rewards/rejected": -8.600120544433594, + "step": 1742 + }, + { + "epoch": 2.28, + "learning_rate": 7.347684400565646e-06, + "logits/chosen": -1.7108796834945679, + "logits/rejected": -1.749725580215454, + "logps/chosen": -166.09190368652344, + "logps/rejected": -258.6752624511719, + "loss": 0.0875, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1374603658914566, + "rewards/margins": 10.045243263244629, + "rewards/rejected": -9.907782554626465, + "step": 1743 + }, + { + "epoch": 2.28, + "learning_rate": 7.3223304703363135e-06, + "logits/chosen": -1.612709879875183, + "logits/rejected": -1.64168381690979, + "logps/chosen": -163.29652404785156, + "logps/rejected": -299.8216857910156, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7383025288581848, + "rewards/margins": 12.689697265625, + "rewards/rejected": -11.951395988464355, + "step": 1744 + }, + { + "epoch": 2.28, + "learning_rate": 7.297012851556198e-06, + "logits/chosen": -1.7771739959716797, + "logits/rejected": -1.7433230876922607, + "logps/chosen": -180.24359130859375, + "logps/rejected": -289.2408447265625, + "loss": 0.0445, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9659570455551147, + "rewards/margins": 9.667607307434082, + "rewards/rejected": -10.633563995361328, + "step": 1745 + }, + { + "epoch": 2.28, + "learning_rate": 7.271731596229864e-06, + "logits/chosen": -1.8907089233398438, + "logits/rejected": -1.9317028522491455, + "logps/chosen": -168.59190368652344, + "logps/rejected": -298.4954528808594, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8127346038818359, + "rewards/margins": 12.784120559692383, + "rewards/rejected": -11.971385955810547, + "step": 1746 + }, + { + "epoch": 2.29, + "learning_rate": 7.2464867562871745e-06, + "logits/chosen": -1.9037020206451416, + "logits/rejected": -1.9482851028442383, + "logps/chosen": -160.204345703125, + "logps/rejected": -283.2275390625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10229030251502991, + "rewards/margins": 11.46241569519043, + "rewards/rejected": -11.360127449035645, + "step": 1747 + }, + { + "epoch": 2.29, + "learning_rate": 7.221278383583185e-06, + "logits/chosen": -1.8530339002609253, + "logits/rejected": -1.8360775709152222, + "logps/chosen": -198.25192260742188, + "logps/rejected": -291.5886535644531, + "loss": 0.048, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.22122418880462646, + "rewards/margins": 10.326275825500488, + "rewards/rejected": -10.105051040649414, + "step": 1748 + }, + { + "epoch": 2.29, + "learning_rate": 7.1961065298980666e-06, + "logits/chosen": -1.830185890197754, + "logits/rejected": -1.856569766998291, + "logps/chosen": -165.61268615722656, + "logps/rejected": -301.9888916015625, + "loss": 0.045, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0014628022909164429, + "rewards/margins": 12.039970397949219, + "rewards/rejected": -12.038508415222168, + "step": 1749 + }, + { + "epoch": 2.29, + "learning_rate": 7.170971246936966e-06, + "logits/chosen": -1.938109040260315, + "logits/rejected": -1.9661192893981934, + "logps/chosen": -171.55438232421875, + "logps/rejected": -270.74993896484375, + "loss": 0.0454, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6923284530639648, + "rewards/margins": 8.934579849243164, + "rewards/rejected": -9.626909255981445, + "step": 1750 + }, + { + "epoch": 2.29, + "learning_rate": 7.145872586329902e-06, + "logits/chosen": -1.8289794921875, + "logits/rejected": -1.75788152217865, + "logps/chosen": -158.6400909423828, + "logps/rejected": -263.5159912109375, + "loss": 0.0441, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.32655128836631775, + "rewards/margins": 10.302413940429688, + "rewards/rejected": -9.975861549377441, + "step": 1751 + }, + { + "epoch": 2.29, + "learning_rate": 7.12081059963168e-06, + "logits/chosen": -2.0309486389160156, + "logits/rejected": -2.0366148948669434, + "logps/chosen": -223.0327911376953, + "logps/rejected": -289.87469482421875, + "loss": 0.0876, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.23090195655822754, + "rewards/margins": 9.548128128051758, + "rewards/rejected": -9.779030799865723, + "step": 1752 + }, + { + "epoch": 2.29, + "learning_rate": 7.095785338321787e-06, + "logits/chosen": -1.715881586074829, + "logits/rejected": -1.6714414358139038, + "logps/chosen": -147.38153076171875, + "logps/rejected": -292.9781494140625, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1249943971633911, + "rewards/margins": 13.216689109802246, + "rewards/rejected": -12.091693878173828, + "step": 1753 + }, + { + "epoch": 2.3, + "learning_rate": 7.070796853804221e-06, + "logits/chosen": -1.9819705486297607, + "logits/rejected": -1.9642049074172974, + "logps/chosen": -164.6637420654297, + "logps/rejected": -239.84664916992188, + "loss": 0.0445, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3134628236293793, + "rewards/margins": 8.493982315063477, + "rewards/rejected": -8.180519104003906, + "step": 1754 + }, + { + "epoch": 2.3, + "learning_rate": 7.045845197407494e-06, + "logits/chosen": -1.6679937839508057, + "logits/rejected": -1.694068193435669, + "logps/chosen": -181.0894317626953, + "logps/rejected": -291.09149169921875, + "loss": 0.081, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.41158822178840637, + "rewards/margins": 10.341178894042969, + "rewards/rejected": -9.929591178894043, + "step": 1755 + }, + { + "epoch": 2.3, + "learning_rate": 7.02093042038445e-06, + "logits/chosen": -1.947412371635437, + "logits/rejected": -1.9508118629455566, + "logps/chosen": -170.98974609375, + "logps/rejected": -263.0202941894531, + "loss": 0.0876, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16154395043849945, + "rewards/margins": 9.356968879699707, + "rewards/rejected": -9.518511772155762, + "step": 1756 + }, + { + "epoch": 2.3, + "learning_rate": 6.996052573912163e-06, + "logits/chosen": -1.5817075967788696, + "logits/rejected": -1.6646158695220947, + "logps/chosen": -150.36471557617188, + "logps/rejected": -249.65423583984375, + "loss": 0.0874, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08709275722503662, + "rewards/margins": 9.293158531188965, + "rewards/rejected": -9.38025188446045, + "step": 1757 + }, + { + "epoch": 2.3, + "learning_rate": 6.971211709091882e-06, + "logits/chosen": -1.9653867483139038, + "logits/rejected": -2.008445978164673, + "logps/chosen": -173.4401092529297, + "logps/rejected": -289.6496887207031, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5124838352203369, + "rewards/margins": 10.733196258544922, + "rewards/rejected": -11.245681762695312, + "step": 1758 + }, + { + "epoch": 2.3, + "learning_rate": 6.946407876948854e-06, + "logits/chosen": -1.4314374923706055, + "logits/rejected": -1.4279768466949463, + "logps/chosen": -203.32525634765625, + "logps/rejected": -288.1222839355469, + "loss": 0.0869, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9329442977905273, + "rewards/margins": 7.944190979003906, + "rewards/rejected": -9.877135276794434, + "step": 1759 + }, + { + "epoch": 2.3, + "learning_rate": 6.921641128432299e-06, + "logits/chosen": -1.569838285446167, + "logits/rejected": -1.6693848371505737, + "logps/chosen": -175.25723266601562, + "logps/rejected": -274.6288146972656, + "loss": 0.0577, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4557570219039917, + "rewards/margins": 8.802705764770508, + "rewards/rejected": -10.258461952209473, + "step": 1760 + }, + { + "epoch": 2.3, + "learning_rate": 6.896911514415219e-06, + "logits/chosen": -1.6506481170654297, + "logits/rejected": -1.6806344985961914, + "logps/chosen": -164.460693359375, + "logps/rejected": -293.4676818847656, + "loss": 0.045, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14706763625144958, + "rewards/margins": 10.831473350524902, + "rewards/rejected": -10.684406280517578, + "step": 1761 + }, + { + "epoch": 2.31, + "learning_rate": 6.872219085694376e-06, + "logits/chosen": -1.6621962785720825, + "logits/rejected": -1.6537809371948242, + "logps/chosen": -177.6445770263672, + "logps/rejected": -285.210205078125, + "loss": 0.0437, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.056619852781295776, + "rewards/margins": 10.052887916564941, + "rewards/rejected": -9.996268272399902, + "step": 1762 + }, + { + "epoch": 2.31, + "learning_rate": 6.8475638929901385e-06, + "logits/chosen": -2.0308778285980225, + "logits/rejected": -2.1186184883117676, + "logps/chosen": -206.31399536132812, + "logps/rejected": -318.4495544433594, + "loss": 0.0441, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19176353514194489, + "rewards/margins": 9.868120193481445, + "rewards/rejected": -10.059885025024414, + "step": 1763 + }, + { + "epoch": 2.31, + "learning_rate": 6.822945986946386e-06, + "logits/chosen": -1.4879238605499268, + "logits/rejected": -1.4405834674835205, + "logps/chosen": -194.39315795898438, + "logps/rejected": -300.735595703125, + "loss": 0.0888, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2869211435317993, + "rewards/margins": 9.08183479309082, + "rewards/rejected": -10.368757247924805, + "step": 1764 + }, + { + "epoch": 2.31, + "learning_rate": 6.798365418130395e-06, + "logits/chosen": -1.810922622680664, + "logits/rejected": -1.8760426044464111, + "logps/chosen": -227.18251037597656, + "logps/rejected": -324.980712890625, + "loss": 0.0914, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.13253527879714966, + "rewards/margins": 8.583011627197266, + "rewards/rejected": -8.450477600097656, + "step": 1765 + }, + { + "epoch": 2.31, + "learning_rate": 6.773822237032779e-06, + "logits/chosen": -1.796043872833252, + "logits/rejected": -1.7689787149429321, + "logps/chosen": -162.033203125, + "logps/rejected": -275.82513427734375, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3353985548019409, + "rewards/margins": 10.962149620056152, + "rewards/rejected": -11.297548294067383, + "step": 1766 + }, + { + "epoch": 2.31, + "learning_rate": 6.74931649406732e-06, + "logits/chosen": -1.7102222442626953, + "logits/rejected": -1.731156587600708, + "logps/chosen": -165.66786193847656, + "logps/rejected": -275.9735107421875, + "loss": 0.0442, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5054381489753723, + "rewards/margins": 10.0288724899292, + "rewards/rejected": -10.534309387207031, + "step": 1767 + }, + { + "epoch": 2.31, + "learning_rate": 6.724848239570927e-06, + "logits/chosen": -1.750337839126587, + "logits/rejected": -1.8709826469421387, + "logps/chosen": -179.8159637451172, + "logps/rejected": -302.61749267578125, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9734774827957153, + "rewards/margins": 10.258872985839844, + "rewards/rejected": -11.23235034942627, + "step": 1768 + }, + { + "epoch": 2.32, + "learning_rate": 6.700417523803498e-06, + "logits/chosen": -1.8112870454788208, + "logits/rejected": -1.7693458795547485, + "logps/chosen": -172.14793395996094, + "logps/rejected": -279.39923095703125, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.034608423709869385, + "rewards/margins": 11.188340187072754, + "rewards/rejected": -11.222949028015137, + "step": 1769 + }, + { + "epoch": 2.32, + "learning_rate": 6.6760243969478105e-06, + "logits/chosen": -1.7485147714614868, + "logits/rejected": -1.824428677558899, + "logps/chosen": -163.40614318847656, + "logps/rejected": -269.3749694824219, + "loss": 0.0441, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.05545555055141449, + "rewards/margins": 9.918785095214844, + "rewards/rejected": -9.863329887390137, + "step": 1770 + }, + { + "epoch": 2.32, + "learning_rate": 6.651668909109435e-06, + "logits/chosen": -1.6673730611801147, + "logits/rejected": -1.707067608833313, + "logps/chosen": -137.98208618164062, + "logps/rejected": -277.6700439453125, + "loss": 0.0867, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5911627411842346, + "rewards/margins": 11.366202354431152, + "rewards/rejected": -10.775039672851562, + "step": 1771 + }, + { + "epoch": 2.32, + "learning_rate": 6.627351110316635e-06, + "logits/chosen": -1.7997257709503174, + "logits/rejected": -1.8512601852416992, + "logps/chosen": -173.059326171875, + "logps/rejected": -305.6718444824219, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7866885662078857, + "rewards/margins": 11.9276704788208, + "rewards/rejected": -11.140981674194336, + "step": 1772 + }, + { + "epoch": 2.32, + "learning_rate": 6.603071050520262e-06, + "logits/chosen": -1.8208539485931396, + "logits/rejected": -1.8070167303085327, + "logps/chosen": -177.668212890625, + "logps/rejected": -295.25311279296875, + "loss": 0.0468, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.26464977860450745, + "rewards/margins": 10.599936485290527, + "rewards/rejected": -10.86458683013916, + "step": 1773 + }, + { + "epoch": 2.32, + "learning_rate": 6.578828779593632e-06, + "logits/chosen": -1.793238639831543, + "logits/rejected": -1.8560458421707153, + "logps/chosen": -157.04820251464844, + "logps/rejected": -239.97195434570312, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1773844063282013, + "rewards/margins": 8.815276145935059, + "rewards/rejected": -8.637890815734863, + "step": 1774 + }, + { + "epoch": 2.32, + "learning_rate": 6.554624347332458e-06, + "logits/chosen": -1.8851892948150635, + "logits/rejected": -1.8741503953933716, + "logps/chosen": -153.40682983398438, + "logps/rejected": -267.282470703125, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0238525867462158, + "rewards/margins": 11.993562698364258, + "rewards/rejected": -10.969711303710938, + "step": 1775 + }, + { + "epoch": 2.32, + "learning_rate": 6.530457803454707e-06, + "logits/chosen": -1.6542538404464722, + "logits/rejected": -1.7552071809768677, + "logps/chosen": -170.07151794433594, + "logps/rejected": -285.5491943359375, + "loss": 0.0641, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8318610191345215, + "rewards/margins": 10.191630363464355, + "rewards/rejected": -11.023491859436035, + "step": 1776 + }, + { + "epoch": 2.33, + "learning_rate": 6.5063291976005445e-06, + "logits/chosen": -1.8388575315475464, + "logits/rejected": -1.8397880792617798, + "logps/chosen": -188.78768920898438, + "logps/rejected": -298.4885559082031, + "loss": 0.1308, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0867139101028442, + "rewards/margins": 8.545710563659668, + "rewards/rejected": -9.632424354553223, + "step": 1777 + }, + { + "epoch": 2.33, + "learning_rate": 6.482238579332184e-06, + "logits/chosen": -1.6654510498046875, + "logits/rejected": -1.6751917600631714, + "logps/chosen": -191.33050537109375, + "logps/rejected": -329.8052978515625, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9118808507919312, + "rewards/margins": 11.250426292419434, + "rewards/rejected": -12.162307739257812, + "step": 1778 + }, + { + "epoch": 2.33, + "learning_rate": 6.458185998133828e-06, + "logits/chosen": -1.5010361671447754, + "logits/rejected": -1.580959677696228, + "logps/chosen": -146.5097198486328, + "logps/rejected": -271.41400146484375, + "loss": 0.045, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.505151629447937, + "rewards/margins": 11.024741172790527, + "rewards/rejected": -10.5195894241333, + "step": 1779 + }, + { + "epoch": 2.33, + "learning_rate": 6.434171503411557e-06, + "logits/chosen": -1.6735241413116455, + "logits/rejected": -1.6906976699829102, + "logps/chosen": -182.5349884033203, + "logps/rejected": -304.000244140625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7863311171531677, + "rewards/margins": 12.555675506591797, + "rewards/rejected": -11.769344329833984, + "step": 1780 + }, + { + "epoch": 2.33, + "learning_rate": 6.4101951444931725e-06, + "logits/chosen": -1.8501273393630981, + "logits/rejected": -1.7672300338745117, + "logps/chosen": -166.9024658203125, + "logps/rejected": -256.5000305175781, + "loss": 0.0447, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.19849181175231934, + "rewards/margins": 9.342573165893555, + "rewards/rejected": -9.541065216064453, + "step": 1781 + }, + { + "epoch": 2.33, + "learning_rate": 6.386256970628185e-06, + "logits/chosen": -1.821285605430603, + "logits/rejected": -1.850578784942627, + "logps/chosen": -148.71214294433594, + "logps/rejected": -262.0516052246094, + "loss": 0.0867, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.18901565670967102, + "rewards/margins": 9.697868347167969, + "rewards/rejected": -9.508852005004883, + "step": 1782 + }, + { + "epoch": 2.33, + "learning_rate": 6.362357030987667e-06, + "logits/chosen": -1.7374156713485718, + "logits/rejected": -1.7040479183197021, + "logps/chosen": -187.08480834960938, + "logps/rejected": -327.8758239746094, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2558389902114868, + "rewards/margins": 10.793437004089355, + "rewards/rejected": -10.53759765625, + "step": 1783 + }, + { + "epoch": 2.33, + "learning_rate": 6.338495374664127e-06, + "logits/chosen": -1.9482240676879883, + "logits/rejected": -1.916212797164917, + "logps/chosen": -238.54067993164062, + "logps/rejected": -339.4722595214844, + "loss": 0.0434, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5566293001174927, + "rewards/margins": 12.390148162841797, + "rewards/rejected": -11.833518981933594, + "step": 1784 + }, + { + "epoch": 2.34, + "learning_rate": 6.314672050671461e-06, + "logits/chosen": -1.9244554042816162, + "logits/rejected": -1.9063340425491333, + "logps/chosen": -221.03720092773438, + "logps/rejected": -272.60052490234375, + "loss": 0.174, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1492745280265808, + "rewards/margins": 7.161561012268066, + "rewards/rejected": -7.012287139892578, + "step": 1785 + }, + { + "epoch": 2.34, + "learning_rate": 6.290887107944826e-06, + "logits/chosen": -1.9409064054489136, + "logits/rejected": -1.8746559619903564, + "logps/chosen": -209.3438262939453, + "logps/rejected": -318.3373718261719, + "loss": 0.0437, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.22665250301361084, + "rewards/margins": 9.859477043151855, + "rewards/rejected": -10.086130142211914, + "step": 1786 + }, + { + "epoch": 2.34, + "learning_rate": 6.267140595340529e-06, + "logits/chosen": -1.8786401748657227, + "logits/rejected": -1.8052152395248413, + "logps/chosen": -213.80767822265625, + "logps/rejected": -300.69677734375, + "loss": 0.0443, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.776313841342926, + "rewards/margins": 10.222015380859375, + "rewards/rejected": -9.44570255279541, + "step": 1787 + }, + { + "epoch": 2.34, + "learning_rate": 6.243432561635934e-06, + "logits/chosen": -1.9693737030029297, + "logits/rejected": -2.0261220932006836, + "logps/chosen": -203.7155303955078, + "logps/rejected": -307.5109558105469, + "loss": 0.0437, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6053099036216736, + "rewards/margins": 10.26247501373291, + "rewards/rejected": -10.867785453796387, + "step": 1788 + }, + { + "epoch": 2.34, + "learning_rate": 6.219763055529384e-06, + "logits/chosen": -1.7718794345855713, + "logits/rejected": -1.8051306009292603, + "logps/chosen": -200.9105224609375, + "logps/rejected": -301.87518310546875, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0174882411956787, + "rewards/margins": 10.628706932067871, + "rewards/rejected": -11.646194458007812, + "step": 1789 + }, + { + "epoch": 2.34, + "learning_rate": 6.1961321256400836e-06, + "logits/chosen": -1.9645118713378906, + "logits/rejected": -1.957564353942871, + "logps/chosen": -191.6086883544922, + "logps/rejected": -295.28485107421875, + "loss": 0.087, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7043985724449158, + "rewards/margins": 9.528505325317383, + "rewards/rejected": -10.232903480529785, + "step": 1790 + }, + { + "epoch": 2.34, + "learning_rate": 6.172539820507977e-06, + "logits/chosen": -1.9271502494812012, + "logits/rejected": -2.000107526779175, + "logps/chosen": -174.35333251953125, + "logps/rejected": -283.39227294921875, + "loss": 0.0436, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2140548825263977, + "rewards/margins": 9.546053886413574, + "rewards/rejected": -9.760109901428223, + "step": 1791 + }, + { + "epoch": 2.35, + "learning_rate": 6.1489861885936805e-06, + "logits/chosen": -1.7759851217269897, + "logits/rejected": -1.8125998973846436, + "logps/chosen": -187.45242309570312, + "logps/rejected": -269.9373474121094, + "loss": 0.0912, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8011727929115295, + "rewards/margins": 9.386351585388184, + "rewards/rejected": -10.187522888183594, + "step": 1792 + }, + { + "epoch": 2.35, + "learning_rate": 6.125471278278378e-06, + "logits/chosen": -1.2684663534164429, + "logits/rejected": -1.3982093334197998, + "logps/chosen": -168.6224822998047, + "logps/rejected": -290.4124755859375, + "loss": 0.0436, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0519822835922241, + "rewards/margins": 9.972909927368164, + "rewards/rejected": -11.024892807006836, + "step": 1793 + }, + { + "epoch": 2.35, + "learning_rate": 6.101995137863717e-06, + "logits/chosen": -1.5991662740707397, + "logits/rejected": -1.6004818677902222, + "logps/chosen": -160.52716064453125, + "logps/rejected": -296.5534973144531, + "loss": 0.0871, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.052079677581787, + "rewards/margins": 11.519514083862305, + "rewards/rejected": -10.467434883117676, + "step": 1794 + }, + { + "epoch": 2.35, + "learning_rate": 6.078557815571692e-06, + "logits/chosen": -1.8737196922302246, + "logits/rejected": -1.8845328092575073, + "logps/chosen": -167.57736206054688, + "logps/rejected": -268.4541931152344, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3138190507888794, + "rewards/margins": 9.577204704284668, + "rewards/rejected": -9.891023635864258, + "step": 1795 + }, + { + "epoch": 2.35, + "learning_rate": 6.055159359544579e-06, + "logits/chosen": -1.8007721900939941, + "logits/rejected": -1.7881731986999512, + "logps/chosen": -166.81748962402344, + "logps/rejected": -283.91729736328125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4316680431365967, + "rewards/margins": 12.010297775268555, + "rewards/rejected": -12.441967010498047, + "step": 1796 + }, + { + "epoch": 2.35, + "learning_rate": 6.03179981784483e-06, + "logits/chosen": -1.9042326211929321, + "logits/rejected": -1.9271005392074585, + "logps/chosen": -202.57493591308594, + "logps/rejected": -273.7978210449219, + "loss": 0.0474, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4292362928390503, + "rewards/margins": 8.316028594970703, + "rewards/rejected": -9.745264053344727, + "step": 1797 + }, + { + "epoch": 2.35, + "learning_rate": 6.008479238454915e-06, + "logits/chosen": -1.90767240524292, + "logits/rejected": -1.8962470293045044, + "logps/chosen": -151.21585083007812, + "logps/rejected": -269.2382507324219, + "loss": 0.0437, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1595345735549927, + "rewards/margins": 10.691314697265625, + "rewards/rejected": -9.531778335571289, + "step": 1798 + }, + { + "epoch": 2.35, + "learning_rate": 5.98519766927732e-06, + "logits/chosen": -1.6973731517791748, + "logits/rejected": -1.6941744089126587, + "logps/chosen": -152.48489379882812, + "logps/rejected": -237.02682495117188, + "loss": 0.0874, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4318411946296692, + "rewards/margins": 9.360139846801758, + "rewards/rejected": -8.928298950195312, + "step": 1799 + }, + { + "epoch": 2.36, + "learning_rate": 5.961955158134391e-06, + "logits/chosen": -1.8882299661636353, + "logits/rejected": -1.9187428951263428, + "logps/chosen": -160.3352508544922, + "logps/rejected": -276.63671875, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10231944918632507, + "rewards/margins": 10.880594253540039, + "rewards/rejected": -10.778274536132812, + "step": 1800 + }, + { + "epoch": 2.36, + "learning_rate": 5.938751752768226e-06, + "logits/chosen": -1.6967055797576904, + "logits/rejected": -1.7408099174499512, + "logps/chosen": -176.86697387695312, + "logps/rejected": -332.26141357421875, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.040652744472026825, + "rewards/margins": 10.334501266479492, + "rewards/rejected": -10.375153541564941, + "step": 1801 + }, + { + "epoch": 2.36, + "learning_rate": 5.915587500840625e-06, + "logits/chosen": -1.966801404953003, + "logits/rejected": -1.943291187286377, + "logps/chosen": -214.0811767578125, + "logps/rejected": -315.0403747558594, + "loss": 0.1301, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05812062323093414, + "rewards/margins": 10.02153491973877, + "rewards/rejected": -10.079654693603516, + "step": 1802 + }, + { + "epoch": 2.36, + "learning_rate": 5.892462449932928e-06, + "logits/chosen": -1.7515013217926025, + "logits/rejected": -1.7488012313842773, + "logps/chosen": -175.84429931640625, + "logps/rejected": -273.9769592285156, + "loss": 0.0868, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.02575960010290146, + "rewards/margins": 10.522844314575195, + "rewards/rejected": -10.497084617614746, + "step": 1803 + }, + { + "epoch": 2.36, + "learning_rate": 5.869376647545993e-06, + "logits/chosen": -1.669764518737793, + "logits/rejected": -1.7677987813949585, + "logps/chosen": -212.88726806640625, + "logps/rejected": -306.52215576171875, + "loss": 0.0451, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10393333435058594, + "rewards/margins": 11.629434585571289, + "rewards/rejected": -11.525501251220703, + "step": 1804 + }, + { + "epoch": 2.36, + "learning_rate": 5.84633014110002e-06, + "logits/chosen": -1.7090595960617065, + "logits/rejected": -1.6194384098052979, + "logps/chosen": -187.34771728515625, + "logps/rejected": -276.8177490234375, + "loss": 0.0437, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6972207427024841, + "rewards/margins": 10.579232215881348, + "rewards/rejected": -11.27645206451416, + "step": 1805 + }, + { + "epoch": 2.36, + "learning_rate": 5.82332297793452e-06, + "logits/chosen": -1.89621102809906, + "logits/rejected": -1.9055019617080688, + "logps/chosen": -198.2242889404297, + "logps/rejected": -272.9102783203125, + "loss": 0.1331, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3955163359642029, + "rewards/margins": 9.155433654785156, + "rewards/rejected": -8.759916305541992, + "step": 1806 + }, + { + "epoch": 2.36, + "learning_rate": 5.800355205308183e-06, + "logits/chosen": -1.768875002861023, + "logits/rejected": -1.7794102430343628, + "logps/chosen": -204.25833129882812, + "logps/rejected": -347.1092529296875, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37276333570480347, + "rewards/margins": 12.828039169311523, + "rewards/rejected": -13.20080280303955, + "step": 1807 + }, + { + "epoch": 2.37, + "learning_rate": 5.777426870398777e-06, + "logits/chosen": -1.412650465965271, + "logits/rejected": -1.413249135017395, + "logps/chosen": -217.42286682128906, + "logps/rejected": -329.94866943359375, + "loss": 0.0434, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6643086075782776, + "rewards/margins": 10.350072860717773, + "rewards/rejected": -11.014381408691406, + "step": 1808 + }, + { + "epoch": 2.37, + "learning_rate": 5.754538020303063e-06, + "logits/chosen": -1.427734375, + "logits/rejected": -1.362438440322876, + "logps/chosen": -182.58688354492188, + "logps/rejected": -280.49285888671875, + "loss": 0.0872, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06954428553581238, + "rewards/margins": 9.564674377441406, + "rewards/rejected": -9.49513053894043, + "step": 1809 + }, + { + "epoch": 2.37, + "learning_rate": 5.731688702036717e-06, + "logits/chosen": -1.926405668258667, + "logits/rejected": -1.953773021697998, + "logps/chosen": -166.82115173339844, + "logps/rejected": -251.54034423828125, + "loss": 0.1401, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4003523886203766, + "rewards/margins": 7.934447765350342, + "rewards/rejected": -8.334800720214844, + "step": 1810 + }, + { + "epoch": 2.37, + "learning_rate": 5.708878962534181e-06, + "logits/chosen": -1.914361834526062, + "logits/rejected": -1.921243667602539, + "logps/chosen": -180.48472595214844, + "logps/rejected": -288.3634033203125, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36496296525001526, + "rewards/margins": 10.066577911376953, + "rewards/rejected": -10.431539535522461, + "step": 1811 + }, + { + "epoch": 2.37, + "learning_rate": 5.686108848648624e-06, + "logits/chosen": -1.7519727945327759, + "logits/rejected": -1.7584078311920166, + "logps/chosen": -204.1603546142578, + "logps/rejected": -306.96392822265625, + "loss": 0.044, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.48017215728759766, + "rewards/margins": 9.87966537475586, + "rewards/rejected": -10.359838485717773, + "step": 1812 + }, + { + "epoch": 2.37, + "learning_rate": 5.6633784071518205e-06, + "logits/chosen": -1.9468023777008057, + "logits/rejected": -1.959162950515747, + "logps/chosen": -184.849853515625, + "logps/rejected": -271.1631164550781, + "loss": 0.0439, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.15084180235862732, + "rewards/margins": 9.28327751159668, + "rewards/rejected": -9.43411922454834, + "step": 1813 + }, + { + "epoch": 2.37, + "learning_rate": 5.640687684734039e-06, + "logits/chosen": -1.8862568140029907, + "logits/rejected": -1.8250839710235596, + "logps/chosen": -133.92697143554688, + "logps/rejected": -194.6947479248047, + "loss": 0.1342, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7051723003387451, + "rewards/margins": 6.402805328369141, + "rewards/rejected": -7.107977867126465, + "step": 1814 + }, + { + "epoch": 2.38, + "learning_rate": 5.618036728003965e-06, + "logits/chosen": -1.8589372634887695, + "logits/rejected": -1.8547106981277466, + "logps/chosen": -171.41448974609375, + "logps/rejected": -248.6511688232422, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.27654093503952026, + "rewards/margins": 9.099873542785645, + "rewards/rejected": -8.823331832885742, + "step": 1815 + }, + { + "epoch": 2.38, + "learning_rate": 5.595425583488608e-06, + "logits/chosen": -1.8651342391967773, + "logits/rejected": -1.9222290515899658, + "logps/chosen": -172.67733764648438, + "logps/rejected": -293.6805114746094, + "loss": 0.0444, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.46450647711753845, + "rewards/margins": 10.447179794311523, + "rewards/rejected": -10.911686897277832, + "step": 1816 + }, + { + "epoch": 2.38, + "learning_rate": 5.572854297633209e-06, + "logits/chosen": -1.7997162342071533, + "logits/rejected": -1.7741899490356445, + "logps/chosen": -217.53509521484375, + "logps/rejected": -290.6956787109375, + "loss": 0.087, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6117663383483887, + "rewards/margins": 8.956192016601562, + "rewards/rejected": -9.56795883178711, + "step": 1817 + }, + { + "epoch": 2.38, + "learning_rate": 5.550322916801115e-06, + "logits/chosen": -1.5089060068130493, + "logits/rejected": -1.3306849002838135, + "logps/chosen": -163.96722412109375, + "logps/rejected": -236.3388671875, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8385988473892212, + "rewards/margins": 10.415196418762207, + "rewards/rejected": -9.576597213745117, + "step": 1818 + }, + { + "epoch": 2.38, + "learning_rate": 5.5278314872737105e-06, + "logits/chosen": -1.9144213199615479, + "logits/rejected": -1.9533132314682007, + "logps/chosen": -167.16183471679688, + "logps/rejected": -289.74591064453125, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21969260275363922, + "rewards/margins": 10.645326614379883, + "rewards/rejected": -10.865019798278809, + "step": 1819 + }, + { + "epoch": 2.38, + "learning_rate": 5.505380055250325e-06, + "logits/chosen": -1.7307775020599365, + "logits/rejected": -1.7013916969299316, + "logps/chosen": -169.68658447265625, + "logps/rejected": -311.2918701171875, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4395444989204407, + "rewards/margins": 12.967764854431152, + "rewards/rejected": -12.528219223022461, + "step": 1820 + }, + { + "epoch": 2.38, + "learning_rate": 5.482968666848132e-06, + "logits/chosen": -1.7398004531860352, + "logits/rejected": -1.7385789155960083, + "logps/chosen": -159.52310180664062, + "logps/rejected": -265.59405517578125, + "loss": 0.0447, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4824480414390564, + "rewards/margins": 10.138984680175781, + "rewards/rejected": -10.621431350708008, + "step": 1821 + }, + { + "epoch": 2.38, + "learning_rate": 5.460597368102033e-06, + "logits/chosen": -1.6694751977920532, + "logits/rejected": -1.6540926694869995, + "logps/chosen": -180.81236267089844, + "logps/rejected": -295.2912292480469, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04424242675304413, + "rewards/margins": 11.217676162719727, + "rewards/rejected": -11.173434257507324, + "step": 1822 + }, + { + "epoch": 2.39, + "learning_rate": 5.4382662049646036e-06, + "logits/chosen": -2.056762456893921, + "logits/rejected": -2.1063790321350098, + "logps/chosen": -166.14781188964844, + "logps/rejected": -255.3070068359375, + "loss": 0.089, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.1774362027645111, + "rewards/margins": 8.638639450073242, + "rewards/rejected": -8.81607437133789, + "step": 1823 + }, + { + "epoch": 2.39, + "learning_rate": 5.4159752233059745e-06, + "logits/chosen": -1.7273327112197876, + "logits/rejected": -1.7593743801116943, + "logps/chosen": -181.8457794189453, + "logps/rejected": -295.46319580078125, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8130092620849609, + "rewards/margins": 10.171011924743652, + "rewards/rejected": -10.98402214050293, + "step": 1824 + }, + { + "epoch": 2.39, + "learning_rate": 5.393724468913713e-06, + "logits/chosen": -1.6965676546096802, + "logits/rejected": -1.689316987991333, + "logps/chosen": -205.56024169921875, + "logps/rejected": -303.7931213378906, + "loss": 0.0872, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.43353134393692017, + "rewards/margins": 10.221420288085938, + "rewards/rejected": -10.654953002929688, + "step": 1825 + }, + { + "epoch": 2.39, + "learning_rate": 5.371513987492788e-06, + "logits/chosen": -1.8162238597869873, + "logits/rejected": -1.830539584159851, + "logps/chosen": -155.27394104003906, + "logps/rejected": -252.91693115234375, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07742422074079514, + "rewards/margins": 9.324585914611816, + "rewards/rejected": -9.402010917663574, + "step": 1826 + }, + { + "epoch": 2.39, + "learning_rate": 5.34934382466544e-06, + "logits/chosen": -1.717362880706787, + "logits/rejected": -1.7740179300308228, + "logps/chosen": -178.6630096435547, + "logps/rejected": -276.423828125, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7705976963043213, + "rewards/margins": 10.497088432312012, + "rewards/rejected": -9.72649097442627, + "step": 1827 + }, + { + "epoch": 2.39, + "learning_rate": 5.32721402597107e-06, + "logits/chosen": -1.7956349849700928, + "logits/rejected": -1.825842022895813, + "logps/chosen": -164.54360961914062, + "logps/rejected": -300.4873046875, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6423429250717163, + "rewards/margins": 13.236227035522461, + "rewards/rejected": -11.593881607055664, + "step": 1828 + }, + { + "epoch": 2.39, + "learning_rate": 5.3051246368661965e-06, + "logits/chosen": -1.696475863456726, + "logits/rejected": -1.7419952154159546, + "logps/chosen": -157.79843139648438, + "logps/rejected": -273.8384094238281, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9692426919937134, + "rewards/margins": 11.525646209716797, + "rewards/rejected": -10.556403160095215, + "step": 1829 + }, + { + "epoch": 2.39, + "learning_rate": 5.283075702724305e-06, + "logits/chosen": -1.906248927116394, + "logits/rejected": -1.913427472114563, + "logps/chosen": -157.6904754638672, + "logps/rejected": -246.5287628173828, + "loss": 0.0871, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.39825528860092163, + "rewards/margins": 9.188461303710938, + "rewards/rejected": -9.586716651916504, + "step": 1830 + }, + { + "epoch": 2.4, + "learning_rate": 5.261067268835812e-06, + "logits/chosen": -1.7194949388504028, + "logits/rejected": -1.7523001432418823, + "logps/chosen": -171.20797729492188, + "logps/rejected": -299.2723693847656, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8622329235076904, + "rewards/margins": 12.980911254882812, + "rewards/rejected": -12.11867904663086, + "step": 1831 + }, + { + "epoch": 2.4, + "learning_rate": 5.239099380407916e-06, + "logits/chosen": -1.5889946222305298, + "logits/rejected": -1.6349446773529053, + "logps/chosen": -158.3022918701172, + "logps/rejected": -260.525634765625, + "loss": 0.1303, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06059791147708893, + "rewards/margins": 8.852926254272461, + "rewards/rejected": -8.913525581359863, + "step": 1832 + }, + { + "epoch": 2.4, + "learning_rate": 5.217172082564547e-06, + "logits/chosen": -1.7202104330062866, + "logits/rejected": -1.707026720046997, + "logps/chosen": -164.1764678955078, + "logps/rejected": -277.5565490722656, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09294569492340088, + "rewards/margins": 10.963340759277344, + "rewards/rejected": -10.870394706726074, + "step": 1833 + }, + { + "epoch": 2.4, + "learning_rate": 5.195285420346263e-06, + "logits/chosen": -1.2561309337615967, + "logits/rejected": -1.223847508430481, + "logps/chosen": -192.4342041015625, + "logps/rejected": -277.72406005859375, + "loss": 0.0882, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.4079866409301758, + "rewards/margins": 10.069986343383789, + "rewards/rejected": -9.66200065612793, + "step": 1834 + }, + { + "epoch": 2.4, + "learning_rate": 5.17343943871014e-06, + "logits/chosen": -1.8904907703399658, + "logits/rejected": -1.8836495876312256, + "logps/chosen": -194.80636596679688, + "logps/rejected": -307.4391784667969, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9263059496879578, + "rewards/margins": 11.095623970031738, + "rewards/rejected": -12.021929740905762, + "step": 1835 + }, + { + "epoch": 2.4, + "learning_rate": 5.151634182529691e-06, + "logits/chosen": -1.9277397394180298, + "logits/rejected": -1.8971800804138184, + "logps/chosen": -190.87405395507812, + "logps/rejected": -264.7363586425781, + "loss": 0.0881, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.05531910061836243, + "rewards/margins": 9.743350982666016, + "rewards/rejected": -9.688031196594238, + "step": 1836 + }, + { + "epoch": 2.4, + "learning_rate": 5.129869696594786e-06, + "logits/chosen": -1.776524305343628, + "logits/rejected": -1.8130015134811401, + "logps/chosen": -161.97628784179688, + "logps/rejected": -267.0655212402344, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8491145372390747, + "rewards/margins": 10.912199974060059, + "rewards/rejected": -11.76131534576416, + "step": 1837 + }, + { + "epoch": 2.41, + "learning_rate": 5.108146025611554e-06, + "logits/chosen": -1.7536497116088867, + "logits/rejected": -1.8303102254867554, + "logps/chosen": -171.8523712158203, + "logps/rejected": -312.10076904296875, + "loss": 0.0439, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.9070400595664978, + "rewards/margins": 12.459534645080566, + "rewards/rejected": -11.552495956420898, + "step": 1838 + }, + { + "epoch": 2.41, + "learning_rate": 5.086463214202264e-06, + "logits/chosen": -1.8125512599945068, + "logits/rejected": -1.7870056629180908, + "logps/chosen": -236.07394409179688, + "logps/rejected": -335.81866455078125, + "loss": 0.0442, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2829594314098358, + "rewards/margins": 9.968437194824219, + "rewards/rejected": -10.251396179199219, + "step": 1839 + }, + { + "epoch": 2.41, + "learning_rate": 5.064821306905288e-06, + "logits/chosen": -2.0149762630462646, + "logits/rejected": -2.046999216079712, + "logps/chosen": -267.974853515625, + "logps/rejected": -340.5155029296875, + "loss": 0.0875, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0345547199249268, + "rewards/margins": 7.42270040512085, + "rewards/rejected": -8.457255363464355, + "step": 1840 + }, + { + "epoch": 2.41, + "learning_rate": 5.043220348174945e-06, + "logits/chosen": -1.8509633541107178, + "logits/rejected": -1.9131324291229248, + "logps/chosen": -159.4285125732422, + "logps/rejected": -305.44219970703125, + "loss": 0.0444, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10141775012016296, + "rewards/margins": 12.238996505737305, + "rewards/rejected": -12.340413093566895, + "step": 1841 + }, + { + "epoch": 2.41, + "learning_rate": 5.021660382381457e-06, + "logits/chosen": -1.9455000162124634, + "logits/rejected": -1.926633358001709, + "logps/chosen": -196.45211791992188, + "logps/rejected": -264.18890380859375, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7082411050796509, + "rewards/margins": 8.62078857421875, + "rewards/rejected": -9.32903003692627, + "step": 1842 + }, + { + "epoch": 2.41, + "learning_rate": 5.000141453810847e-06, + "logits/chosen": -1.7795230150222778, + "logits/rejected": -1.8405141830444336, + "logps/chosen": -170.18780517578125, + "logps/rejected": -284.61712646484375, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3362077474594116, + "rewards/margins": 10.65847396850586, + "rewards/rejected": -10.322264671325684, + "step": 1843 + }, + { + "epoch": 2.41, + "learning_rate": 4.9786636066648436e-06, + "logits/chosen": -1.7431994676589966, + "logits/rejected": -1.7149559259414673, + "logps/chosen": -175.79371643066406, + "logps/rejected": -270.34393310546875, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.01590821146965027, + "rewards/margins": 9.766818046569824, + "rewards/rejected": -9.750909805297852, + "step": 1844 + }, + { + "epoch": 2.41, + "learning_rate": 4.957226885060779e-06, + "logits/chosen": -1.7376182079315186, + "logits/rejected": -1.6990633010864258, + "logps/chosen": -206.9566650390625, + "logps/rejected": -325.6643371582031, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8571552038192749, + "rewards/margins": 10.426507949829102, + "rewards/rejected": -11.28366470336914, + "step": 1845 + }, + { + "epoch": 2.42, + "learning_rate": 4.935831333031527e-06, + "logits/chosen": -1.857530951499939, + "logits/rejected": -1.8986061811447144, + "logps/chosen": -180.50323486328125, + "logps/rejected": -281.2354431152344, + "loss": 0.0871, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.6256273984909058, + "rewards/margins": 9.073454856872559, + "rewards/rejected": -8.447827339172363, + "step": 1846 + }, + { + "epoch": 2.42, + "learning_rate": 4.914476994525372e-06, + "logits/chosen": -1.6683006286621094, + "logits/rejected": -1.7726991176605225, + "logps/chosen": -173.82167053222656, + "logps/rejected": -269.3650817871094, + "loss": 0.1311, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.515027642250061, + "rewards/margins": 8.45280647277832, + "rewards/rejected": -8.967833518981934, + "step": 1847 + }, + { + "epoch": 2.42, + "learning_rate": 4.893163913405971e-06, + "logits/chosen": -1.6190377473831177, + "logits/rejected": -1.5506397485733032, + "logps/chosen": -268.7127685546875, + "logps/rejected": -363.58331298828125, + "loss": 0.0438, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3319658637046814, + "rewards/margins": 9.663751602172852, + "rewards/rejected": -9.995718002319336, + "step": 1848 + }, + { + "epoch": 2.42, + "learning_rate": 4.871892133452211e-06, + "logits/chosen": -1.8583685159683228, + "logits/rejected": -1.8825597763061523, + "logps/chosen": -191.40194702148438, + "logps/rejected": -314.5677795410156, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2103339433670044, + "rewards/margins": 11.047508239746094, + "rewards/rejected": -11.257842063903809, + "step": 1849 + }, + { + "epoch": 2.42, + "learning_rate": 4.850661698358156e-06, + "logits/chosen": -1.8790136575698853, + "logits/rejected": -1.9097334146499634, + "logps/chosen": -190.89657592773438, + "logps/rejected": -290.02325439453125, + "loss": 0.044, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5802582502365112, + "rewards/margins": 9.128975868225098, + "rewards/rejected": -9.709234237670898, + "step": 1850 + }, + { + "epoch": 2.42, + "learning_rate": 4.8294726517329496e-06, + "logits/chosen": -1.7216588258743286, + "logits/rejected": -1.771230936050415, + "logps/chosen": -179.63800048828125, + "logps/rejected": -317.079345703125, + "loss": 0.0441, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.014571011066436768, + "rewards/margins": 12.362967491149902, + "rewards/rejected": -12.377538681030273, + "step": 1851 + }, + { + "epoch": 2.42, + "learning_rate": 4.808325037100691e-06, + "logits/chosen": -1.2767850160598755, + "logits/rejected": -1.3450748920440674, + "logps/chosen": -186.95932006835938, + "logps/rejected": -286.52520751953125, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4472254514694214, + "rewards/margins": 10.150073051452637, + "rewards/rejected": -9.702848434448242, + "step": 1852 + }, + { + "epoch": 2.42, + "learning_rate": 4.787218897900403e-06, + "logits/chosen": -1.91081964969635, + "logits/rejected": -1.9476604461669922, + "logps/chosen": -230.13436889648438, + "logps/rejected": -346.1830749511719, + "loss": 0.1302, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5067952275276184, + "rewards/margins": 11.242530822753906, + "rewards/rejected": -11.7493257522583, + "step": 1853 + }, + { + "epoch": 2.43, + "learning_rate": 4.766154277485915e-06, + "logits/chosen": -1.63359534740448, + "logits/rejected": -1.6596496105194092, + "logps/chosen": -174.63385009765625, + "logps/rejected": -338.9428405761719, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3271731436252594, + "rewards/margins": 12.626646995544434, + "rewards/rejected": -12.29947280883789, + "step": 1854 + }, + { + "epoch": 2.43, + "learning_rate": 4.745131219125748e-06, + "logits/chosen": -1.613443374633789, + "logits/rejected": -1.637129545211792, + "logps/chosen": -170.82139587402344, + "logps/rejected": -277.8330993652344, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.853659451007843, + "rewards/margins": 9.57777214050293, + "rewards/rejected": -10.431431770324707, + "step": 1855 + }, + { + "epoch": 2.43, + "learning_rate": 4.7241497660030744e-06, + "logits/chosen": -1.7482025623321533, + "logits/rejected": -1.7822033166885376, + "logps/chosen": -213.3333740234375, + "logps/rejected": -315.5164489746094, + "loss": 0.0445, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9184327125549316, + "rewards/margins": 9.542754173278809, + "rewards/rejected": -11.461186408996582, + "step": 1856 + }, + { + "epoch": 2.43, + "learning_rate": 4.703209961215607e-06, + "logits/chosen": -1.9127970933914185, + "logits/rejected": -1.9124560356140137, + "logps/chosen": -153.39889526367188, + "logps/rejected": -270.3568115234375, + "loss": 0.0868, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6635768413543701, + "rewards/margins": 9.972073554992676, + "rewards/rejected": -9.308496475219727, + "step": 1857 + }, + { + "epoch": 2.43, + "learning_rate": 4.682311847775489e-06, + "logits/chosen": -1.6051024198532104, + "logits/rejected": -1.6780390739440918, + "logps/chosen": -177.33233642578125, + "logps/rejected": -313.31011962890625, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6623866558074951, + "rewards/margins": 10.764513969421387, + "rewards/rejected": -11.426900863647461, + "step": 1858 + }, + { + "epoch": 2.43, + "learning_rate": 4.661455468609235e-06, + "logits/chosen": -1.9415199756622314, + "logits/rejected": -1.9421744346618652, + "logps/chosen": -193.20706176757812, + "logps/rejected": -283.1640625, + "loss": 0.0438, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9550546407699585, + "rewards/margins": 9.1715087890625, + "rewards/rejected": -10.12656307220459, + "step": 1859 + }, + { + "epoch": 2.43, + "learning_rate": 4.640640866557644e-06, + "logits/chosen": -1.6554899215698242, + "logits/rejected": -1.673490047454834, + "logps/chosen": -193.56585693359375, + "logps/rejected": -310.44677734375, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5918046236038208, + "rewards/margins": 10.593010902404785, + "rewards/rejected": -11.184816360473633, + "step": 1860 + }, + { + "epoch": 2.44, + "learning_rate": 4.6198680843756975e-06, + "logits/chosen": -1.8800220489501953, + "logits/rejected": -1.9442106485366821, + "logps/chosen": -171.99818420410156, + "logps/rejected": -290.9822998046875, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.32778018712997437, + "rewards/margins": 12.33488655090332, + "rewards/rejected": -12.007104873657227, + "step": 1861 + }, + { + "epoch": 2.44, + "learning_rate": 4.599137164732464e-06, + "logits/chosen": -1.7752525806427002, + "logits/rejected": -1.7977432012557983, + "logps/chosen": -184.8430633544922, + "logps/rejected": -262.4587707519531, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.223883718252182, + "rewards/margins": 9.400529861450195, + "rewards/rejected": -9.624414443969727, + "step": 1862 + }, + { + "epoch": 2.44, + "learning_rate": 4.578448150211026e-06, + "logits/chosen": -1.6502456665039062, + "logits/rejected": -1.6839553117752075, + "logps/chosen": -188.94287109375, + "logps/rejected": -326.7992248535156, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3275027573108673, + "rewards/margins": 10.024675369262695, + "rewards/rejected": -10.352178573608398, + "step": 1863 + }, + { + "epoch": 2.44, + "learning_rate": 4.557801083308403e-06, + "logits/chosen": -1.8393663167953491, + "logits/rejected": -1.8291668891906738, + "logps/chosen": -201.91502380371094, + "logps/rejected": -307.2249755859375, + "loss": 0.0868, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2956475019454956, + "rewards/margins": 9.467047691345215, + "rewards/rejected": -9.7626953125, + "step": 1864 + }, + { + "epoch": 2.44, + "learning_rate": 4.53719600643544e-06, + "logits/chosen": -1.8593618869781494, + "logits/rejected": -1.8590339422225952, + "logps/chosen": -183.19473266601562, + "logps/rejected": -251.8839569091797, + "loss": 0.0449, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2952789068222046, + "rewards/margins": 8.777321815490723, + "rewards/rejected": -8.48204231262207, + "step": 1865 + }, + { + "epoch": 2.44, + "learning_rate": 4.516632961916722e-06, + "logits/chosen": -1.859098196029663, + "logits/rejected": -1.8674598932266235, + "logps/chosen": -201.6370849609375, + "logps/rejected": -316.97705078125, + "loss": 0.0495, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.20135939121246338, + "rewards/margins": 10.37714958190918, + "rewards/rejected": -10.578510284423828, + "step": 1866 + }, + { + "epoch": 2.44, + "learning_rate": 4.496111991990518e-06, + "logits/chosen": -1.7154260873794556, + "logits/rejected": -1.7362290620803833, + "logps/chosen": -173.26937866210938, + "logps/rejected": -265.31231689453125, + "loss": 0.0444, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.0907449722290039, + "rewards/margins": 9.66717529296875, + "rewards/rejected": -9.57642936706543, + "step": 1867 + }, + { + "epoch": 2.44, + "learning_rate": 4.475633138808663e-06, + "logits/chosen": -1.686737298965454, + "logits/rejected": -1.7819643020629883, + "logps/chosen": -220.27188110351562, + "logps/rejected": -357.5853271484375, + "loss": 0.0434, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9254104495048523, + "rewards/margins": 11.16162395477295, + "rewards/rejected": -12.08703327178955, + "step": 1868 + }, + { + "epoch": 2.45, + "learning_rate": 4.45519644443646e-06, + "logits/chosen": -1.5758470296859741, + "logits/rejected": -1.535646677017212, + "logps/chosen": -179.5104217529297, + "logps/rejected": -254.15135192871094, + "loss": 0.0459, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.241140678524971, + "rewards/margins": 8.851436614990234, + "rewards/rejected": -9.092577934265137, + "step": 1869 + }, + { + "epoch": 2.45, + "learning_rate": 4.434801950852644e-06, + "logits/chosen": -1.647027611732483, + "logits/rejected": -1.7314369678497314, + "logps/chosen": -169.18585205078125, + "logps/rejected": -309.4095458984375, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4482993483543396, + "rewards/margins": 12.924880981445312, + "rewards/rejected": -12.476581573486328, + "step": 1870 + }, + { + "epoch": 2.45, + "learning_rate": 4.414449699949255e-06, + "logits/chosen": -1.6599794626235962, + "logits/rejected": -1.6580963134765625, + "logps/chosen": -183.55618286132812, + "logps/rejected": -270.4605712890625, + "loss": 0.1739, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.30090123414993286, + "rewards/margins": 8.886276245117188, + "rewards/rejected": -9.187177658081055, + "step": 1871 + }, + { + "epoch": 2.45, + "learning_rate": 4.394139733531555e-06, + "logits/chosen": -1.7219959497451782, + "logits/rejected": -1.7718147039413452, + "logps/chosen": -208.61968994140625, + "logps/rejected": -329.607177734375, + "loss": 0.1301, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7620341777801514, + "rewards/margins": 9.17745590209961, + "rewards/rejected": -9.93949031829834, + "step": 1872 + }, + { + "epoch": 2.45, + "learning_rate": 4.373872093317965e-06, + "logits/chosen": -1.813781499862671, + "logits/rejected": -1.850644826889038, + "logps/chosen": -166.1450653076172, + "logps/rejected": -290.51116943359375, + "loss": 0.0434, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.9715732932090759, + "rewards/margins": 12.873713493347168, + "rewards/rejected": -11.902140617370605, + "step": 1873 + }, + { + "epoch": 2.45, + "learning_rate": 4.353646820939944e-06, + "logits/chosen": -1.8622902631759644, + "logits/rejected": -1.921432614326477, + "logps/chosen": -171.5963592529297, + "logps/rejected": -287.3878479003906, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9964441061019897, + "rewards/margins": 12.732107162475586, + "rewards/rejected": -11.735663414001465, + "step": 1874 + }, + { + "epoch": 2.45, + "learning_rate": 4.333463957941952e-06, + "logits/chosen": -1.7719342708587646, + "logits/rejected": -1.7790818214416504, + "logps/chosen": -155.32640075683594, + "logps/rejected": -258.50689697265625, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16835889220237732, + "rewards/margins": 10.497770309448242, + "rewards/rejected": -10.329410552978516, + "step": 1875 + }, + { + "epoch": 2.46, + "learning_rate": 4.313323545781306e-06, + "logits/chosen": -1.8225802183151245, + "logits/rejected": -1.8765015602111816, + "logps/chosen": -196.75241088867188, + "logps/rejected": -290.4941101074219, + "loss": 0.1303, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.7300618886947632, + "rewards/margins": 9.704621315002441, + "rewards/rejected": -8.97455883026123, + "step": 1876 + }, + { + "epoch": 2.46, + "learning_rate": 4.293225625828143e-06, + "logits/chosen": -1.7383315563201904, + "logits/rejected": -1.7423555850982666, + "logps/chosen": -181.74229431152344, + "logps/rejected": -297.0148620605469, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3948476314544678, + "rewards/margins": 11.338163375854492, + "rewards/rejected": -11.733011245727539, + "step": 1877 + }, + { + "epoch": 2.46, + "learning_rate": 4.273170239365323e-06, + "logits/chosen": -1.5229793787002563, + "logits/rejected": -1.5268354415893555, + "logps/chosen": -181.29904174804688, + "logps/rejected": -275.0649108886719, + "loss": 0.0449, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6714894771575928, + "rewards/margins": 8.326661109924316, + "rewards/rejected": -9.998150825500488, + "step": 1878 + }, + { + "epoch": 2.46, + "learning_rate": 4.253157427588325e-06, + "logits/chosen": -1.614930510520935, + "logits/rejected": -1.599328875541687, + "logps/chosen": -165.44772338867188, + "logps/rejected": -230.71478271484375, + "loss": 0.1313, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2349449396133423, + "rewards/margins": 7.450197219848633, + "rewards/rejected": -8.685142517089844, + "step": 1879 + }, + { + "epoch": 2.46, + "learning_rate": 4.233187231605173e-06, + "logits/chosen": -1.7556192874908447, + "logits/rejected": -1.6957415342330933, + "logps/chosen": -149.04718017578125, + "logps/rejected": -246.29391479492188, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25145241618156433, + "rewards/margins": 9.897075653076172, + "rewards/rejected": -10.148529052734375, + "step": 1880 + }, + { + "epoch": 2.46, + "learning_rate": 4.213259692436367e-06, + "logits/chosen": -1.7649939060211182, + "logits/rejected": -1.819460391998291, + "logps/chosen": -183.86708068847656, + "logps/rejected": -304.8548889160156, + "loss": 0.087, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9879122972488403, + "rewards/margins": 9.6655912399292, + "rewards/rejected": -10.653504371643066, + "step": 1881 + }, + { + "epoch": 2.46, + "learning_rate": 4.193374851014789e-06, + "logits/chosen": -1.7411935329437256, + "logits/rejected": -1.714906930923462, + "logps/chosen": -172.36880493164062, + "logps/rejected": -267.7445068359375, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.631706953048706, + "rewards/margins": 11.342836380004883, + "rewards/rejected": -9.711129188537598, + "step": 1882 + }, + { + "epoch": 2.46, + "learning_rate": 4.1735327481855965e-06, + "logits/chosen": -1.638293981552124, + "logits/rejected": -1.6485520601272583, + "logps/chosen": -226.15577697753906, + "logps/rejected": -323.80963134765625, + "loss": 0.0454, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5325433015823364, + "rewards/margins": 9.555930137634277, + "rewards/rejected": -10.08847427368164, + "step": 1883 + }, + { + "epoch": 2.47, + "learning_rate": 4.153733424706183e-06, + "logits/chosen": -1.7929471731185913, + "logits/rejected": -1.8844083547592163, + "logps/chosen": -172.9356689453125, + "logps/rejected": -270.8924560546875, + "loss": 0.0445, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3643515408039093, + "rewards/margins": 9.834647178649902, + "rewards/rejected": -10.198999404907227, + "step": 1884 + }, + { + "epoch": 2.47, + "learning_rate": 4.13397692124605e-06, + "logits/chosen": -1.767879843711853, + "logits/rejected": -1.7298154830932617, + "logps/chosen": -160.33436584472656, + "logps/rejected": -264.5224609375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9149874448776245, + "rewards/margins": 10.589787483215332, + "rewards/rejected": -9.674798965454102, + "step": 1885 + }, + { + "epoch": 2.47, + "learning_rate": 4.114263278386743e-06, + "logits/chosen": -1.675075888633728, + "logits/rejected": -1.7658896446228027, + "logps/chosen": -177.25686645507812, + "logps/rejected": -267.1629333496094, + "loss": 0.0437, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.060354623943567276, + "rewards/margins": 9.187674522399902, + "rewards/rejected": -9.248029708862305, + "step": 1886 + }, + { + "epoch": 2.47, + "learning_rate": 4.09459253662178e-06, + "logits/chosen": -1.7416391372680664, + "logits/rejected": -1.7353601455688477, + "logps/chosen": -217.3726043701172, + "logps/rejected": -321.947509765625, + "loss": 0.045, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6451941728591919, + "rewards/margins": 10.958710670471191, + "rewards/rejected": -11.603903770446777, + "step": 1887 + }, + { + "epoch": 2.47, + "learning_rate": 4.074964736356563e-06, + "logits/chosen": -1.661785364151001, + "logits/rejected": -1.7497574090957642, + "logps/chosen": -247.2830810546875, + "logps/rejected": -342.251220703125, + "loss": 0.0448, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2880454957485199, + "rewards/margins": 9.622705459594727, + "rewards/rejected": -9.910751342773438, + "step": 1888 + }, + { + "epoch": 2.47, + "learning_rate": 4.055379917908258e-06, + "logits/chosen": -1.8370734453201294, + "logits/rejected": -1.8688035011291504, + "logps/chosen": -149.69937133789062, + "logps/rejected": -275.235595703125, + "loss": 0.0439, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.003645420074463, + "rewards/margins": 11.699909210205078, + "rewards/rejected": -10.696264266967773, + "step": 1889 + }, + { + "epoch": 2.47, + "learning_rate": 4.035838121505778e-06, + "logits/chosen": -1.8958244323730469, + "logits/rejected": -1.9296478033065796, + "logps/chosen": -152.5163116455078, + "logps/rejected": -269.3370666503906, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7204244136810303, + "rewards/margins": 11.035517692565918, + "rewards/rejected": -10.315093040466309, + "step": 1890 + }, + { + "epoch": 2.47, + "learning_rate": 4.016339387289636e-06, + "logits/chosen": -1.4891737699508667, + "logits/rejected": -1.5125484466552734, + "logps/chosen": -171.88983154296875, + "logps/rejected": -280.478515625, + "loss": 0.0438, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2287459522485733, + "rewards/margins": 10.938657760620117, + "rewards/rejected": -10.709912300109863, + "step": 1891 + }, + { + "epoch": 2.48, + "learning_rate": 3.996883755311917e-06, + "logits/chosen": -1.8915555477142334, + "logits/rejected": -1.9547946453094482, + "logps/chosen": -174.96197509765625, + "logps/rejected": -298.06884765625, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10064416378736496, + "rewards/margins": 11.849396705627441, + "rewards/rejected": -11.74875259399414, + "step": 1892 + }, + { + "epoch": 2.48, + "learning_rate": 3.977471265536142e-06, + "logits/chosen": -1.9358724355697632, + "logits/rejected": -2.017740249633789, + "logps/chosen": -146.5030517578125, + "logps/rejected": -236.8922882080078, + "loss": 0.0873, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4964335858821869, + "rewards/margins": 7.8269362449646, + "rewards/rejected": -8.323370933532715, + "step": 1893 + }, + { + "epoch": 2.48, + "learning_rate": 3.95810195783724e-06, + "logits/chosen": -1.9090484380722046, + "logits/rejected": -1.98114013671875, + "logps/chosen": -165.24191284179688, + "logps/rejected": -305.91796875, + "loss": 0.0438, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.4551979899406433, + "rewards/margins": 12.494522094726562, + "rewards/rejected": -12.039323806762695, + "step": 1894 + }, + { + "epoch": 2.48, + "learning_rate": 3.938775872001441e-06, + "logits/chosen": -1.5879120826721191, + "logits/rejected": -1.597350835800171, + "logps/chosen": -243.751220703125, + "logps/rejected": -302.36865234375, + "loss": 0.2171, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2877172231674194, + "rewards/margins": 5.655614376068115, + "rewards/rejected": -6.943331718444824, + "step": 1895 + }, + { + "epoch": 2.48, + "learning_rate": 3.919493047726156e-06, + "logits/chosen": -1.8299814462661743, + "logits/rejected": -1.8792191743850708, + "logps/chosen": -187.0034942626953, + "logps/rejected": -304.144775390625, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.632046103477478, + "rewards/margins": 10.175278663635254, + "rewards/rejected": -10.807324409484863, + "step": 1896 + }, + { + "epoch": 2.48, + "learning_rate": 3.900253524619973e-06, + "logits/chosen": -1.8866732120513916, + "logits/rejected": -1.941267967224121, + "logps/chosen": -202.72264099121094, + "logps/rejected": -312.6195983886719, + "loss": 0.0875, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16807635128498077, + "rewards/margins": 10.341617584228516, + "rewards/rejected": -10.509692192077637, + "step": 1897 + }, + { + "epoch": 2.48, + "learning_rate": 3.881057342202532e-06, + "logits/chosen": -1.5389578342437744, + "logits/rejected": -1.57792329788208, + "logps/chosen": -242.33811950683594, + "logps/rejected": -371.6668701171875, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0891375541687012, + "rewards/margins": 13.010469436645508, + "rewards/rejected": -14.099605560302734, + "step": 1898 + }, + { + "epoch": 2.49, + "learning_rate": 3.861904539904421e-06, + "logits/chosen": -1.619484782218933, + "logits/rejected": -1.5936853885650635, + "logps/chosen": -198.81809997558594, + "logps/rejected": -314.9617614746094, + "loss": 0.0439, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3503853678703308, + "rewards/margins": 11.024124145507812, + "rewards/rejected": -10.673737525939941, + "step": 1899 + }, + { + "epoch": 2.49, + "learning_rate": 3.842795157067147e-06, + "logits/chosen": -1.8745231628417969, + "logits/rejected": -1.9318552017211914, + "logps/chosen": -189.03311157226562, + "logps/rejected": -312.5460510253906, + "loss": 0.0449, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4822556972503662, + "rewards/margins": 11.277061462402344, + "rewards/rejected": -11.759316444396973, + "step": 1900 + }, + { + "epoch": 2.49, + "learning_rate": 3.823729232943027e-06, + "logits/chosen": -1.926098346710205, + "logits/rejected": -1.9408183097839355, + "logps/chosen": -167.53396606445312, + "logps/rejected": -285.34368896484375, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.005785465240478516, + "rewards/margins": 10.646468162536621, + "rewards/rejected": -10.640682220458984, + "step": 1901 + }, + { + "epoch": 2.49, + "learning_rate": 3.804706806695099e-06, + "logits/chosen": -1.4987002611160278, + "logits/rejected": -1.4628316164016724, + "logps/chosen": -234.7961883544922, + "logps/rejected": -341.6724853515625, + "loss": 0.1738, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3272281885147095, + "rewards/margins": 9.002359390258789, + "rewards/rejected": -10.329587936401367, + "step": 1902 + }, + { + "epoch": 2.49, + "learning_rate": 3.785727917397047e-06, + "logits/chosen": -1.7174509763717651, + "logits/rejected": -1.849609136581421, + "logps/chosen": -164.53118896484375, + "logps/rejected": -253.94622802734375, + "loss": 0.0164, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.545985221862793, + "rewards/margins": 10.138985633850098, + "rewards/rejected": -9.593000411987305, + "step": 1903 + }, + { + "epoch": 2.49, + "learning_rate": 3.7667926040331507e-06, + "logits/chosen": -1.7833516597747803, + "logits/rejected": -1.8712725639343262, + "logps/chosen": -186.0061798095703, + "logps/rejected": -294.9945068359375, + "loss": 0.0436, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.1978912055492401, + "rewards/margins": 9.99798583984375, + "rewards/rejected": -9.800094604492188, + "step": 1904 + }, + { + "epoch": 2.49, + "learning_rate": 3.7479009054981667e-06, + "logits/chosen": -1.6857854127883911, + "logits/rejected": -1.676038384437561, + "logps/chosen": -163.19297790527344, + "logps/rejected": -274.4587707519531, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3204114437103271, + "rewards/margins": 12.276033401489258, + "rewards/rejected": -10.955621719360352, + "step": 1905 + }, + { + "epoch": 2.49, + "learning_rate": 3.7290528605972625e-06, + "logits/chosen": -1.6295102834701538, + "logits/rejected": -1.5930213928222656, + "logps/chosen": -184.6400146484375, + "logps/rejected": -320.7083740234375, + "loss": 0.0442, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.46224215626716614, + "rewards/margins": 11.438702583312988, + "rewards/rejected": -11.900945663452148, + "step": 1906 + }, + { + "epoch": 2.5, + "learning_rate": 3.7102485080459328e-06, + "logits/chosen": -1.7548762559890747, + "logits/rejected": -1.760652780532837, + "logps/chosen": -164.15533447265625, + "logps/rejected": -256.257080078125, + "loss": 0.0591, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.35785847902297974, + "rewards/margins": 9.393709182739258, + "rewards/rejected": -9.751567840576172, + "step": 1907 + }, + { + "epoch": 2.5, + "learning_rate": 3.6914878864699326e-06, + "logits/chosen": -1.6416983604431152, + "logits/rejected": -1.5843173265457153, + "logps/chosen": -192.15359497070312, + "logps/rejected": -300.02392578125, + "loss": 0.0437, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.5073217153549194, + "rewards/margins": 12.17239761352539, + "rewards/rejected": -11.665075302124023, + "step": 1908 + }, + { + "epoch": 2.5, + "learning_rate": 3.672771034405195e-06, + "logits/chosen": -1.4242991209030151, + "logits/rejected": -1.4468224048614502, + "logps/chosen": -175.81768798828125, + "logps/rejected": -265.72723388671875, + "loss": 0.0915, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7085676193237305, + "rewards/margins": 7.678744792938232, + "rewards/rejected": -9.387312889099121, + "step": 1909 + }, + { + "epoch": 2.5, + "learning_rate": 3.654097990297731e-06, + "logits/chosen": -1.9014421701431274, + "logits/rejected": -1.8719252347946167, + "logps/chosen": -186.36656188964844, + "logps/rejected": -275.4283447265625, + "loss": 0.0874, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6154063940048218, + "rewards/margins": 8.910415649414062, + "rewards/rejected": -9.525823593139648, + "step": 1910 + }, + { + "epoch": 2.5, + "learning_rate": 3.6354687925035743e-06, + "logits/chosen": -1.6337980031967163, + "logits/rejected": -1.6202781200408936, + "logps/chosen": -167.40005493164062, + "logps/rejected": -274.8016662597656, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05893923342227936, + "rewards/margins": 10.111628532409668, + "rewards/rejected": -10.170568466186523, + "step": 1911 + }, + { + "epoch": 2.5, + "learning_rate": 3.6168834792887103e-06, + "logits/chosen": -1.4802531003952026, + "logits/rejected": -1.5262775421142578, + "logps/chosen": -195.7445068359375, + "logps/rejected": -304.8841247558594, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7530359625816345, + "rewards/margins": 10.688370704650879, + "rewards/rejected": -11.44140625, + "step": 1912 + }, + { + "epoch": 2.5, + "learning_rate": 3.598342088828943e-06, + "logits/chosen": -1.817293643951416, + "logits/rejected": -1.852118730545044, + "logps/chosen": -170.70130920410156, + "logps/rejected": -237.1297607421875, + "loss": 0.0875, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.44093331694602966, + "rewards/margins": 7.5221757888793945, + "rewards/rejected": -7.963109016418457, + "step": 1913 + }, + { + "epoch": 2.5, + "learning_rate": 3.5798446592098883e-06, + "logits/chosen": -2.0670952796936035, + "logits/rejected": -2.092545747756958, + "logps/chosen": -148.00514221191406, + "logps/rejected": -257.2777099609375, + "loss": 0.044, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3789345920085907, + "rewards/margins": 9.679420471191406, + "rewards/rejected": -9.300485610961914, + "step": 1914 + }, + { + "epoch": 2.51, + "learning_rate": 3.561391228426861e-06, + "logits/chosen": -1.789574146270752, + "logits/rejected": -1.8082585334777832, + "logps/chosen": -172.7511749267578, + "logps/rejected": -260.76611328125, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3660593032836914, + "rewards/margins": 9.336506843566895, + "rewards/rejected": -8.970447540283203, + "step": 1915 + }, + { + "epoch": 2.51, + "learning_rate": 3.542981834384776e-06, + "logits/chosen": -1.8688348531723022, + "logits/rejected": -1.8891730308532715, + "logps/chosen": -146.59556579589844, + "logps/rejected": -271.8514709472656, + "loss": 0.0437, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.45870935916900635, + "rewards/margins": 11.440893173217773, + "rewards/rejected": -10.982184410095215, + "step": 1916 + }, + { + "epoch": 2.51, + "learning_rate": 3.5246165148981214e-06, + "logits/chosen": -1.6758280992507935, + "logits/rejected": -1.700160264968872, + "logps/chosen": -161.1240997314453, + "logps/rejected": -249.37840270996094, + "loss": 0.0444, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5361191034317017, + "rewards/margins": 7.293102741241455, + "rewards/rejected": -7.829222679138184, + "step": 1917 + }, + { + "epoch": 2.51, + "learning_rate": 3.5062953076908268e-06, + "logits/chosen": -1.870947003364563, + "logits/rejected": -1.844364881515503, + "logps/chosen": -175.71371459960938, + "logps/rejected": -266.68475341796875, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4167851209640503, + "rewards/margins": 9.747062683105469, + "rewards/rejected": -10.163846969604492, + "step": 1918 + }, + { + "epoch": 2.51, + "learning_rate": 3.488018250396233e-06, + "logits/chosen": -1.7600646018981934, + "logits/rejected": -1.778717041015625, + "logps/chosen": -140.50103759765625, + "logps/rejected": -258.70440673828125, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4124709367752075, + "rewards/margins": 10.38486099243164, + "rewards/rejected": -9.972389221191406, + "step": 1919 + }, + { + "epoch": 2.51, + "learning_rate": 3.4697853805569696e-06, + "logits/chosen": -1.9057642221450806, + "logits/rejected": -1.915806770324707, + "logps/chosen": -195.60000610351562, + "logps/rejected": -301.9524230957031, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5150822401046753, + "rewards/margins": 10.301156044006348, + "rewards/rejected": -10.816238403320312, + "step": 1920 + }, + { + "epoch": 2.51, + "learning_rate": 3.4515967356249263e-06, + "logits/chosen": -1.905592679977417, + "logits/rejected": -1.924851894378662, + "logps/chosen": -180.60174560546875, + "logps/rejected": -312.2458190917969, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1700207144021988, + "rewards/margins": 12.216723442077637, + "rewards/rejected": -12.04670238494873, + "step": 1921 + }, + { + "epoch": 2.52, + "learning_rate": 3.4334523529611416e-06, + "logits/chosen": -1.9868727922439575, + "logits/rejected": -1.9933255910873413, + "logps/chosen": -153.79443359375, + "logps/rejected": -256.97705078125, + "loss": 0.0878, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.16693884134292603, + "rewards/margins": 10.073770523071289, + "rewards/rejected": -10.24070930480957, + "step": 1922 + }, + { + "epoch": 2.52, + "learning_rate": 3.415352269835731e-06, + "logits/chosen": -1.7813420295715332, + "logits/rejected": -1.8248730897903442, + "logps/chosen": -186.0016632080078, + "logps/rejected": -301.5447082519531, + "loss": 0.044, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.4462379515171051, + "rewards/margins": 10.673506736755371, + "rewards/rejected": -10.227270126342773, + "step": 1923 + }, + { + "epoch": 2.52, + "learning_rate": 3.3972965234278065e-06, + "logits/chosen": -1.8749020099639893, + "logits/rejected": -1.8478282690048218, + "logps/chosen": -263.7992858886719, + "logps/rejected": -335.599365234375, + "loss": 0.0879, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.13168469071388245, + "rewards/margins": 9.476238250732422, + "rewards/rejected": -9.344552993774414, + "step": 1924 + }, + { + "epoch": 2.52, + "learning_rate": 3.379285150825434e-06, + "logits/chosen": -1.792923092842102, + "logits/rejected": -1.8523764610290527, + "logps/chosen": -174.39688110351562, + "logps/rejected": -273.3518371582031, + "loss": 0.0462, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6847731471061707, + "rewards/margins": 8.920166015625, + "rewards/rejected": -9.604938507080078, + "step": 1925 + }, + { + "epoch": 2.52, + "learning_rate": 3.3613181890255056e-06, + "logits/chosen": -1.6196924448013306, + "logits/rejected": -1.6110302209854126, + "logps/chosen": -170.70640563964844, + "logps/rejected": -262.6854248046875, + "loss": 0.0443, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2520296275615692, + "rewards/margins": 9.9801607131958, + "rewards/rejected": -9.728132247924805, + "step": 1926 + }, + { + "epoch": 2.52, + "learning_rate": 3.343395674933711e-06, + "logits/chosen": -1.7001302242279053, + "logits/rejected": -1.6758153438568115, + "logps/chosen": -169.02830505371094, + "logps/rejected": -293.9111328125, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.697690486907959, + "rewards/margins": 11.347500801086426, + "rewards/rejected": -10.649810791015625, + "step": 1927 + }, + { + "epoch": 2.52, + "learning_rate": 3.325517645364429e-06, + "logits/chosen": -1.7251543998718262, + "logits/rejected": -1.7814052104949951, + "logps/chosen": -186.59817504882812, + "logps/rejected": -272.627197265625, + "loss": 0.0871, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7251887321472168, + "rewards/margins": 7.931783676147461, + "rewards/rejected": -8.65697193145752, + "step": 1928 + }, + { + "epoch": 2.52, + "learning_rate": 3.3076841370406674e-06, + "logits/chosen": -1.803941011428833, + "logits/rejected": -1.8881585597991943, + "logps/chosen": -205.92811584472656, + "logps/rejected": -285.8006286621094, + "loss": 0.0479, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8002499341964722, + "rewards/margins": 9.741439819335938, + "rewards/rejected": -10.541690826416016, + "step": 1929 + }, + { + "epoch": 2.53, + "learning_rate": 3.289895186593972e-06, + "logits/chosen": -1.6815202236175537, + "logits/rejected": -1.7056907415390015, + "logps/chosen": -217.7075653076172, + "logps/rejected": -343.75299072265625, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2145544290542603, + "rewards/margins": 11.209775924682617, + "rewards/rejected": -12.424328804016113, + "step": 1930 + }, + { + "epoch": 2.53, + "learning_rate": 3.27215083056438e-06, + "logits/chosen": -1.8623905181884766, + "logits/rejected": -1.8538012504577637, + "logps/chosen": -180.3714599609375, + "logps/rejected": -281.34619140625, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1875349283218384, + "rewards/margins": 10.12769889831543, + "rewards/rejected": -11.315234184265137, + "step": 1931 + }, + { + "epoch": 2.53, + "learning_rate": 3.2544511054003246e-06, + "logits/chosen": -1.755469799041748, + "logits/rejected": -1.7369188070297241, + "logps/chosen": -171.15005493164062, + "logps/rejected": -261.5460205078125, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3887454867362976, + "rewards/margins": 9.103650093078613, + "rewards/rejected": -9.492395401000977, + "step": 1932 + }, + { + "epoch": 2.53, + "learning_rate": 3.2367960474585458e-06, + "logits/chosen": -1.4968432188034058, + "logits/rejected": -1.5547635555267334, + "logps/chosen": -191.1721954345703, + "logps/rejected": -309.1034851074219, + "loss": 0.0439, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2648869454860687, + "rewards/margins": 11.034895896911621, + "rewards/rejected": -11.299783706665039, + "step": 1933 + }, + { + "epoch": 2.53, + "learning_rate": 3.2191856930040646e-06, + "logits/chosen": -1.8915550708770752, + "logits/rejected": -1.8476980924606323, + "logps/chosen": -168.68878173828125, + "logps/rejected": -254.1374969482422, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19241183996200562, + "rewards/margins": 9.465287208557129, + "rewards/rejected": -9.272875785827637, + "step": 1934 + }, + { + "epoch": 2.53, + "learning_rate": 3.2016200782100436e-06, + "logits/chosen": -1.7124667167663574, + "logits/rejected": -1.7377973794937134, + "logps/chosen": -180.88888549804688, + "logps/rejected": -302.8004150390625, + "loss": 0.044, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3677710294723511, + "rewards/margins": 9.91649341583252, + "rewards/rejected": -10.284265518188477, + "step": 1935 + }, + { + "epoch": 2.53, + "learning_rate": 3.18409923915777e-06, + "logits/chosen": -1.6981903314590454, + "logits/rejected": -1.731123685836792, + "logps/chosen": -150.52392578125, + "logps/rejected": -265.4391174316406, + "loss": 0.0439, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3567240536212921, + "rewards/margins": 9.501225471496582, + "rewards/rejected": -9.857950210571289, + "step": 1936 + }, + { + "epoch": 2.53, + "learning_rate": 3.1666232118365474e-06, + "logits/chosen": -1.9240161180496216, + "logits/rejected": -1.889585256576538, + "logps/chosen": -218.81527709960938, + "logps/rejected": -316.5502624511719, + "loss": 0.0199, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.614442765712738, + "rewards/margins": 11.967485427856445, + "rewards/rejected": -11.353042602539062, + "step": 1937 + }, + { + "epoch": 2.54, + "learning_rate": 3.1491920321436303e-06, + "logits/chosen": -1.8586318492889404, + "logits/rejected": -1.7816364765167236, + "logps/chosen": -197.2941131591797, + "logps/rejected": -301.5270080566406, + "loss": 0.0439, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4512920379638672, + "rewards/margins": 10.285028457641602, + "rewards/rejected": -11.736321449279785, + "step": 1938 + }, + { + "epoch": 2.54, + "learning_rate": 3.1318057358841745e-06, + "logits/chosen": -1.6994396448135376, + "logits/rejected": -1.7352485656738281, + "logps/chosen": -151.56106567382812, + "logps/rejected": -252.24249267578125, + "loss": 0.045, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.22541078925132751, + "rewards/margins": 9.173171997070312, + "rewards/rejected": -8.947761535644531, + "step": 1939 + }, + { + "epoch": 2.54, + "learning_rate": 3.114464358771102e-06, + "logits/chosen": -1.905403733253479, + "logits/rejected": -1.962978482246399, + "logps/chosen": -145.729248046875, + "logps/rejected": -254.26089477539062, + "loss": 0.0477, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.5973153114318848, + "rewards/margins": 9.859082221984863, + "rewards/rejected": -9.261768341064453, + "step": 1940 + }, + { + "epoch": 2.54, + "learning_rate": 3.097167936425094e-06, + "logits/chosen": -1.6754268407821655, + "logits/rejected": -1.620928168296814, + "logps/chosen": -199.9144287109375, + "logps/rejected": -298.030029296875, + "loss": 0.051, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8563889861106873, + "rewards/margins": 9.424233436584473, + "rewards/rejected": -10.280622482299805, + "step": 1941 + }, + { + "epoch": 2.54, + "learning_rate": 3.079916504374494e-06, + "logits/chosen": -1.8336161375045776, + "logits/rejected": -1.7605853080749512, + "logps/chosen": -196.23410034179688, + "logps/rejected": -258.1279296875, + "loss": 0.1048, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6195340156555176, + "rewards/margins": 7.379644870758057, + "rewards/rejected": -8.999178886413574, + "step": 1942 + }, + { + "epoch": 2.54, + "learning_rate": 3.0627100980552133e-06, + "logits/chosen": -1.2440134286880493, + "logits/rejected": -1.2125873565673828, + "logps/chosen": -180.02392578125, + "logps/rejected": -265.74078369140625, + "loss": 0.0446, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.45184409618377686, + "rewards/margins": 8.746187210083008, + "rewards/rejected": -9.198031425476074, + "step": 1943 + }, + { + "epoch": 2.54, + "learning_rate": 3.045548752810687e-06, + "logits/chosen": -1.9536962509155273, + "logits/rejected": -2.002528429031372, + "logps/chosen": -150.48077392578125, + "logps/rejected": -231.3680419921875, + "loss": 0.1314, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.40758663415908813, + "rewards/margins": 7.598135471343994, + "rewards/rejected": -8.005722045898438, + "step": 1944 + }, + { + "epoch": 2.55, + "learning_rate": 3.028432503891801e-06, + "logits/chosen": -1.6654802560806274, + "logits/rejected": -1.6798418760299683, + "logps/chosen": -204.07101440429688, + "logps/rejected": -289.9903564453125, + "loss": 0.0873, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5946987867355347, + "rewards/margins": 9.258700370788574, + "rewards/rejected": -9.853399276733398, + "step": 1945 + }, + { + "epoch": 2.55, + "learning_rate": 3.01136138645679e-06, + "logits/chosen": -1.3871749639511108, + "logits/rejected": -1.438191533088684, + "logps/chosen": -177.79562377929688, + "logps/rejected": -262.07098388671875, + "loss": 0.044, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.15326586365699768, + "rewards/margins": 9.831206321716309, + "rewards/rejected": -9.984471321105957, + "step": 1946 + }, + { + "epoch": 2.55, + "learning_rate": 2.9943354355711884e-06, + "logits/chosen": -1.8507202863693237, + "logits/rejected": -1.7794890403747559, + "logps/chosen": -166.72320556640625, + "logps/rejected": -255.70249938964844, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6053355932235718, + "rewards/margins": 10.309610366821289, + "rewards/rejected": -9.704275131225586, + "step": 1947 + }, + { + "epoch": 2.55, + "learning_rate": 2.9773546862077617e-06, + "logits/chosen": -1.6759676933288574, + "logits/rejected": -1.6692721843719482, + "logps/chosen": -174.26577758789062, + "logps/rejected": -271.256591796875, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.040123552083969116, + "rewards/margins": 9.965165138244629, + "rewards/rejected": -9.925040245056152, + "step": 1948 + }, + { + "epoch": 2.55, + "learning_rate": 2.960419173246437e-06, + "logits/chosen": -1.8298752307891846, + "logits/rejected": -1.816307544708252, + "logps/chosen": -158.45436096191406, + "logps/rejected": -248.50633239746094, + "loss": 0.0438, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.16295182704925537, + "rewards/margins": 9.929421424865723, + "rewards/rejected": -9.766469955444336, + "step": 1949 + }, + { + "epoch": 2.55, + "learning_rate": 2.9435289314742015e-06, + "logits/chosen": -1.9003292322158813, + "logits/rejected": -1.9631367921829224, + "logps/chosen": -162.85433959960938, + "logps/rejected": -294.857177734375, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1328267604112625, + "rewards/margins": 11.211357116699219, + "rewards/rejected": -11.344182968139648, + "step": 1950 + }, + { + "epoch": 2.55, + "learning_rate": 2.926683995585053e-06, + "logits/chosen": -1.5471967458724976, + "logits/rejected": -1.5688446760177612, + "logps/chosen": -158.8806610107422, + "logps/rejected": -265.52008056640625, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1667221337556839, + "rewards/margins": 10.557982444763184, + "rewards/rejected": -10.72470474243164, + "step": 1951 + }, + { + "epoch": 2.55, + "learning_rate": 2.9098844001799407e-06, + "logits/chosen": -1.747592568397522, + "logits/rejected": -1.750756025314331, + "logps/chosen": -185.88575744628906, + "logps/rejected": -293.0629577636719, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4671233892440796, + "rewards/margins": 11.061368942260742, + "rewards/rejected": -10.594245910644531, + "step": 1952 + }, + { + "epoch": 2.56, + "learning_rate": 2.8931301797666844e-06, + "logits/chosen": -1.9031234979629517, + "logits/rejected": -1.9144034385681152, + "logps/chosen": -190.86790466308594, + "logps/rejected": -281.02984619140625, + "loss": 0.0438, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.13364754617214203, + "rewards/margins": 9.8737211227417, + "rewards/rejected": -10.007369041442871, + "step": 1953 + }, + { + "epoch": 2.56, + "learning_rate": 2.8764213687598713e-06, + "logits/chosen": -1.6918234825134277, + "logits/rejected": -1.7215158939361572, + "logps/chosen": -180.20265197753906, + "logps/rejected": -259.58319091796875, + "loss": 0.1306, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.2702764868736267, + "rewards/margins": 8.5771484375, + "rewards/rejected": -8.847424507141113, + "step": 1954 + }, + { + "epoch": 2.56, + "learning_rate": 2.85975800148085e-06, + "logits/chosen": -1.5428351163864136, + "logits/rejected": -1.5541678667068481, + "logps/chosen": -188.09185791015625, + "logps/rejected": -324.4872741699219, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.24225345253944397, + "rewards/margins": 12.110486030578613, + "rewards/rejected": -11.868233680725098, + "step": 1955 + }, + { + "epoch": 2.56, + "learning_rate": 2.843140112157594e-06, + "logits/chosen": -1.6897979974746704, + "logits/rejected": -1.7243423461914062, + "logps/chosen": -171.72552490234375, + "logps/rejected": -272.5803527832031, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13817450404167175, + "rewards/margins": 11.09737491607666, + "rewards/rejected": -11.235549926757812, + "step": 1956 + }, + { + "epoch": 2.56, + "learning_rate": 2.8265677349246735e-06, + "logits/chosen": -1.8125051259994507, + "logits/rejected": -1.7779297828674316, + "logps/chosen": -215.3422393798828, + "logps/rejected": -298.55609130859375, + "loss": 0.0442, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8671368360519409, + "rewards/margins": 9.761358261108398, + "rewards/rejected": -10.628495216369629, + "step": 1957 + }, + { + "epoch": 2.56, + "learning_rate": 2.8100409038231746e-06, + "logits/chosen": -1.8005623817443848, + "logits/rejected": -1.7292400598526, + "logps/chosen": -219.92709350585938, + "logps/rejected": -309.89111328125, + "loss": 0.087, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22227677702903748, + "rewards/margins": 9.28552532196045, + "rewards/rejected": -9.063248634338379, + "step": 1958 + }, + { + "epoch": 2.56, + "learning_rate": 2.793559652800631e-06, + "logits/chosen": -1.726702332496643, + "logits/rejected": -1.812628984451294, + "logps/chosen": -168.04141235351562, + "logps/rejected": -364.46893310546875, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9757401943206787, + "rewards/margins": 16.0539608001709, + "rewards/rejected": -15.078222274780273, + "step": 1959 + }, + { + "epoch": 2.57, + "learning_rate": 2.7771240157109355e-06, + "logits/chosen": -1.4083255529403687, + "logits/rejected": -1.508378028869629, + "logps/chosen": -155.93951416015625, + "logps/rejected": -284.63372802734375, + "loss": 0.0159, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5155107975006104, + "rewards/margins": 9.443857192993164, + "rewards/rejected": -9.959368705749512, + "step": 1960 + }, + { + "epoch": 2.57, + "learning_rate": 2.7607340263143073e-06, + "logits/chosen": -1.8752855062484741, + "logits/rejected": -1.9348137378692627, + "logps/chosen": -153.65895080566406, + "logps/rejected": -250.3507843017578, + "loss": 0.0876, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3873172700405121, + "rewards/margins": 9.81559944152832, + "rewards/rejected": -10.202917098999023, + "step": 1961 + }, + { + "epoch": 2.57, + "learning_rate": 2.7443897182771794e-06, + "logits/chosen": -1.6465787887573242, + "logits/rejected": -1.674967885017395, + "logps/chosen": -198.01809692382812, + "logps/rejected": -266.2185363769531, + "loss": 0.0869, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18833279609680176, + "rewards/margins": 9.031142234802246, + "rewards/rejected": -9.219474792480469, + "step": 1962 + }, + { + "epoch": 2.57, + "learning_rate": 2.7280911251721748e-06, + "logits/chosen": -1.6990039348602295, + "logits/rejected": -1.6893970966339111, + "logps/chosen": -169.86581420898438, + "logps/rejected": -263.7157897949219, + "loss": 0.0439, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.36748600006103516, + "rewards/margins": 10.012055397033691, + "rewards/rejected": -9.644569396972656, + "step": 1963 + }, + { + "epoch": 2.57, + "learning_rate": 2.711838280477988e-06, + "logits/chosen": -1.924182653427124, + "logits/rejected": -1.8527957201004028, + "logps/chosen": -188.111572265625, + "logps/rejected": -287.9988098144531, + "loss": 0.0439, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.28927695751190186, + "rewards/margins": 12.059555053710938, + "rewards/rejected": -11.770277976989746, + "step": 1964 + }, + { + "epoch": 2.57, + "learning_rate": 2.6956312175793613e-06, + "logits/chosen": -1.5777390003204346, + "logits/rejected": -1.5506999492645264, + "logps/chosen": -187.0686798095703, + "logps/rejected": -323.84222412109375, + "loss": 0.0434, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.8692817687988281, + "rewards/margins": 13.49492359161377, + "rewards/rejected": -12.625641822814941, + "step": 1965 + }, + { + "epoch": 2.57, + "learning_rate": 2.679469969767001e-06, + "logits/chosen": -1.6803758144378662, + "logits/rejected": -1.680100440979004, + "logps/chosen": -177.32545471191406, + "logps/rejected": -302.4091491699219, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.01150105893611908, + "rewards/margins": 11.221701622009277, + "rewards/rejected": -11.210201263427734, + "step": 1966 + }, + { + "epoch": 2.57, + "learning_rate": 2.663354570237481e-06, + "logits/chosen": -1.8851364850997925, + "logits/rejected": -1.9446914196014404, + "logps/chosen": -201.47381591796875, + "logps/rejected": -293.5780944824219, + "loss": 0.1749, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5440109372138977, + "rewards/margins": 8.664443969726562, + "rewards/rejected": -9.208455085754395, + "step": 1967 + }, + { + "epoch": 2.58, + "learning_rate": 2.647285052093218e-06, + "logits/chosen": -1.740563154220581, + "logits/rejected": -1.7548713684082031, + "logps/chosen": -167.3013153076172, + "logps/rejected": -271.1091003417969, + "loss": 0.0439, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.23233649134635925, + "rewards/margins": 9.510126113891602, + "rewards/rejected": -9.742464065551758, + "step": 1968 + }, + { + "epoch": 2.58, + "learning_rate": 2.631261448342387e-06, + "logits/chosen": -1.753013253211975, + "logits/rejected": -1.763864517211914, + "logps/chosen": -173.18917846679688, + "logps/rejected": -293.9700622558594, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2711889147758484, + "rewards/margins": 10.491897583007812, + "rewards/rejected": -10.76308536529541, + "step": 1969 + }, + { + "epoch": 2.58, + "learning_rate": 2.615283791898837e-06, + "logits/chosen": -1.6171423196792603, + "logits/rejected": -1.567841649055481, + "logps/chosen": -202.53741455078125, + "logps/rejected": -282.03570556640625, + "loss": 0.0934, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6379486918449402, + "rewards/margins": 9.356935501098633, + "rewards/rejected": -9.994885444641113, + "step": 1970 + }, + { + "epoch": 2.58, + "learning_rate": 2.599352115582046e-06, + "logits/chosen": -1.6400189399719238, + "logits/rejected": -1.6987359523773193, + "logps/chosen": -176.38868713378906, + "logps/rejected": -308.7548828125, + "loss": 0.0876, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.37076646089553833, + "rewards/margins": 8.949944496154785, + "rewards/rejected": -9.320711135864258, + "step": 1971 + }, + { + "epoch": 2.58, + "learning_rate": 2.5834664521170504e-06, + "logits/chosen": -1.886258602142334, + "logits/rejected": -1.8440604209899902, + "logps/chosen": -159.1253662109375, + "logps/rejected": -268.2269592285156, + "loss": 0.087, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.5194556713104248, + "rewards/margins": 10.651453018188477, + "rewards/rejected": -10.131998062133789, + "step": 1972 + }, + { + "epoch": 2.58, + "learning_rate": 2.5676268341343622e-06, + "logits/chosen": -1.7307236194610596, + "logits/rejected": -1.6705318689346313, + "logps/chosen": -205.23074340820312, + "logps/rejected": -291.70751953125, + "loss": 0.1302, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3530801236629486, + "rewards/margins": 8.858304023742676, + "rewards/rejected": -9.211383819580078, + "step": 1973 + }, + { + "epoch": 2.58, + "learning_rate": 2.5518332941699056e-06, + "logits/chosen": -1.7572276592254639, + "logits/rejected": -1.7965366840362549, + "logps/chosen": -174.769287109375, + "logps/rejected": -305.6276550292969, + "loss": 0.048, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2510526478290558, + "rewards/margins": 11.792522430419922, + "rewards/rejected": -11.54146957397461, + "step": 1974 + }, + { + "epoch": 2.58, + "learning_rate": 2.5360858646649722e-06, + "logits/chosen": -1.7264833450317383, + "logits/rejected": -1.7860783338546753, + "logps/chosen": -168.62179565429688, + "logps/rejected": -256.6726379394531, + "loss": 0.0484, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.6712649464607239, + "rewards/margins": 9.836893081665039, + "rewards/rejected": -9.165628433227539, + "step": 1975 + }, + { + "epoch": 2.59, + "learning_rate": 2.520384577966142e-06, + "logits/chosen": -1.859262466430664, + "logits/rejected": -1.881242036819458, + "logps/chosen": -182.73577880859375, + "logps/rejected": -301.6014099121094, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37121614813804626, + "rewards/margins": 11.816190719604492, + "rewards/rejected": -12.187408447265625, + "step": 1976 + }, + { + "epoch": 2.59, + "learning_rate": 2.5047294663251953e-06, + "logits/chosen": -1.7811390161514282, + "logits/rejected": -1.6868927478790283, + "logps/chosen": -219.08047485351562, + "logps/rejected": -366.65789794921875, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.34529489278793335, + "rewards/margins": 12.350276947021484, + "rewards/rejected": -12.00498104095459, + "step": 1977 + }, + { + "epoch": 2.59, + "learning_rate": 2.4891205618990666e-06, + "logits/chosen": -1.626390814781189, + "logits/rejected": -1.6346192359924316, + "logps/chosen": -173.81222534179688, + "logps/rejected": -279.78448486328125, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7116873264312744, + "rewards/margins": 11.487451553344727, + "rewards/rejected": -10.775765419006348, + "step": 1978 + }, + { + "epoch": 2.59, + "learning_rate": 2.4735578967497953e-06, + "logits/chosen": -1.9429997205734253, + "logits/rejected": -1.9693106412887573, + "logps/chosen": -152.92579650878906, + "logps/rejected": -254.26124572753906, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14168325066566467, + "rewards/margins": 9.659777641296387, + "rewards/rejected": -9.51809310913086, + "step": 1979 + }, + { + "epoch": 2.59, + "learning_rate": 2.4580415028444326e-06, + "logits/chosen": -2.038975238800049, + "logits/rejected": -2.078749418258667, + "logps/chosen": -149.88754272460938, + "logps/rejected": -266.1400146484375, + "loss": 0.0871, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.11362949013710022, + "rewards/margins": 10.078556060791016, + "rewards/rejected": -9.964925765991211, + "step": 1980 + }, + { + "epoch": 2.59, + "learning_rate": 2.4425714120549726e-06, + "logits/chosen": -1.8182258605957031, + "logits/rejected": -1.8650504350662231, + "logps/chosen": -149.2694091796875, + "logps/rejected": -311.80682373046875, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5802469849586487, + "rewards/margins": 13.599267959594727, + "rewards/rejected": -13.019020080566406, + "step": 1981 + }, + { + "epoch": 2.59, + "learning_rate": 2.42714765615831e-06, + "logits/chosen": -1.654744029045105, + "logits/rejected": -1.753852128982544, + "logps/chosen": -160.8245086669922, + "logps/rejected": -293.518798828125, + "loss": 0.0487, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2480771243572235, + "rewards/margins": 11.549545288085938, + "rewards/rejected": -11.30146598815918, + "step": 1982 + }, + { + "epoch": 2.6, + "learning_rate": 2.4117702668361777e-06, + "logits/chosen": -1.761013150215149, + "logits/rejected": -1.774271011352539, + "logps/chosen": -179.68899536132812, + "logps/rejected": -328.4815673828125, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.621519923210144, + "rewards/margins": 11.40289306640625, + "rewards/rejected": -12.024412155151367, + "step": 1983 + }, + { + "epoch": 2.6, + "learning_rate": 2.3964392756750276e-06, + "logits/chosen": -1.8555997610092163, + "logits/rejected": -1.8255681991577148, + "logps/chosen": -221.62515258789062, + "logps/rejected": -317.10791015625, + "loss": 0.1305, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.26080620288848877, + "rewards/margins": 8.760135650634766, + "rewards/rejected": -9.020940780639648, + "step": 1984 + }, + { + "epoch": 2.6, + "learning_rate": 2.381154714166045e-06, + "logits/chosen": -1.7783267498016357, + "logits/rejected": -1.793891191482544, + "logps/chosen": -182.63229370117188, + "logps/rejected": -275.4841613769531, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7096694111824036, + "rewards/margins": 9.934755325317383, + "rewards/rejected": -10.644424438476562, + "step": 1985 + }, + { + "epoch": 2.6, + "learning_rate": 2.3659166137050297e-06, + "logits/chosen": -1.6395975351333618, + "logits/rejected": -1.6640281677246094, + "logps/chosen": -199.27072143554688, + "logps/rejected": -353.10137939453125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5331687331199646, + "rewards/margins": 13.085333824157715, + "rewards/rejected": -13.61850357055664, + "step": 1986 + }, + { + "epoch": 2.6, + "learning_rate": 2.3507250055923384e-06, + "logits/chosen": -2.0247952938079834, + "logits/rejected": -2.002014636993408, + "logps/chosen": -151.59063720703125, + "logps/rejected": -256.08233642578125, + "loss": 0.1304, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1337849497795105, + "rewards/margins": 9.709541320800781, + "rewards/rejected": -9.575756072998047, + "step": 1987 + }, + { + "epoch": 2.6, + "learning_rate": 2.335579921032849e-06, + "logits/chosen": -1.8732693195343018, + "logits/rejected": -1.9024027585983276, + "logps/chosen": -181.53271484375, + "logps/rejected": -284.82000732421875, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05724672973155975, + "rewards/margins": 10.290583610534668, + "rewards/rejected": -10.23333740234375, + "step": 1988 + }, + { + "epoch": 2.6, + "learning_rate": 2.3204813911358535e-06, + "logits/chosen": -1.6087309122085571, + "logits/rejected": -1.5883159637451172, + "logps/chosen": -172.81671142578125, + "logps/rejected": -260.9825744628906, + "loss": 0.0444, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6146831512451172, + "rewards/margins": 9.345158576965332, + "rewards/rejected": -9.959840774536133, + "step": 1989 + }, + { + "epoch": 2.6, + "learning_rate": 2.305429446915036e-06, + "logits/chosen": -1.738578200340271, + "logits/rejected": -1.7823458909988403, + "logps/chosen": -178.59544372558594, + "logps/rejected": -314.980224609375, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.46294763684272766, + "rewards/margins": 12.770978927612305, + "rewards/rejected": -12.308032035827637, + "step": 1990 + }, + { + "epoch": 2.61, + "learning_rate": 2.2904241192883703e-06, + "logits/chosen": -1.4852935075759888, + "logits/rejected": -1.5319143533706665, + "logps/chosen": -169.06687927246094, + "logps/rejected": -307.6646728515625, + "loss": 0.0437, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4389601945877075, + "rewards/margins": 11.484118461608887, + "rewards/rejected": -11.923078536987305, + "step": 1991 + }, + { + "epoch": 2.61, + "learning_rate": 2.2754654390780924e-06, + "logits/chosen": -1.8649027347564697, + "logits/rejected": -1.944092869758606, + "logps/chosen": -195.75340270996094, + "logps/rejected": -284.3547058105469, + "loss": 0.0439, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.40997207164764404, + "rewards/margins": 10.181391716003418, + "rewards/rejected": -9.771418571472168, + "step": 1992 + }, + { + "epoch": 2.61, + "learning_rate": 2.260553437010621e-06, + "logits/chosen": -1.876404047012329, + "logits/rejected": -1.9384878873825073, + "logps/chosen": -186.94473266601562, + "logps/rejected": -306.932861328125, + "loss": 0.0438, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7158739566802979, + "rewards/margins": 11.20254898071289, + "rewards/rejected": -11.91842269897461, + "step": 1993 + }, + { + "epoch": 2.61, + "learning_rate": 2.245688143716476e-06, + "logits/chosen": -1.7049798965454102, + "logits/rejected": -1.7042391300201416, + "logps/chosen": -185.9993896484375, + "logps/rejected": -272.52740478515625, + "loss": 0.0439, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.53288334608078, + "rewards/margins": 11.466306686401367, + "rewards/rejected": -10.933424949645996, + "step": 1994 + }, + { + "epoch": 2.61, + "learning_rate": 2.2308695897302472e-06, + "logits/chosen": -1.8967386484146118, + "logits/rejected": -1.8946254253387451, + "logps/chosen": -153.57508850097656, + "logps/rejected": -252.23887634277344, + "loss": 0.0877, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.28578105568885803, + "rewards/margins": 9.016427993774414, + "rewards/rejected": -8.730647087097168, + "step": 1995 + }, + { + "epoch": 2.61, + "learning_rate": 2.216097805490516e-06, + "logits/chosen": -1.7306807041168213, + "logits/rejected": -1.6733770370483398, + "logps/chosen": -176.50511169433594, + "logps/rejected": -264.79034423828125, + "loss": 0.0446, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.08303779363632202, + "rewards/margins": 10.34005069732666, + "rewards/rejected": -10.257013320922852, + "step": 1996 + }, + { + "epoch": 2.61, + "learning_rate": 2.2013728213398006e-06, + "logits/chosen": -1.7143869400024414, + "logits/rejected": -1.7067701816558838, + "logps/chosen": -175.7965545654297, + "logps/rejected": -314.23980712890625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18513381481170654, + "rewards/margins": 11.556571960449219, + "rewards/rejected": -11.741704940795898, + "step": 1997 + }, + { + "epoch": 2.61, + "learning_rate": 2.1866946675244692e-06, + "logits/chosen": -1.9566376209259033, + "logits/rejected": -1.8643081188201904, + "logps/chosen": -180.88778686523438, + "logps/rejected": -327.32513427734375, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1013829708099365, + "rewards/margins": 14.414881706237793, + "rewards/rejected": -13.313498497009277, + "step": 1998 + }, + { + "epoch": 2.62, + "learning_rate": 2.1720633741947187e-06, + "logits/chosen": -1.9128525257110596, + "logits/rejected": -1.901835322380066, + "logps/chosen": -165.06918334960938, + "logps/rejected": -273.7917785644531, + "loss": 0.0593, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.46890270709991455, + "rewards/margins": 11.430022239685059, + "rewards/rejected": -10.961119651794434, + "step": 1999 + }, + { + "epoch": 2.62, + "learning_rate": 2.157478971404478e-06, + "logits/chosen": -1.9193739891052246, + "logits/rejected": -1.8429605960845947, + "logps/chosen": -202.1808624267578, + "logps/rejected": -294.02044677734375, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.518018901348114, + "rewards/margins": 11.590211868286133, + "rewards/rejected": -12.108230590820312, + "step": 2000 + } + ], + "logging_steps": 1, + "max_steps": 2292, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}