diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4150 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 100, + "global_step": 2907, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 1.7182130584192438e-09, + "logits/chosen": -1.8825098276138306, + "logits/rejected": -1.6692813634872437, + "logps/chosen": -107.98798370361328, + "logps/rejected": -99.48463439941406, + "loss": 1.0, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.01, + "learning_rate": 1.718213058419244e-08, + "logits/chosen": -1.912903904914856, + "logits/rejected": -1.679023265838623, + "logps/chosen": -232.9512481689453, + "logps/rejected": -205.12588500976562, + "loss": 1.0008, + "rewards/accuracies": 0.4861111044883728, + "rewards/chosen": 0.004386746324598789, + "rewards/margins": 0.0018502763705328107, + "rewards/rejected": 0.0025364691391587257, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 3.436426116838488e-08, + "logits/chosen": -2.007096529006958, + "logits/rejected": -1.9176105260849, + "logps/chosen": -270.76263427734375, + "logps/rejected": -241.11474609375, + "loss": 0.9998, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.013296608813107014, + "rewards/margins": 0.011036807671189308, + "rewards/rejected": 0.0022598025389015675, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 5.154639175257731e-08, + "logits/chosen": -1.996591329574585, + "logits/rejected": -1.9179508686065674, + "logps/chosen": -264.7063903808594, + "logps/rejected": -223.7097930908203, + "loss": 1.0021, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0007220959523692727, + "rewards/margins": 0.004284311085939407, + "rewards/rejected": -0.0050064073875546455, + "step": 30 + }, + { + "epoch": 0.04, + "learning_rate": 6.872852233676976e-08, + "logits/chosen": -2.0471560955047607, + "logits/rejected": -1.9881904125213623, + "logps/chosen": -309.6553955078125, + "logps/rejected": -272.2703552246094, + "loss": 0.9983, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.0036458049435168505, + "rewards/margins": 0.0004331391828600317, + "rewards/rejected": 0.003212666604667902, + "step": 40 + }, + { + "epoch": 0.05, + "learning_rate": 8.59106529209622e-08, + "logits/chosen": -1.822840929031372, + "logits/rejected": -1.8331642150878906, + "logps/chosen": -333.86126708984375, + "logps/rejected": -206.30355834960938, + "loss": 1.0007, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.000272188161034137, + "rewards/margins": 0.007083783857524395, + "rewards/rejected": -0.006811595521867275, + "step": 50 + }, + { + "epoch": 0.06, + "learning_rate": 1.0309278350515462e-07, + "logits/chosen": -1.7849409580230713, + "logits/rejected": -1.8800642490386963, + "logps/chosen": -249.81332397460938, + "logps/rejected": -225.6211395263672, + "loss": 1.0016, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.005839993245899677, + "rewards/margins": -0.003313907189294696, + "rewards/rejected": -0.0025260853581130505, + "step": 60 + }, + { + "epoch": 0.07, + "learning_rate": 1.202749140893471e-07, + "logits/chosen": -1.9829210042953491, + "logits/rejected": -1.818284034729004, + "logps/chosen": -328.28521728515625, + "logps/rejected": -236.7754364013672, + "loss": 0.9958, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.002883409382775426, + "rewards/margins": 0.003594732377678156, + "rewards/rejected": -0.0007113233441486955, + "step": 70 + }, + { + "epoch": 0.08, + "learning_rate": 1.3745704467353952e-07, + "logits/chosen": -1.973283052444458, + "logits/rejected": -1.9369332790374756, + "logps/chosen": -258.3551940917969, + "logps/rejected": -212.8301544189453, + "loss": 0.9948, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.0014923412818461657, + "rewards/margins": 0.004948171321302652, + "rewards/rejected": -0.0034558300394564867, + "step": 80 + }, + { + "epoch": 0.09, + "learning_rate": 1.5463917525773197e-07, + "logits/chosen": -1.9635775089263916, + "logits/rejected": -1.8522634506225586, + "logps/chosen": -256.3603820800781, + "logps/rejected": -208.6153106689453, + "loss": 0.9963, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.003207797883078456, + "rewards/margins": 0.00904668215662241, + "rewards/rejected": -0.01225447840988636, + "step": 90 + }, + { + "epoch": 0.1, + "learning_rate": 1.718213058419244e-07, + "logits/chosen": -1.7430416345596313, + "logits/rejected": -1.8678181171417236, + "logps/chosen": -271.9082946777344, + "logps/rejected": -195.18521118164062, + "loss": 0.9982, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.00269494135864079, + "rewards/margins": 0.0009464768809266388, + "rewards/rejected": 0.0017484650015830994, + "step": 100 + }, + { + "epoch": 0.11, + "learning_rate": 1.8900343642611682e-07, + "logits/chosen": -1.938391089439392, + "logits/rejected": -1.8031425476074219, + "logps/chosen": -250.1339874267578, + "logps/rejected": -235.4950714111328, + "loss": 0.9934, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.004927259869873524, + "rewards/margins": 0.009144905023276806, + "rewards/rejected": -0.004217647016048431, + "step": 110 + }, + { + "epoch": 0.12, + "learning_rate": 2.0618556701030925e-07, + "logits/chosen": -1.836554765701294, + "logits/rejected": -1.8783756494522095, + "logps/chosen": -319.69873046875, + "logps/rejected": -234.7427215576172, + "loss": 0.9934, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.002145239384844899, + "rewards/margins": 0.012560705654323101, + "rewards/rejected": -0.010415466502308846, + "step": 120 + }, + { + "epoch": 0.13, + "learning_rate": 2.2336769759450173e-07, + "logits/chosen": -1.9726636409759521, + "logits/rejected": -1.9809471368789673, + "logps/chosen": -283.53985595703125, + "logps/rejected": -239.5770721435547, + "loss": 0.9926, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.012158048339188099, + "rewards/margins": 0.019386615604162216, + "rewards/rejected": -0.007228570524603128, + "step": 130 + }, + { + "epoch": 0.14, + "learning_rate": 2.405498281786942e-07, + "logits/chosen": -2.024657726287842, + "logits/rejected": -1.880409836769104, + "logps/chosen": -269.13629150390625, + "logps/rejected": -213.2104949951172, + "loss": 0.9939, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.0044730305671691895, + "rewards/margins": 0.00196995516307652, + "rewards/rejected": 0.0025030761025846004, + "step": 140 + }, + { + "epoch": 0.15, + "learning_rate": 2.5773195876288655e-07, + "logits/chosen": -1.8670217990875244, + "logits/rejected": -2.0206334590911865, + "logps/chosen": -268.1510925292969, + "logps/rejected": -225.56527709960938, + "loss": 0.9891, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.011090939864516258, + "rewards/margins": 0.014944592490792274, + "rewards/rejected": -0.0038536519277840853, + "step": 150 + }, + { + "epoch": 0.17, + "learning_rate": 2.7491408934707903e-07, + "logits/chosen": -2.1929473876953125, + "logits/rejected": -2.0768256187438965, + "logps/chosen": -311.89215087890625, + "logps/rejected": -241.4412078857422, + "loss": 0.9796, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.014715956524014473, + "rewards/margins": 0.023210834711790085, + "rewards/rejected": -0.008494878187775612, + "step": 160 + }, + { + "epoch": 0.18, + "learning_rate": 2.9209621993127146e-07, + "logits/chosen": -2.0199875831604004, + "logits/rejected": -2.028744697570801, + "logps/chosen": -270.58197021484375, + "logps/rejected": -221.40823364257812, + "loss": 0.9859, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.008927601389586926, + "rewards/margins": 0.004688750021159649, + "rewards/rejected": 0.004238851368427277, + "step": 170 + }, + { + "epoch": 0.19, + "learning_rate": 3.0927835051546394e-07, + "logits/chosen": -1.9151681661605835, + "logits/rejected": -1.9035532474517822, + "logps/chosen": -215.2420196533203, + "logps/rejected": -195.22682189941406, + "loss": 0.9779, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.010352599434554577, + "rewards/margins": 0.02298773266375065, + "rewards/rejected": -0.012635131366550922, + "step": 180 + }, + { + "epoch": 0.2, + "learning_rate": 3.2646048109965636e-07, + "logits/chosen": -1.9542953968048096, + "logits/rejected": -1.8753105401992798, + "logps/chosen": -266.09234619140625, + "logps/rejected": -196.09292602539062, + "loss": 0.9728, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.01909886673092842, + "rewards/margins": 0.028360243886709213, + "rewards/rejected": -0.009261379018425941, + "step": 190 + }, + { + "epoch": 0.21, + "learning_rate": 3.436426116838488e-07, + "logits/chosen": -1.8358113765716553, + "logits/rejected": -1.9175758361816406, + "logps/chosen": -221.7442169189453, + "logps/rejected": -154.5623016357422, + "loss": 0.9761, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.018382752314209938, + "rewards/margins": 0.026139695197343826, + "rewards/rejected": -0.007756945677101612, + "step": 200 + }, + { + "epoch": 0.22, + "learning_rate": 3.608247422680412e-07, + "logits/chosen": -1.9191606044769287, + "logits/rejected": -1.90776789188385, + "logps/chosen": -299.2118225097656, + "logps/rejected": -205.9092559814453, + "loss": 0.9656, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.016829822212457657, + "rewards/margins": 0.03627743944525719, + "rewards/rejected": -0.01944761723279953, + "step": 210 + }, + { + "epoch": 0.23, + "learning_rate": 3.7800687285223364e-07, + "logits/chosen": -1.827575445175171, + "logits/rejected": -1.7433605194091797, + "logps/chosen": -220.52767944335938, + "logps/rejected": -210.1706085205078, + "loss": 0.9578, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.00790109671652317, + "rewards/margins": 0.03705426678061485, + "rewards/rejected": -0.029153168201446533, + "step": 220 + }, + { + "epoch": 0.24, + "learning_rate": 3.9518900343642607e-07, + "logits/chosen": -1.8479468822479248, + "logits/rejected": -1.7774213552474976, + "logps/chosen": -235.8931427001953, + "logps/rejected": -207.35610961914062, + "loss": 0.9595, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.01835930533707142, + "rewards/margins": 0.04197651147842407, + "rewards/rejected": -0.023617204278707504, + "step": 230 + }, + { + "epoch": 0.25, + "learning_rate": 4.123711340206185e-07, + "logits/chosen": -1.9261528253555298, + "logits/rejected": -1.9167568683624268, + "logps/chosen": -313.67254638671875, + "logps/rejected": -203.87185668945312, + "loss": 0.9514, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.016246426850557327, + "rewards/margins": 0.03881000727415085, + "rewards/rejected": -0.022563578560948372, + "step": 240 + }, + { + "epoch": 0.26, + "learning_rate": 4.2955326460481097e-07, + "logits/chosen": -1.9955661296844482, + "logits/rejected": -1.906306266784668, + "logps/chosen": -255.4471435546875, + "logps/rejected": -228.58023071289062, + "loss": 0.945, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.02427186816930771, + "rewards/margins": 0.06289727985858917, + "rewards/rejected": -0.03862541541457176, + "step": 250 + }, + { + "epoch": 0.27, + "learning_rate": 4.4673539518900345e-07, + "logits/chosen": -1.961930274963379, + "logits/rejected": -1.925752878189087, + "logps/chosen": -267.53271484375, + "logps/rejected": -198.67776489257812, + "loss": 0.935, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.03963503614068031, + "rewards/margins": 0.08463665097951889, + "rewards/rejected": -0.045001622289419174, + "step": 260 + }, + { + "epoch": 0.28, + "learning_rate": 4.639175257731959e-07, + "logits/chosen": -2.0039916038513184, + "logits/rejected": -2.0741240978240967, + "logps/chosen": -292.31549072265625, + "logps/rejected": -226.5825958251953, + "loss": 0.9226, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.00682613393291831, + "rewards/margins": 0.052050817757844925, + "rewards/rejected": -0.04522468149662018, + "step": 270 + }, + { + "epoch": 0.29, + "learning_rate": 4.810996563573884e-07, + "logits/chosen": -1.9262123107910156, + "logits/rejected": -1.8254365921020508, + "logps/chosen": -271.57513427734375, + "logps/rejected": -218.11032104492188, + "loss": 0.9163, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.05085228011012077, + "rewards/margins": 0.0943886935710907, + "rewards/rejected": -0.04353641718626022, + "step": 280 + }, + { + "epoch": 0.3, + "learning_rate": 4.982817869415807e-07, + "logits/chosen": -2.0598292350769043, + "logits/rejected": -1.8968032598495483, + "logps/chosen": -271.613037109375, + "logps/rejected": -210.1744384765625, + "loss": 0.9097, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.0635281652212143, + "rewards/margins": 0.12082493305206299, + "rewards/rejected": -0.0572967603802681, + "step": 290 + }, + { + "epoch": 0.31, + "learning_rate": 4.982798165137615e-07, + "logits/chosen": -1.8396161794662476, + "logits/rejected": -1.779496431350708, + "logps/chosen": -227.2111358642578, + "logps/rejected": -224.27841186523438, + "loss": 0.8899, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.007174974773079157, + "rewards/margins": 0.07386655360460281, + "rewards/rejected": -0.06669158488512039, + "step": 300 + }, + { + "epoch": 0.32, + "learning_rate": 4.963685015290519e-07, + "logits/chosen": -1.9974682331085205, + "logits/rejected": -2.1091110706329346, + "logps/chosen": -316.4252014160156, + "logps/rejected": -260.0753173828125, + "loss": 0.8829, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.04557584971189499, + "rewards/margins": 0.1289386749267578, + "rewards/rejected": -0.08336281031370163, + "step": 310 + }, + { + "epoch": 0.33, + "learning_rate": 4.944571865443424e-07, + "logits/chosen": -1.846778154373169, + "logits/rejected": -1.7403427362442017, + "logps/chosen": -248.9180450439453, + "logps/rejected": -179.14321899414062, + "loss": 0.8495, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.02153083309531212, + "rewards/margins": 0.15574803948402405, + "rewards/rejected": -0.13421721756458282, + "step": 320 + }, + { + "epoch": 0.34, + "learning_rate": 4.92545871559633e-07, + "logits/chosen": -1.9045463800430298, + "logits/rejected": -1.9483264684677124, + "logps/chosen": -325.5777282714844, + "logps/rejected": -232.71499633789062, + "loss": 0.8641, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.03281577304005623, + "rewards/margins": 0.12244097143411636, + "rewards/rejected": -0.08962519466876984, + "step": 330 + }, + { + "epoch": 0.35, + "learning_rate": 4.906345565749235e-07, + "logits/chosen": -1.721255898475647, + "logits/rejected": -1.6868213415145874, + "logps/chosen": -236.89242553710938, + "logps/rejected": -216.6375732421875, + "loss": 0.8565, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.004082153085619211, + "rewards/margins": 0.11858376115560532, + "rewards/rejected": -0.11450158059597015, + "step": 340 + }, + { + "epoch": 0.36, + "learning_rate": 4.88723241590214e-07, + "logits/chosen": -1.8765642642974854, + "logits/rejected": -1.8209621906280518, + "logps/chosen": -297.5986328125, + "logps/rejected": -230.46334838867188, + "loss": 0.8171, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.040931276977062225, + "rewards/margins": 0.19416043162345886, + "rewards/rejected": -0.15322914719581604, + "step": 350 + }, + { + "epoch": 0.37, + "learning_rate": 4.868119266055046e-07, + "logits/chosen": -1.9383245706558228, + "logits/rejected": -1.9298241138458252, + "logps/chosen": -283.68145751953125, + "logps/rejected": -255.94949340820312, + "loss": 0.7955, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.003488479647785425, + "rewards/margins": 0.1996344029903412, + "rewards/rejected": -0.20312288403511047, + "step": 360 + }, + { + "epoch": 0.38, + "learning_rate": 4.849006116207951e-07, + "logits/chosen": -2.060605764389038, + "logits/rejected": -1.9366719722747803, + "logps/chosen": -284.1970520019531, + "logps/rejected": -244.7628936767578, + "loss": 0.7797, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.06834305822849274, + "rewards/margins": 0.258323609828949, + "rewards/rejected": -0.18998056650161743, + "step": 370 + }, + { + "epoch": 0.39, + "learning_rate": 4.829892966360856e-07, + "logits/chosen": -1.9930108785629272, + "logits/rejected": -1.8641077280044556, + "logps/chosen": -305.69573974609375, + "logps/rejected": -261.52850341796875, + "loss": 0.8123, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0743982270359993, + "rewards/margins": 0.23559841513633728, + "rewards/rejected": -0.1612001657485962, + "step": 380 + }, + { + "epoch": 0.4, + "learning_rate": 4.810779816513762e-07, + "logits/chosen": -1.9459863901138306, + "logits/rejected": -1.8753139972686768, + "logps/chosen": -236.7461700439453, + "logps/rejected": -215.2508087158203, + "loss": 0.781, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.02323376014828682, + "rewards/margins": 0.22391769289970398, + "rewards/rejected": -0.2471514195203781, + "step": 390 + }, + { + "epoch": 0.41, + "learning_rate": 4.791666666666667e-07, + "logits/chosen": -1.850494623184204, + "logits/rejected": -1.8692734241485596, + "logps/chosen": -270.0162658691406, + "logps/rejected": -190.91690063476562, + "loss": 0.7595, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.03287973254919052, + "rewards/margins": 0.2878955900669098, + "rewards/rejected": -0.2550157904624939, + "step": 400 + }, + { + "epoch": 0.42, + "learning_rate": 4.772553516819572e-07, + "logits/chosen": -1.9037336111068726, + "logits/rejected": -1.889154076576233, + "logps/chosen": -283.4707336425781, + "logps/rejected": -263.4888000488281, + "loss": 0.7516, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.023483851924538612, + "rewards/margins": 0.3119986057281494, + "rewards/rejected": -0.2885147035121918, + "step": 410 + }, + { + "epoch": 0.43, + "learning_rate": 4.753440366972477e-07, + "logits/chosen": -2.035512924194336, + "logits/rejected": -1.9027979373931885, + "logps/chosen": -223.506103515625, + "logps/rejected": -222.5137176513672, + "loss": 0.7745, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.08041305840015411, + "rewards/margins": 0.13345971703529358, + "rewards/rejected": -0.2138727903366089, + "step": 420 + }, + { + "epoch": 0.44, + "learning_rate": 4.7343272171253825e-07, + "logits/chosen": -1.9109729528427124, + "logits/rejected": -1.9474281072616577, + "logps/chosen": -241.5768280029297, + "logps/rejected": -223.0868682861328, + "loss": 0.7334, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.05294427275657654, + "rewards/margins": 0.23036575317382812, + "rewards/rejected": -0.28331005573272705, + "step": 430 + }, + { + "epoch": 0.45, + "learning_rate": 4.715214067278288e-07, + "logits/chosen": -1.9270904064178467, + "logits/rejected": -1.9717504978179932, + "logps/chosen": -278.0482177734375, + "logps/rejected": -216.4290008544922, + "loss": 0.6853, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.031850665807724, + "rewards/margins": 0.4009469151496887, + "rewards/rejected": -0.43279749155044556, + "step": 440 + }, + { + "epoch": 0.46, + "learning_rate": 4.696100917431192e-07, + "logits/chosen": -1.8353502750396729, + "logits/rejected": -1.9710232019424438, + "logps/chosen": -314.343505859375, + "logps/rejected": -263.90374755859375, + "loss": 0.7716, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0073336451314389706, + "rewards/margins": 0.2296525537967682, + "rewards/rejected": -0.22231896221637726, + "step": 450 + }, + { + "epoch": 0.47, + "learning_rate": 4.6769877675840974e-07, + "logits/chosen": -1.8592478036880493, + "logits/rejected": -1.7888505458831787, + "logps/chosen": -253.55599975585938, + "logps/rejected": -232.0161895751953, + "loss": 0.6498, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.07120613008737564, + "rewards/margins": 0.2692238688468933, + "rewards/rejected": -0.34042999148368835, + "step": 460 + }, + { + "epoch": 0.49, + "learning_rate": 4.6578746177370027e-07, + "logits/chosen": -1.830445647239685, + "logits/rejected": -1.8560142517089844, + "logps/chosen": -196.4136962890625, + "logps/rejected": -182.5649871826172, + "loss": 0.6393, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.06338037550449371, + "rewards/margins": 0.29189833998680115, + "rewards/rejected": -0.35527873039245605, + "step": 470 + }, + { + "epoch": 0.5, + "learning_rate": 4.638761467889908e-07, + "logits/chosen": -1.887717604637146, + "logits/rejected": -1.8548129796981812, + "logps/chosen": -274.1175537109375, + "logps/rejected": -214.11209106445312, + "loss": 0.687, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.057955551892519, + "rewards/margins": 0.43419212102890015, + "rewards/rejected": -0.4921477437019348, + "step": 480 + }, + { + "epoch": 0.51, + "learning_rate": 4.6196483180428133e-07, + "logits/chosen": -1.9349689483642578, + "logits/rejected": -1.9116013050079346, + "logps/chosen": -269.4372863769531, + "logps/rejected": -212.83779907226562, + "loss": 0.6166, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.041172344237565994, + "rewards/margins": 0.3499607443809509, + "rewards/rejected": -0.39113301038742065, + "step": 490 + }, + { + "epoch": 0.52, + "learning_rate": 4.600535168195718e-07, + "logits/chosen": -1.8402849435806274, + "logits/rejected": -1.827770471572876, + "logps/chosen": -213.6937255859375, + "logps/rejected": -208.28866577148438, + "loss": 0.6061, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.18305625021457672, + "rewards/margins": 0.20005738735198975, + "rewards/rejected": -0.3831135928630829, + "step": 500 + }, + { + "epoch": 0.53, + "learning_rate": 4.5814220183486234e-07, + "logits/chosen": -1.8466438055038452, + "logits/rejected": -1.8012769222259521, + "logps/chosen": -292.2630310058594, + "logps/rejected": -263.00091552734375, + "loss": 0.5948, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.13879308104515076, + "rewards/margins": 0.31510522961616516, + "rewards/rejected": -0.45389825105667114, + "step": 510 + }, + { + "epoch": 0.54, + "learning_rate": 4.562308868501529e-07, + "logits/chosen": -1.909223198890686, + "logits/rejected": -1.7789217233657837, + "logps/chosen": -255.5054931640625, + "logps/rejected": -254.28439331054688, + "loss": 0.609, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.03209429234266281, + "rewards/margins": 0.5229288935661316, + "rewards/rejected": -0.555023193359375, + "step": 520 + }, + { + "epoch": 0.55, + "learning_rate": 4.543195718654434e-07, + "logits/chosen": -1.7467231750488281, + "logits/rejected": -1.8646119832992554, + "logps/chosen": -221.7431640625, + "logps/rejected": -200.1488494873047, + "loss": 0.6115, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2338542938232422, + "rewards/margins": 0.2427963763475418, + "rewards/rejected": -0.4766507148742676, + "step": 530 + }, + { + "epoch": 0.56, + "learning_rate": 4.5240825688073394e-07, + "logits/chosen": -1.8729002475738525, + "logits/rejected": -1.6796636581420898, + "logps/chosen": -280.7690734863281, + "logps/rejected": -226.85562133789062, + "loss": 0.5931, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08351187407970428, + "rewards/margins": 0.5537512898445129, + "rewards/rejected": -0.6372631788253784, + "step": 540 + }, + { + "epoch": 0.57, + "learning_rate": 4.504969418960244e-07, + "logits/chosen": -2.0048041343688965, + "logits/rejected": -1.637963056564331, + "logps/chosen": -254.12939453125, + "logps/rejected": -259.4322814941406, + "loss": 0.5974, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.12823496758937836, + "rewards/margins": 0.29864269495010376, + "rewards/rejected": -0.4268776476383209, + "step": 550 + }, + { + "epoch": 0.58, + "learning_rate": 4.4858562691131495e-07, + "logits/chosen": -2.012173652648926, + "logits/rejected": -1.9192218780517578, + "logps/chosen": -291.0408935546875, + "logps/rejected": -280.9718933105469, + "loss": 0.524, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.1138150691986084, + "rewards/margins": 0.5268052816390991, + "rewards/rejected": -0.6406203508377075, + "step": 560 + }, + { + "epoch": 0.59, + "learning_rate": 4.466743119266055e-07, + "logits/chosen": -1.9362258911132812, + "logits/rejected": -1.841904878616333, + "logps/chosen": -275.77545166015625, + "logps/rejected": -239.5581817626953, + "loss": 0.5702, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2778164744377136, + "rewards/margins": 0.3516896367073059, + "rewards/rejected": -0.6295061111450195, + "step": 570 + }, + { + "epoch": 0.6, + "learning_rate": 4.44762996941896e-07, + "logits/chosen": -1.8746612071990967, + "logits/rejected": -1.979645013809204, + "logps/chosen": -292.9163513183594, + "logps/rejected": -251.6305694580078, + "loss": 0.493, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.10857485234737396, + "rewards/margins": 0.647341251373291, + "rewards/rejected": -0.7559161186218262, + "step": 580 + }, + { + "epoch": 0.61, + "learning_rate": 4.4285168195718655e-07, + "logits/chosen": -1.7814861536026, + "logits/rejected": -1.7613455057144165, + "logps/chosen": -233.90591430664062, + "logps/rejected": -235.93832397460938, + "loss": 0.4851, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2671836018562317, + "rewards/margins": 0.46907442808151245, + "rewards/rejected": -0.7362579107284546, + "step": 590 + }, + { + "epoch": 0.62, + "learning_rate": 4.40940366972477e-07, + "logits/chosen": -1.9857203960418701, + "logits/rejected": -1.810185194015503, + "logps/chosen": -234.3582763671875, + "logps/rejected": -216.35922241210938, + "loss": 0.514, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.29425790905952454, + "rewards/margins": 0.4220006465911865, + "rewards/rejected": -0.7162585258483887, + "step": 600 + }, + { + "epoch": 0.63, + "learning_rate": 4.3902905198776756e-07, + "logits/chosen": -1.8552411794662476, + "logits/rejected": -1.839869499206543, + "logps/chosen": -275.1326904296875, + "logps/rejected": -226.4814910888672, + "loss": 0.4751, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.1326485425233841, + "rewards/margins": 0.5507789254188538, + "rewards/rejected": -0.6834274530410767, + "step": 610 + }, + { + "epoch": 0.64, + "learning_rate": 4.371177370030581e-07, + "logits/chosen": -1.9499883651733398, + "logits/rejected": -1.8449184894561768, + "logps/chosen": -274.458984375, + "logps/rejected": -231.59921264648438, + "loss": 0.3819, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.2783154845237732, + "rewards/margins": 0.6603595614433289, + "rewards/rejected": -0.938675045967102, + "step": 620 + }, + { + "epoch": 0.65, + "learning_rate": 4.352064220183486e-07, + "logits/chosen": -1.7157669067382812, + "logits/rejected": -1.7650973796844482, + "logps/chosen": -215.53311157226562, + "logps/rejected": -200.57073974609375, + "loss": 0.434, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.27303647994995117, + "rewards/margins": 0.4611503481864929, + "rewards/rejected": -0.7341868281364441, + "step": 630 + }, + { + "epoch": 0.66, + "learning_rate": 4.3329510703363915e-07, + "logits/chosen": -1.8946468830108643, + "logits/rejected": -1.8263728618621826, + "logps/chosen": -255.1796417236328, + "logps/rejected": -215.44821166992188, + "loss": 0.3838, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2462444007396698, + "rewards/margins": 0.8292306661605835, + "rewards/rejected": -1.0754752159118652, + "step": 640 + }, + { + "epoch": 0.67, + "learning_rate": 4.313837920489297e-07, + "logits/chosen": -1.8516194820404053, + "logits/rejected": -1.8784716129302979, + "logps/chosen": -231.55343627929688, + "logps/rejected": -226.981201171875, + "loss": 0.3954, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.19073012471199036, + "rewards/margins": 0.6038515567779541, + "rewards/rejected": -0.7945817112922668, + "step": 650 + }, + { + "epoch": 0.68, + "learning_rate": 4.2947247706422016e-07, + "logits/chosen": -1.8391790390014648, + "logits/rejected": -1.9652955532073975, + "logps/chosen": -252.66983032226562, + "logps/rejected": -235.1903533935547, + "loss": 0.3506, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.33843857049942017, + "rewards/margins": 0.6845678091049194, + "rewards/rejected": -1.0230063199996948, + "step": 660 + }, + { + "epoch": 0.69, + "learning_rate": 4.275611620795107e-07, + "logits/chosen": -1.9520423412322998, + "logits/rejected": -1.8214895725250244, + "logps/chosen": -308.54522705078125, + "logps/rejected": -243.960693359375, + "loss": 0.3558, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.36050236225128174, + "rewards/margins": 0.6172996759414673, + "rewards/rejected": -0.9778021574020386, + "step": 670 + }, + { + "epoch": 0.7, + "learning_rate": 4.2564984709480123e-07, + "logits/chosen": -1.9856923818588257, + "logits/rejected": -1.8237674236297607, + "logps/chosen": -291.28717041015625, + "logps/rejected": -226.1394500732422, + "loss": 0.4798, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.4318040907382965, + "rewards/margins": 0.5425896644592285, + "rewards/rejected": -0.9743936657905579, + "step": 680 + }, + { + "epoch": 0.71, + "learning_rate": 4.2373853211009176e-07, + "logits/chosen": -2.0162062644958496, + "logits/rejected": -1.994715690612793, + "logps/chosen": -265.1205139160156, + "logps/rejected": -243.013916015625, + "loss": 0.4168, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4797751307487488, + "rewards/margins": 0.790332555770874, + "rewards/rejected": -1.2701075077056885, + "step": 690 + }, + { + "epoch": 0.72, + "learning_rate": 4.2182721712538224e-07, + "logits/chosen": -1.9402358531951904, + "logits/rejected": -1.7293392419815063, + "logps/chosen": -283.6700744628906, + "logps/rejected": -226.171142578125, + "loss": 0.3693, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.17032106220722198, + "rewards/margins": 1.0015548467636108, + "rewards/rejected": -1.171876072883606, + "step": 700 + }, + { + "epoch": 0.73, + "learning_rate": 4.199159021406727e-07, + "logits/chosen": -1.9769999980926514, + "logits/rejected": -1.6814262866973877, + "logps/chosen": -243.1233367919922, + "logps/rejected": -221.1171875, + "loss": 0.3979, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5523873567581177, + "rewards/margins": 0.596055269241333, + "rewards/rejected": -1.1484426259994507, + "step": 710 + }, + { + "epoch": 0.74, + "learning_rate": 4.1800458715596325e-07, + "logits/chosen": -1.927851915359497, + "logits/rejected": -1.8016504049301147, + "logps/chosen": -311.8240966796875, + "logps/rejected": -259.55096435546875, + "loss": 0.3839, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5330373644828796, + "rewards/margins": 0.41053658723831177, + "rewards/rejected": -0.9435739517211914, + "step": 720 + }, + { + "epoch": 0.75, + "learning_rate": 4.160932721712538e-07, + "logits/chosen": -1.934565544128418, + "logits/rejected": -1.8886038064956665, + "logps/chosen": -286.6680908203125, + "logps/rejected": -258.78961181640625, + "loss": 0.3174, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.48792019486427307, + "rewards/margins": 0.4587516784667969, + "rewards/rejected": -0.9466718435287476, + "step": 730 + }, + { + "epoch": 0.76, + "learning_rate": 4.141819571865443e-07, + "logits/chosen": -1.7957875728607178, + "logits/rejected": -1.8168989419937134, + "logps/chosen": -255.51919555664062, + "logps/rejected": -200.88294982910156, + "loss": 0.2208, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5373031497001648, + "rewards/margins": 0.7255297899246216, + "rewards/rejected": -1.2628331184387207, + "step": 740 + }, + { + "epoch": 0.77, + "learning_rate": 4.1227064220183485e-07, + "logits/chosen": -1.784045934677124, + "logits/rejected": -1.66024649143219, + "logps/chosen": -261.5906677246094, + "logps/rejected": -236.45486450195312, + "loss": 0.2914, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.6871198415756226, + "rewards/margins": 0.8158855438232422, + "rewards/rejected": -1.5030055046081543, + "step": 750 + }, + { + "epoch": 0.78, + "learning_rate": 4.103593272171253e-07, + "logits/chosen": -1.8295629024505615, + "logits/rejected": -1.8517471551895142, + "logps/chosen": -273.9855651855469, + "logps/rejected": -268.47784423828125, + "loss": 0.2635, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6461771130561829, + "rewards/margins": 0.6871333122253418, + "rewards/rejected": -1.3333103656768799, + "step": 760 + }, + { + "epoch": 0.79, + "learning_rate": 4.0844801223241586e-07, + "logits/chosen": -1.7927148342132568, + "logits/rejected": -1.8215078115463257, + "logps/chosen": -280.751220703125, + "logps/rejected": -242.97573852539062, + "loss": 0.1871, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.48614630103111267, + "rewards/margins": 0.6972783207893372, + "rewards/rejected": -1.183424472808838, + "step": 770 + }, + { + "epoch": 0.8, + "learning_rate": 4.065366972477064e-07, + "logits/chosen": -1.9858176708221436, + "logits/rejected": -1.8561077117919922, + "logps/chosen": -287.98516845703125, + "logps/rejected": -252.1136932373047, + "loss": 0.2167, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.689589262008667, + "rewards/margins": 0.799708366394043, + "rewards/rejected": -1.4892975091934204, + "step": 780 + }, + { + "epoch": 0.82, + "learning_rate": 4.046253822629969e-07, + "logits/chosen": -1.8583405017852783, + "logits/rejected": -1.8265085220336914, + "logps/chosen": -265.4439697265625, + "logps/rejected": -234.42135620117188, + "loss": 0.0371, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.582925021648407, + "rewards/margins": 0.8660266995429993, + "rewards/rejected": -1.4489517211914062, + "step": 790 + }, + { + "epoch": 0.83, + "learning_rate": 4.0271406727828745e-07, + "logits/chosen": -1.8910176753997803, + "logits/rejected": -1.8912360668182373, + "logps/chosen": -259.268310546875, + "logps/rejected": -229.13623046875, + "loss": 0.2407, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.8472847938537598, + "rewards/margins": 0.5785714387893677, + "rewards/rejected": -1.425856351852417, + "step": 800 + }, + { + "epoch": 0.84, + "learning_rate": 4.00802752293578e-07, + "logits/chosen": -1.8274396657943726, + "logits/rejected": -1.7036545276641846, + "logps/chosen": -273.0886535644531, + "logps/rejected": -241.332763671875, + "loss": 0.0946, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.7010241746902466, + "rewards/margins": 0.7486527562141418, + "rewards/rejected": -1.449676752090454, + "step": 810 + }, + { + "epoch": 0.85, + "learning_rate": 3.9889143730886847e-07, + "logits/chosen": -2.0030179023742676, + "logits/rejected": -1.938431978225708, + "logps/chosen": -308.9624328613281, + "logps/rejected": -252.2946319580078, + "loss": 0.1224, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5951758623123169, + "rewards/margins": 1.0006935596466064, + "rewards/rejected": -1.5958693027496338, + "step": 820 + }, + { + "epoch": 0.86, + "learning_rate": 3.96980122324159e-07, + "logits/chosen": -1.889276146888733, + "logits/rejected": -1.7101726531982422, + "logps/chosen": -264.00048828125, + "logps/rejected": -214.24649047851562, + "loss": -0.045, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.6674525737762451, + "rewards/margins": 1.1390091180801392, + "rewards/rejected": -1.8064616918563843, + "step": 830 + }, + { + "epoch": 0.87, + "learning_rate": 3.9506880733944953e-07, + "logits/chosen": -1.877962350845337, + "logits/rejected": -1.7040239572525024, + "logps/chosen": -257.03582763671875, + "logps/rejected": -227.9223175048828, + "loss": 0.3336, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.072399377822876, + "rewards/margins": 0.5846762657165527, + "rewards/rejected": -1.6570755243301392, + "step": 840 + }, + { + "epoch": 0.88, + "learning_rate": 3.9315749235474006e-07, + "logits/chosen": -1.840431809425354, + "logits/rejected": -1.892249345779419, + "logps/chosen": -278.1147766113281, + "logps/rejected": -272.54327392578125, + "loss": 0.1259, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8901047706604004, + "rewards/margins": 0.7304579019546509, + "rewards/rejected": -1.6205627918243408, + "step": 850 + }, + { + "epoch": 0.89, + "learning_rate": 3.912461773700306e-07, + "logits/chosen": -2.031358003616333, + "logits/rejected": -1.8924942016601562, + "logps/chosen": -300.32110595703125, + "logps/rejected": -295.2864074707031, + "loss": 0.0646, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8579466938972473, + "rewards/margins": 1.062766432762146, + "rewards/rejected": -1.920713186264038, + "step": 860 + }, + { + "epoch": 0.9, + "learning_rate": 3.8933486238532107e-07, + "logits/chosen": -1.6753685474395752, + "logits/rejected": -1.6504751443862915, + "logps/chosen": -313.80853271484375, + "logps/rejected": -248.9676513671875, + "loss": 0.0988, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.7647205591201782, + "rewards/margins": 1.3286702632904053, + "rewards/rejected": -2.093390941619873, + "step": 870 + }, + { + "epoch": 0.91, + "learning_rate": 3.874235474006116e-07, + "logits/chosen": -1.8231757879257202, + "logits/rejected": -1.7661590576171875, + "logps/chosen": -287.50677490234375, + "logps/rejected": -241.2566375732422, + "loss": -0.0319, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.6531956195831299, + "rewards/margins": 1.3242131471633911, + "rewards/rejected": -1.9774086475372314, + "step": 880 + }, + { + "epoch": 0.92, + "learning_rate": 3.8551223241590214e-07, + "logits/chosen": -1.8550631999969482, + "logits/rejected": -1.7476036548614502, + "logps/chosen": -261.09033203125, + "logps/rejected": -238.5321502685547, + "loss": 0.1266, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.9579564332962036, + "rewards/margins": 1.210017442703247, + "rewards/rejected": -2.1679739952087402, + "step": 890 + }, + { + "epoch": 0.93, + "learning_rate": 3.8360091743119267e-07, + "logits/chosen": -1.9090309143066406, + "logits/rejected": -1.6929905414581299, + "logps/chosen": -267.00323486328125, + "logps/rejected": -239.107177734375, + "loss": -0.0438, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.9265840649604797, + "rewards/margins": 1.2149267196655273, + "rewards/rejected": -2.1415107250213623, + "step": 900 + }, + { + "epoch": 0.94, + "learning_rate": 3.816896024464832e-07, + "logits/chosen": -1.7493388652801514, + "logits/rejected": -1.9231748580932617, + "logps/chosen": -240.63134765625, + "logps/rejected": -220.7272186279297, + "loss": -0.0152, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0793166160583496, + "rewards/margins": 0.8653289675712585, + "rewards/rejected": -1.9446455240249634, + "step": 910 + }, + { + "epoch": 0.95, + "learning_rate": 3.797782874617737e-07, + "logits/chosen": -1.9112157821655273, + "logits/rejected": -2.013073682785034, + "logps/chosen": -284.13824462890625, + "logps/rejected": -243.91806030273438, + "loss": -0.0423, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.1301993131637573, + "rewards/margins": 0.8508475422859192, + "rewards/rejected": -1.9810469150543213, + "step": 920 + }, + { + "epoch": 0.96, + "learning_rate": 3.778669724770642e-07, + "logits/chosen": -1.779415488243103, + "logits/rejected": -1.688969612121582, + "logps/chosen": -264.59710693359375, + "logps/rejected": -212.7026824951172, + "loss": -0.0335, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.0757131576538086, + "rewards/margins": 1.205359697341919, + "rewards/rejected": -2.2810730934143066, + "step": 930 + }, + { + "epoch": 0.97, + "learning_rate": 3.7595565749235474e-07, + "logits/chosen": -1.9989725351333618, + "logits/rejected": -1.9091949462890625, + "logps/chosen": -287.1456604003906, + "logps/rejected": -226.24771118164062, + "loss": 0.0097, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.8323071599006653, + "rewards/margins": 1.2861956357955933, + "rewards/rejected": -2.1185028553009033, + "step": 940 + }, + { + "epoch": 0.98, + "learning_rate": 3.740443425076452e-07, + "logits/chosen": -1.9547460079193115, + "logits/rejected": -1.8703094720840454, + "logps/chosen": -287.35198974609375, + "logps/rejected": -261.2727966308594, + "loss": -0.0247, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3492201566696167, + "rewards/margins": 0.8363786935806274, + "rewards/rejected": -2.185598850250244, + "step": 950 + }, + { + "epoch": 0.99, + "learning_rate": 3.7213302752293575e-07, + "logits/chosen": -1.86129891872406, + "logits/rejected": -1.7522859573364258, + "logps/chosen": -273.3508605957031, + "logps/rejected": -244.61569213867188, + "loss": -0.2007, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.0919667482376099, + "rewards/margins": 0.9706255793571472, + "rewards/rejected": -2.0625922679901123, + "step": 960 + }, + { + "epoch": 1.0, + "eval_logits/chosen": -2.097571611404419, + "eval_logits/rejected": -1.9875798225402832, + "eval_logps/chosen": -301.2187805175781, + "eval_logps/rejected": -259.9261779785156, + "eval_loss": -0.09876806288957596, + "eval_rewards/accuracies": 0.6746031641960144, + "eval_rewards/chosen": -1.1416146755218506, + "eval_rewards/margins": 1.3577306270599365, + "eval_rewards/rejected": -2.499345302581787, + "eval_runtime": 238.6969, + "eval_samples_per_second": 8.379, + "eval_steps_per_second": 0.264, + "step": 969 + }, + { + "epoch": 1.0, + "learning_rate": 3.702217125382263e-07, + "logits/chosen": -1.9615962505340576, + "logits/rejected": -1.7775417566299438, + "logps/chosen": -293.53924560546875, + "logps/rejected": -264.0589599609375, + "loss": -0.1153, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.265012264251709, + "rewards/margins": 1.2176616191864014, + "rewards/rejected": -2.4826738834381104, + "step": 970 + }, + { + "epoch": 1.01, + "learning_rate": 3.6831039755351677e-07, + "logits/chosen": -1.7482898235321045, + "logits/rejected": -1.872532844543457, + "logps/chosen": -272.051025390625, + "logps/rejected": -255.67703247070312, + "loss": -0.1987, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.1273905038833618, + "rewards/margins": 1.4816131591796875, + "rewards/rejected": -2.609003782272339, + "step": 980 + }, + { + "epoch": 1.02, + "learning_rate": 3.663990825688073e-07, + "logits/chosen": -1.8804614543914795, + "logits/rejected": -1.6789354085922241, + "logps/chosen": -262.64892578125, + "logps/rejected": -241.96353149414062, + "loss": 0.1311, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6520445346832275, + "rewards/margins": 0.846520721912384, + "rewards/rejected": -2.4985649585723877, + "step": 990 + }, + { + "epoch": 1.03, + "learning_rate": 3.6448776758409783e-07, + "logits/chosen": -1.989675760269165, + "logits/rejected": -1.7391383647918701, + "logps/chosen": -288.917724609375, + "logps/rejected": -266.159912109375, + "loss": -0.0103, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2550522089004517, + "rewards/margins": 1.0477917194366455, + "rewards/rejected": -2.3028438091278076, + "step": 1000 + }, + { + "epoch": 1.04, + "learning_rate": 3.6257645259938836e-07, + "logits/chosen": -1.848597526550293, + "logits/rejected": -1.7234575748443604, + "logps/chosen": -254.86672973632812, + "logps/rejected": -258.8224792480469, + "loss": -0.1996, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.1828529834747314, + "rewards/margins": 1.4569838047027588, + "rewards/rejected": -2.6398367881774902, + "step": 1010 + }, + { + "epoch": 1.05, + "learning_rate": 3.606651376146789e-07, + "logits/chosen": -1.6995182037353516, + "logits/rejected": -1.8116445541381836, + "logps/chosen": -258.5903015136719, + "logps/rejected": -209.5008544921875, + "loss": -0.3461, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5025261640548706, + "rewards/margins": 1.1272718906402588, + "rewards/rejected": -2.629798412322998, + "step": 1020 + }, + { + "epoch": 1.06, + "learning_rate": 3.5875382262996937e-07, + "logits/chosen": -1.7140220403671265, + "logits/rejected": -1.7449703216552734, + "logps/chosen": -283.09796142578125, + "logps/rejected": -295.8421325683594, + "loss": -0.232, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.183707594871521, + "rewards/margins": 1.0434316396713257, + "rewards/rejected": -2.2271392345428467, + "step": 1030 + }, + { + "epoch": 1.07, + "learning_rate": 3.568425076452599e-07, + "logits/chosen": -1.8618385791778564, + "logits/rejected": -1.7049167156219482, + "logps/chosen": -295.1578063964844, + "logps/rejected": -260.01678466796875, + "loss": -0.2474, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.6388660669326782, + "rewards/margins": 0.968669056892395, + "rewards/rejected": -2.6075356006622314, + "step": 1040 + }, + { + "epoch": 1.08, + "learning_rate": 3.5493119266055044e-07, + "logits/chosen": -1.7115428447723389, + "logits/rejected": -1.6858937740325928, + "logps/chosen": -273.44049072265625, + "logps/rejected": -214.3227996826172, + "loss": -0.3821, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6126127243041992, + "rewards/margins": 1.3140199184417725, + "rewards/rejected": -2.9266326427459717, + "step": 1050 + }, + { + "epoch": 1.09, + "learning_rate": 3.5301987767584097e-07, + "logits/chosen": -1.7064205408096313, + "logits/rejected": -1.6017926931381226, + "logps/chosen": -268.70416259765625, + "logps/rejected": -272.87066650390625, + "loss": -0.3771, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3269054889678955, + "rewards/margins": 1.9102023839950562, + "rewards/rejected": -3.237107515335083, + "step": 1060 + }, + { + "epoch": 1.1, + "learning_rate": 3.511085626911315e-07, + "logits/chosen": -1.8583990335464478, + "logits/rejected": -1.5322113037109375, + "logps/chosen": -333.03924560546875, + "logps/rejected": -262.2927551269531, + "loss": -0.4133, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.4196674823760986, + "rewards/margins": 1.6081445217132568, + "rewards/rejected": -3.0278122425079346, + "step": 1070 + }, + { + "epoch": 1.11, + "learning_rate": 3.49197247706422e-07, + "logits/chosen": -1.6976341009140015, + "logits/rejected": -1.6782453060150146, + "logps/chosen": -233.9385223388672, + "logps/rejected": -220.5770263671875, + "loss": -0.4013, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.9680936336517334, + "rewards/margins": 0.8472940325737, + "rewards/rejected": -2.8153879642486572, + "step": 1080 + }, + { + "epoch": 1.12, + "learning_rate": 3.472859327217125e-07, + "logits/chosen": -1.7105554342269897, + "logits/rejected": -1.4681594371795654, + "logps/chosen": -300.76226806640625, + "logps/rejected": -273.32696533203125, + "loss": -0.3314, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.4938586950302124, + "rewards/margins": 1.572718858718872, + "rewards/rejected": -3.066577196121216, + "step": 1090 + }, + { + "epoch": 1.14, + "learning_rate": 3.4537461773700304e-07, + "logits/chosen": -1.5856693983078003, + "logits/rejected": -1.7974258661270142, + "logps/chosen": -225.534423828125, + "logps/rejected": -204.51913452148438, + "loss": -0.311, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.020268201828003, + "rewards/margins": 1.0005836486816406, + "rewards/rejected": -3.0208516120910645, + "step": 1100 + }, + { + "epoch": 1.15, + "learning_rate": 3.434633027522936e-07, + "logits/chosen": -1.7942464351654053, + "logits/rejected": -1.6902000904083252, + "logps/chosen": -293.67706298828125, + "logps/rejected": -255.2344512939453, + "loss": -0.3927, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.663823127746582, + "rewards/margins": 1.3553807735443115, + "rewards/rejected": -3.0192039012908936, + "step": 1110 + }, + { + "epoch": 1.16, + "learning_rate": 3.415519877675841e-07, + "logits/chosen": -1.7715240716934204, + "logits/rejected": -1.643781065940857, + "logps/chosen": -252.25729370117188, + "logps/rejected": -269.1502685546875, + "loss": -0.4037, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5638118982315063, + "rewards/margins": 1.3495436906814575, + "rewards/rejected": -2.9133553504943848, + "step": 1120 + }, + { + "epoch": 1.17, + "learning_rate": 3.3964067278287464e-07, + "logits/chosen": -1.7634483575820923, + "logits/rejected": -1.7055590152740479, + "logps/chosen": -297.56658935546875, + "logps/rejected": -240.73849487304688, + "loss": -0.4873, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.6788049936294556, + "rewards/margins": 1.7267656326293945, + "rewards/rejected": -3.4055705070495605, + "step": 1130 + }, + { + "epoch": 1.18, + "learning_rate": 3.377293577981651e-07, + "logits/chosen": -1.6601498126983643, + "logits/rejected": -1.5715376138687134, + "logps/chosen": -248.473388671875, + "logps/rejected": -251.2434844970703, + "loss": -0.6724, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.1505351066589355, + "rewards/margins": 1.4957424402236938, + "rewards/rejected": -3.646277666091919, + "step": 1140 + }, + { + "epoch": 1.19, + "learning_rate": 3.3581804281345565e-07, + "logits/chosen": -1.7773525714874268, + "logits/rejected": -1.5578742027282715, + "logps/chosen": -305.3437805175781, + "logps/rejected": -243.5063018798828, + "loss": -0.2898, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.0654540061950684, + "rewards/margins": 1.2946364879608154, + "rewards/rejected": -3.3600902557373047, + "step": 1150 + }, + { + "epoch": 1.2, + "learning_rate": 3.339067278287462e-07, + "logits/chosen": -1.5956556797027588, + "logits/rejected": -1.5084911584854126, + "logps/chosen": -269.35736083984375, + "logps/rejected": -251.64675903320312, + "loss": -0.5003, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.1167237758636475, + "rewards/margins": 1.4275890588760376, + "rewards/rejected": -3.5443129539489746, + "step": 1160 + }, + { + "epoch": 1.21, + "learning_rate": 3.319954128440367e-07, + "logits/chosen": -1.8844757080078125, + "logits/rejected": -1.7271381616592407, + "logps/chosen": -299.93609619140625, + "logps/rejected": -243.735595703125, + "loss": -0.5037, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.080448627471924, + "rewards/margins": 1.4392142295837402, + "rewards/rejected": -3.519662857055664, + "step": 1170 + }, + { + "epoch": 1.22, + "learning_rate": 3.3008409785932725e-07, + "logits/chosen": -1.6695266962051392, + "logits/rejected": -1.7183201313018799, + "logps/chosen": -266.7185974121094, + "logps/rejected": -248.3719482421875, + "loss": -0.5103, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.2367124557495117, + "rewards/margins": 1.7176882028579712, + "rewards/rejected": -3.9544003009796143, + "step": 1180 + }, + { + "epoch": 1.23, + "learning_rate": 3.2817278287461773e-07, + "logits/chosen": -1.7001529932022095, + "logits/rejected": -1.6251550912857056, + "logps/chosen": -256.841064453125, + "logps/rejected": -273.620849609375, + "loss": -0.6167, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.5783565044403076, + "rewards/margins": 1.5206773281097412, + "rewards/rejected": -4.099034309387207, + "step": 1190 + }, + { + "epoch": 1.24, + "learning_rate": 3.262614678899082e-07, + "logits/chosen": -1.7264814376831055, + "logits/rejected": -1.5964632034301758, + "logps/chosen": -270.0255126953125, + "logps/rejected": -285.0582580566406, + "loss": -0.8394, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.6388707160949707, + "rewards/margins": 1.2730156183242798, + "rewards/rejected": -3.911886692047119, + "step": 1200 + }, + { + "epoch": 1.25, + "learning_rate": 3.2435015290519874e-07, + "logits/chosen": -1.704097032546997, + "logits/rejected": -1.581463098526001, + "logps/chosen": -246.20022583007812, + "logps/rejected": -232.1162872314453, + "loss": -0.4922, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.0876688957214355, + "rewards/margins": 1.698892593383789, + "rewards/rejected": -3.7865612506866455, + "step": 1210 + }, + { + "epoch": 1.26, + "learning_rate": 3.2243883792048927e-07, + "logits/chosen": -1.7485994100570679, + "logits/rejected": -1.7491531372070312, + "logps/chosen": -307.69842529296875, + "logps/rejected": -313.2864685058594, + "loss": -0.4846, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.471188545227051, + "rewards/margins": 1.2810360193252563, + "rewards/rejected": -3.7522246837615967, + "step": 1220 + }, + { + "epoch": 1.27, + "learning_rate": 3.205275229357798e-07, + "logits/chosen": -1.7512283325195312, + "logits/rejected": -1.559390902519226, + "logps/chosen": -316.75592041015625, + "logps/rejected": -306.21160888671875, + "loss": -0.5032, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.4955337047576904, + "rewards/margins": 1.243558645248413, + "rewards/rejected": -3.7390923500061035, + "step": 1230 + }, + { + "epoch": 1.28, + "learning_rate": 3.186162079510703e-07, + "logits/chosen": -1.6653553247451782, + "logits/rejected": -1.6318597793579102, + "logps/chosen": -300.54473876953125, + "logps/rejected": -305.09423828125, + "loss": -0.7512, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.3271665573120117, + "rewards/margins": 1.914544701576233, + "rewards/rejected": -4.241711616516113, + "step": 1240 + }, + { + "epoch": 1.29, + "learning_rate": 3.167048929663608e-07, + "logits/chosen": -1.6567821502685547, + "logits/rejected": -1.5714284181594849, + "logps/chosen": -250.9105682373047, + "logps/rejected": -234.5415802001953, + "loss": -0.7222, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.356020927429199, + "rewards/margins": 1.9912364482879639, + "rewards/rejected": -4.347257614135742, + "step": 1250 + }, + { + "epoch": 1.3, + "learning_rate": 3.1479357798165134e-07, + "logits/chosen": -1.7020305395126343, + "logits/rejected": -1.78921377658844, + "logps/chosen": -308.86163330078125, + "logps/rejected": -269.3481140136719, + "loss": -0.6017, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.404141902923584, + "rewards/margins": 1.7603607177734375, + "rewards/rejected": -4.1645026206970215, + "step": 1260 + }, + { + "epoch": 1.31, + "learning_rate": 3.128822629969419e-07, + "logits/chosen": -1.7761846780776978, + "logits/rejected": -1.8085511922836304, + "logps/chosen": -312.18280029296875, + "logps/rejected": -287.8406677246094, + "loss": -0.6923, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.880483388900757, + "rewards/margins": 2.123534917831421, + "rewards/rejected": -5.004018306732178, + "step": 1270 + }, + { + "epoch": 1.32, + "learning_rate": 3.109709480122324e-07, + "logits/chosen": -1.5170055627822876, + "logits/rejected": -1.7044398784637451, + "logps/chosen": -265.9775085449219, + "logps/rejected": -256.2364501953125, + "loss": -0.5623, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.711970806121826, + "rewards/margins": 1.4848562479019165, + "rewards/rejected": -4.196827411651611, + "step": 1280 + }, + { + "epoch": 1.33, + "learning_rate": 3.0905963302752294e-07, + "logits/chosen": -1.6849790811538696, + "logits/rejected": -1.4959250688552856, + "logps/chosen": -266.03338623046875, + "logps/rejected": -265.19293212890625, + "loss": -0.5952, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.8066792488098145, + "rewards/margins": 1.4092586040496826, + "rewards/rejected": -4.215937614440918, + "step": 1290 + }, + { + "epoch": 1.34, + "learning_rate": 3.071483180428134e-07, + "logits/chosen": -1.6794811487197876, + "logits/rejected": -1.7896066904067993, + "logps/chosen": -349.8259582519531, + "logps/rejected": -259.8904113769531, + "loss": -0.8813, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.7685770988464355, + "rewards/margins": 2.0453274250030518, + "rewards/rejected": -4.813904285430908, + "step": 1300 + }, + { + "epoch": 1.35, + "learning_rate": 3.0523700305810395e-07, + "logits/chosen": -1.826123833656311, + "logits/rejected": -1.6994121074676514, + "logps/chosen": -292.24188232421875, + "logps/rejected": -294.88665771484375, + "loss": -0.82, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.468705892562866, + "rewards/margins": 2.3952927589416504, + "rewards/rejected": -4.863998889923096, + "step": 1310 + }, + { + "epoch": 1.36, + "learning_rate": 3.033256880733945e-07, + "logits/chosen": -1.6138427257537842, + "logits/rejected": -1.5653212070465088, + "logps/chosen": -272.35723876953125, + "logps/rejected": -244.7309112548828, + "loss": -1.0125, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.5213189125061035, + "rewards/margins": 1.1809017658233643, + "rewards/rejected": -4.702220439910889, + "step": 1320 + }, + { + "epoch": 1.37, + "learning_rate": 3.01414373088685e-07, + "logits/chosen": -1.678091287612915, + "logits/rejected": -1.6383358240127563, + "logps/chosen": -305.22039794921875, + "logps/rejected": -270.16644287109375, + "loss": -0.7008, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.750168800354004, + "rewards/margins": 1.9968726634979248, + "rewards/rejected": -4.747041702270508, + "step": 1330 + }, + { + "epoch": 1.38, + "learning_rate": 2.9950305810397555e-07, + "logits/chosen": -1.8723211288452148, + "logits/rejected": -1.6131559610366821, + "logps/chosen": -302.5225524902344, + "logps/rejected": -265.1141662597656, + "loss": -0.8252, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.2168407440185547, + "rewards/margins": 1.872542381286621, + "rewards/rejected": -5.089383125305176, + "step": 1340 + }, + { + "epoch": 1.39, + "learning_rate": 2.9759174311926603e-07, + "logits/chosen": -1.6552165746688843, + "logits/rejected": -1.5474542379379272, + "logps/chosen": -300.1455383300781, + "logps/rejected": -269.47235107421875, + "loss": -0.8361, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -3.7245125770568848, + "rewards/margins": 1.281724214553833, + "rewards/rejected": -5.006236553192139, + "step": 1350 + }, + { + "epoch": 1.4, + "learning_rate": 2.9568042813455656e-07, + "logits/chosen": -1.6618553400039673, + "logits/rejected": -1.667223334312439, + "logps/chosen": -310.4003601074219, + "logps/rejected": -243.12240600585938, + "loss": -0.731, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.615830659866333, + "rewards/margins": 1.214110016822815, + "rewards/rejected": -4.829940319061279, + "step": 1360 + }, + { + "epoch": 1.41, + "learning_rate": 2.937691131498471e-07, + "logits/chosen": -1.7880207300186157, + "logits/rejected": -1.6480613946914673, + "logps/chosen": -316.4659729003906, + "logps/rejected": -285.7911682128906, + "loss": -0.7485, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.4602973461151123, + "rewards/margins": 1.8094520568847656, + "rewards/rejected": -5.269749641418457, + "step": 1370 + }, + { + "epoch": 1.42, + "learning_rate": 2.918577981651376e-07, + "logits/chosen": -1.6994645595550537, + "logits/rejected": -1.5988848209381104, + "logps/chosen": -261.1907958984375, + "logps/rejected": -270.0497741699219, + "loss": -0.6992, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.9491472244262695, + "rewards/margins": 1.7523548603057861, + "rewards/rejected": -4.701501846313477, + "step": 1380 + }, + { + "epoch": 1.43, + "learning_rate": 2.8994648318042816e-07, + "logits/chosen": -1.7593275308609009, + "logits/rejected": -1.6046631336212158, + "logps/chosen": -299.09637451171875, + "logps/rejected": -268.380859375, + "loss": -0.9355, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.154327630996704, + "rewards/margins": 2.1164073944091797, + "rewards/rejected": -5.2707343101501465, + "step": 1390 + }, + { + "epoch": 1.44, + "learning_rate": 2.8803516819571863e-07, + "logits/chosen": -1.7734966278076172, + "logits/rejected": -1.5951334238052368, + "logps/chosen": -347.52239990234375, + "logps/rejected": -293.4967346191406, + "loss": -0.7492, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.558985948562622, + "rewards/margins": 1.6943881511688232, + "rewards/rejected": -5.253373622894287, + "step": 1400 + }, + { + "epoch": 1.46, + "learning_rate": 2.8612385321100917e-07, + "logits/chosen": -1.573909044265747, + "logits/rejected": -1.3949878215789795, + "logps/chosen": -270.31292724609375, + "logps/rejected": -276.7618408203125, + "loss": -1.0184, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.927518129348755, + "rewards/margins": 1.0909746885299683, + "rewards/rejected": -5.018492221832275, + "step": 1410 + }, + { + "epoch": 1.47, + "learning_rate": 2.842125382262997e-07, + "logits/chosen": -1.5700122117996216, + "logits/rejected": -1.4677519798278809, + "logps/chosen": -285.81103515625, + "logps/rejected": -269.4806823730469, + "loss": -1.0111, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.3703293800354004, + "rewards/margins": 1.9672693014144897, + "rewards/rejected": -5.3375983238220215, + "step": 1420 + }, + { + "epoch": 1.48, + "learning_rate": 2.8230122324159023e-07, + "logits/chosen": -1.5128788948059082, + "logits/rejected": -1.475921869277954, + "logps/chosen": -321.65020751953125, + "logps/rejected": -334.950439453125, + "loss": -1.0548, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.5967164039611816, + "rewards/margins": 3.013796806335449, + "rewards/rejected": -6.610513210296631, + "step": 1430 + }, + { + "epoch": 1.49, + "learning_rate": 2.8038990825688076e-07, + "logits/chosen": -1.5578891038894653, + "logits/rejected": -1.5609424114227295, + "logps/chosen": -261.49920654296875, + "logps/rejected": -235.4161376953125, + "loss": -0.9126, + "rewards/accuracies": 0.5625, + "rewards/chosen": -4.372951984405518, + "rewards/margins": 0.7127580642700195, + "rewards/rejected": -5.085709571838379, + "step": 1440 + }, + { + "epoch": 1.5, + "learning_rate": 2.784785932721712e-07, + "logits/chosen": -1.636566400527954, + "logits/rejected": -1.4580261707305908, + "logps/chosen": -304.53240966796875, + "logps/rejected": -282.1325378417969, + "loss": -0.9912, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.88989520072937, + "rewards/margins": 2.1614978313446045, + "rewards/rejected": -6.051393032073975, + "step": 1450 + }, + { + "epoch": 1.51, + "learning_rate": 2.765672782874617e-07, + "logits/chosen": -1.5655293464660645, + "logits/rejected": -1.485058069229126, + "logps/chosen": -318.0874938964844, + "logps/rejected": -259.27130126953125, + "loss": -0.908, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.269219398498535, + "rewards/margins": 1.588846206665039, + "rewards/rejected": -5.858065605163574, + "step": 1460 + }, + { + "epoch": 1.52, + "learning_rate": 2.7465596330275225e-07, + "logits/chosen": -1.5867637395858765, + "logits/rejected": -1.6024789810180664, + "logps/chosen": -312.49591064453125, + "logps/rejected": -236.5067596435547, + "loss": -0.9474, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.954392910003662, + "rewards/margins": 2.293760061264038, + "rewards/rejected": -6.248152732849121, + "step": 1470 + }, + { + "epoch": 1.53, + "learning_rate": 2.727446483180428e-07, + "logits/chosen": -1.7006601095199585, + "logits/rejected": -1.5267009735107422, + "logps/chosen": -298.12469482421875, + "logps/rejected": -292.24395751953125, + "loss": -1.2885, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.79081392288208, + "rewards/margins": 1.9046157598495483, + "rewards/rejected": -6.695429801940918, + "step": 1480 + }, + { + "epoch": 1.54, + "learning_rate": 2.708333333333333e-07, + "logits/chosen": -1.4423596858978271, + "logits/rejected": -1.6754871606826782, + "logps/chosen": -288.9786376953125, + "logps/rejected": -286.6379699707031, + "loss": -1.1448, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.346983909606934, + "rewards/margins": 1.7372324466705322, + "rewards/rejected": -6.084217071533203, + "step": 1490 + }, + { + "epoch": 1.55, + "learning_rate": 2.6892201834862385e-07, + "logits/chosen": -1.5492124557495117, + "logits/rejected": -1.4746440649032593, + "logps/chosen": -301.6089782714844, + "logps/rejected": -268.57745361328125, + "loss": -1.1715, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.268374443054199, + "rewards/margins": 2.3135557174682617, + "rewards/rejected": -6.581930637359619, + "step": 1500 + }, + { + "epoch": 1.56, + "learning_rate": 2.6701070336391433e-07, + "logits/chosen": -1.6524032354354858, + "logits/rejected": -1.7148220539093018, + "logps/chosen": -358.600830078125, + "logps/rejected": -300.6563720703125, + "loss": -1.2609, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.343426704406738, + "rewards/margins": 2.409090280532837, + "rewards/rejected": -6.752516746520996, + "step": 1510 + }, + { + "epoch": 1.57, + "learning_rate": 2.6509938837920486e-07, + "logits/chosen": -1.6305720806121826, + "logits/rejected": -1.4456074237823486, + "logps/chosen": -276.3116455078125, + "logps/rejected": -281.0287780761719, + "loss": -1.2401, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.514157295227051, + "rewards/margins": 2.1482510566711426, + "rewards/rejected": -6.662408351898193, + "step": 1520 + }, + { + "epoch": 1.58, + "learning_rate": 2.631880733944954e-07, + "logits/chosen": -1.529931664466858, + "logits/rejected": -1.4439135789871216, + "logps/chosen": -299.05157470703125, + "logps/rejected": -275.3011474609375, + "loss": -1.5077, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.548555850982666, + "rewards/margins": 1.7041714191436768, + "rewards/rejected": -6.252727508544922, + "step": 1530 + }, + { + "epoch": 1.59, + "learning_rate": 2.612767584097859e-07, + "logits/chosen": -1.629601240158081, + "logits/rejected": -1.5165350437164307, + "logps/chosen": -306.07647705078125, + "logps/rejected": -262.46044921875, + "loss": -1.1474, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.669868469238281, + "rewards/margins": 1.6571537256240845, + "rewards/rejected": -6.327021598815918, + "step": 1540 + }, + { + "epoch": 1.6, + "learning_rate": 2.5936544342507646e-07, + "logits/chosen": -1.682499647140503, + "logits/rejected": -1.3376775979995728, + "logps/chosen": -307.9109802246094, + "logps/rejected": -296.7003479003906, + "loss": -1.174, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.40157413482666, + "rewards/margins": 2.6601130962371826, + "rewards/rejected": -7.061687469482422, + "step": 1550 + }, + { + "epoch": 1.61, + "learning_rate": 2.5745412844036693e-07, + "logits/chosen": -1.6591355800628662, + "logits/rejected": -1.5533663034439087, + "logps/chosen": -335.03057861328125, + "logps/rejected": -287.7520751953125, + "loss": -1.3877, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -5.175102710723877, + "rewards/margins": 2.255643844604492, + "rewards/rejected": -7.430747032165527, + "step": 1560 + }, + { + "epoch": 1.62, + "learning_rate": 2.5554281345565747e-07, + "logits/chosen": -1.6894855499267578, + "logits/rejected": -1.586287260055542, + "logps/chosen": -304.94989013671875, + "logps/rejected": -305.7507019042969, + "loss": -1.3946, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.844755172729492, + "rewards/margins": 2.5443403720855713, + "rewards/rejected": -7.389095306396484, + "step": 1570 + }, + { + "epoch": 1.63, + "learning_rate": 2.53631498470948e-07, + "logits/chosen": -1.593643069267273, + "logits/rejected": -1.5452083349227905, + "logps/chosen": -339.3976135253906, + "logps/rejected": -294.7111511230469, + "loss": -1.556, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -5.678981781005859, + "rewards/margins": 1.9970118999481201, + "rewards/rejected": -7.6759934425354, + "step": 1580 + }, + { + "epoch": 1.64, + "learning_rate": 2.5172018348623853e-07, + "logits/chosen": -1.5461888313293457, + "logits/rejected": -1.408332109451294, + "logps/chosen": -336.61553955078125, + "logps/rejected": -301.59759521484375, + "loss": -1.0556, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -5.15664005279541, + "rewards/margins": 2.157421827316284, + "rewards/rejected": -7.314061164855957, + "step": 1590 + }, + { + "epoch": 1.65, + "learning_rate": 2.4980886850152906e-07, + "logits/chosen": -1.5243330001831055, + "logits/rejected": -1.5529712438583374, + "logps/chosen": -333.6750793457031, + "logps/rejected": -288.2501525878906, + "loss": -1.1851, + "rewards/accuracies": 0.625, + "rewards/chosen": -5.502475738525391, + "rewards/margins": 1.9487812519073486, + "rewards/rejected": -7.451257228851318, + "step": 1600 + }, + { + "epoch": 1.66, + "learning_rate": 2.478975535168196e-07, + "logits/chosen": -1.5254369974136353, + "logits/rejected": -1.4977315664291382, + "logps/chosen": -316.21868896484375, + "logps/rejected": -277.3542175292969, + "loss": -1.6845, + "rewards/accuracies": 0.625, + "rewards/chosen": -5.898019790649414, + "rewards/margins": 1.867455244064331, + "rewards/rejected": -7.765474796295166, + "step": 1610 + }, + { + "epoch": 1.67, + "learning_rate": 2.459862385321101e-07, + "logits/chosen": -1.747807502746582, + "logits/rejected": -1.5742764472961426, + "logps/chosen": -369.40728759765625, + "logps/rejected": -318.35821533203125, + "loss": -1.0463, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -6.141237735748291, + "rewards/margins": 1.5390453338623047, + "rewards/rejected": -7.680283546447754, + "step": 1620 + }, + { + "epoch": 1.68, + "learning_rate": 2.440749235474006e-07, + "logits/chosen": -1.5898773670196533, + "logits/rejected": -1.5242546796798706, + "logps/chosen": -336.76824951171875, + "logps/rejected": -315.19384765625, + "loss": -1.7444, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.8129730224609375, + "rewards/margins": 3.9923481941223145, + "rewards/rejected": -8.80532169342041, + "step": 1630 + }, + { + "epoch": 1.69, + "learning_rate": 2.421636085626911e-07, + "logits/chosen": -1.4560226202011108, + "logits/rejected": -1.421790361404419, + "logps/chosen": -328.801025390625, + "logps/rejected": -280.44549560546875, + "loss": -1.3433, + "rewards/accuracies": 0.5625, + "rewards/chosen": -6.336007118225098, + "rewards/margins": 1.1562750339508057, + "rewards/rejected": -7.492282867431641, + "step": 1640 + }, + { + "epoch": 1.7, + "learning_rate": 2.402522935779816e-07, + "logits/chosen": -1.7103254795074463, + "logits/rejected": -1.5623626708984375, + "logps/chosen": -353.27081298828125, + "logps/rejected": -334.1893005371094, + "loss": -1.3197, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -6.708989143371582, + "rewards/margins": 1.9991058111190796, + "rewards/rejected": -8.708094596862793, + "step": 1650 + }, + { + "epoch": 1.71, + "learning_rate": 2.3834097859327215e-07, + "logits/chosen": -1.508737325668335, + "logits/rejected": -1.3252770900726318, + "logps/chosen": -288.32635498046875, + "logps/rejected": -296.4991455078125, + "loss": -1.3704, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -5.095849990844727, + "rewards/margins": 3.1136269569396973, + "rewards/rejected": -8.209476470947266, + "step": 1660 + }, + { + "epoch": 1.72, + "learning_rate": 2.3642966360856268e-07, + "logits/chosen": -1.6645679473876953, + "logits/rejected": -1.5162229537963867, + "logps/chosen": -322.9044189453125, + "logps/rejected": -300.4761047363281, + "loss": -1.9619, + "rewards/accuracies": 0.6875, + "rewards/chosen": -5.100916862487793, + "rewards/margins": 3.4188740253448486, + "rewards/rejected": -8.519791603088379, + "step": 1670 + }, + { + "epoch": 1.73, + "learning_rate": 2.345183486238532e-07, + "logits/chosen": -1.5683923959732056, + "logits/rejected": -1.4021813869476318, + "logps/chosen": -327.93804931640625, + "logps/rejected": -343.5317077636719, + "loss": -1.6055, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -7.430581569671631, + "rewards/margins": 1.1966545581817627, + "rewards/rejected": -8.62723445892334, + "step": 1680 + }, + { + "epoch": 1.74, + "learning_rate": 2.3260703363914372e-07, + "logits/chosen": -1.631715178489685, + "logits/rejected": -1.5181890726089478, + "logps/chosen": -370.483642578125, + "logps/rejected": -336.6531677246094, + "loss": -1.8198, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -6.040884494781494, + "rewards/margins": 3.5697269439697266, + "rewards/rejected": -9.610611915588379, + "step": 1690 + }, + { + "epoch": 1.75, + "learning_rate": 2.3069571865443425e-07, + "logits/chosen": -1.4669805765151978, + "logits/rejected": -1.3994741439819336, + "logps/chosen": -295.02496337890625, + "logps/rejected": -321.65814208984375, + "loss": -2.0272, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -6.588179588317871, + "rewards/margins": 2.978868007659912, + "rewards/rejected": -9.567047119140625, + "step": 1700 + }, + { + "epoch": 1.76, + "learning_rate": 2.2878440366972476e-07, + "logits/chosen": -1.6681219339370728, + "logits/rejected": -1.491996169090271, + "logps/chosen": -338.78729248046875, + "logps/rejected": -374.81634521484375, + "loss": -1.8513, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -6.465402126312256, + "rewards/margins": 2.903998851776123, + "rewards/rejected": -9.369401931762695, + "step": 1710 + }, + { + "epoch": 1.78, + "learning_rate": 2.268730886850153e-07, + "logits/chosen": -1.4729211330413818, + "logits/rejected": -1.1880760192871094, + "logps/chosen": -334.2304992675781, + "logps/rejected": -305.21368408203125, + "loss": -2.0159, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -7.080233573913574, + "rewards/margins": 2.117523431777954, + "rewards/rejected": -9.197757720947266, + "step": 1720 + }, + { + "epoch": 1.79, + "learning_rate": 2.249617737003058e-07, + "logits/chosen": -1.563912034034729, + "logits/rejected": -1.5858089923858643, + "logps/chosen": -378.62396240234375, + "logps/rejected": -377.3148193359375, + "loss": -1.9999, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -8.893011093139648, + "rewards/margins": 2.892535924911499, + "rewards/rejected": -11.785547256469727, + "step": 1730 + }, + { + "epoch": 1.8, + "learning_rate": 2.2305045871559633e-07, + "logits/chosen": -1.392828345298767, + "logits/rejected": -1.3615916967391968, + "logps/chosen": -361.71051025390625, + "logps/rejected": -335.33673095703125, + "loss": -2.324, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -8.41543197631836, + "rewards/margins": 2.103468418121338, + "rewards/rejected": -10.518899917602539, + "step": 1740 + }, + { + "epoch": 1.81, + "learning_rate": 2.2113914373088686e-07, + "logits/chosen": -1.4968210458755493, + "logits/rejected": -1.4618072509765625, + "logps/chosen": -382.20367431640625, + "logps/rejected": -378.17901611328125, + "loss": -2.2244, + "rewards/accuracies": 0.6875, + "rewards/chosen": -7.971160888671875, + "rewards/margins": 4.1811017990112305, + "rewards/rejected": -12.152261734008789, + "step": 1750 + }, + { + "epoch": 1.82, + "learning_rate": 2.1922782874617736e-07, + "logits/chosen": -1.37467360496521, + "logits/rejected": -1.3709341287612915, + "logps/chosen": -313.5454406738281, + "logps/rejected": -367.4576721191406, + "loss": -1.9016, + "rewards/accuracies": 0.6875, + "rewards/chosen": -8.650922775268555, + "rewards/margins": 2.388150691986084, + "rewards/rejected": -11.039073944091797, + "step": 1760 + }, + { + "epoch": 1.83, + "learning_rate": 2.1731651376146787e-07, + "logits/chosen": -1.4017971754074097, + "logits/rejected": -1.2825329303741455, + "logps/chosen": -311.17572021484375, + "logps/rejected": -345.6313171386719, + "loss": -1.8086, + "rewards/accuracies": 0.6875, + "rewards/chosen": -8.644105911254883, + "rewards/margins": 3.1263985633850098, + "rewards/rejected": -11.77050495147705, + "step": 1770 + }, + { + "epoch": 1.84, + "learning_rate": 2.154051987767584e-07, + "logits/chosen": -1.5087705850601196, + "logits/rejected": -1.3651801347732544, + "logps/chosen": -351.78582763671875, + "logps/rejected": -337.21295166015625, + "loss": -1.5194, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -8.838014602661133, + "rewards/margins": 3.8112175464630127, + "rewards/rejected": -12.64923095703125, + "step": 1780 + }, + { + "epoch": 1.85, + "learning_rate": 2.134938837920489e-07, + "logits/chosen": -1.2244422435760498, + "logits/rejected": -1.352123498916626, + "logps/chosen": -281.91949462890625, + "logps/rejected": -307.07867431640625, + "loss": -1.8547, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -8.937861442565918, + "rewards/margins": 3.2620761394500732, + "rewards/rejected": -12.19993782043457, + "step": 1790 + }, + { + "epoch": 1.86, + "learning_rate": 2.1158256880733944e-07, + "logits/chosen": -1.3861466646194458, + "logits/rejected": -1.3692476749420166, + "logps/chosen": -379.1508483886719, + "logps/rejected": -348.1292724609375, + "loss": -2.9603, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -8.26054859161377, + "rewards/margins": 5.145617485046387, + "rewards/rejected": -13.406166076660156, + "step": 1800 + }, + { + "epoch": 1.87, + "learning_rate": 2.0967125382262994e-07, + "logits/chosen": -1.2910804748535156, + "logits/rejected": -1.3471043109893799, + "logps/chosen": -359.8387451171875, + "logps/rejected": -327.9253845214844, + "loss": -1.6607, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -9.164748191833496, + "rewards/margins": 1.6195499897003174, + "rewards/rejected": -10.784297943115234, + "step": 1810 + }, + { + "epoch": 1.88, + "learning_rate": 2.0775993883792048e-07, + "logits/chosen": -1.4032940864562988, + "logits/rejected": -1.253796100616455, + "logps/chosen": -357.06549072265625, + "logps/rejected": -318.9412536621094, + "loss": -2.9627, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -7.581735134124756, + "rewards/margins": 3.610663652420044, + "rewards/rejected": -11.192397117614746, + "step": 1820 + }, + { + "epoch": 1.89, + "learning_rate": 2.05848623853211e-07, + "logits/chosen": -1.4505128860473633, + "logits/rejected": -1.4566118717193604, + "logps/chosen": -400.2586975097656, + "logps/rejected": -357.13458251953125, + "loss": -2.4791, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -10.725576400756836, + "rewards/margins": 2.9340341091156006, + "rewards/rejected": -13.6596097946167, + "step": 1830 + }, + { + "epoch": 1.9, + "learning_rate": 2.0393730886850151e-07, + "logits/chosen": -1.4870072603225708, + "logits/rejected": -1.2949211597442627, + "logps/chosen": -384.02947998046875, + "logps/rejected": -346.6249084472656, + "loss": -1.9201, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -9.375011444091797, + "rewards/margins": 3.3913631439208984, + "rewards/rejected": -12.766374588012695, + "step": 1840 + }, + { + "epoch": 1.91, + "learning_rate": 2.0202599388379205e-07, + "logits/chosen": -1.5045329332351685, + "logits/rejected": -1.3934712409973145, + "logps/chosen": -375.15155029296875, + "logps/rejected": -420.40283203125, + "loss": -2.6807, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -10.547384262084961, + "rewards/margins": 4.163573265075684, + "rewards/rejected": -14.710957527160645, + "step": 1850 + }, + { + "epoch": 1.92, + "learning_rate": 2.0011467889908258e-07, + "logits/chosen": -1.5281354188919067, + "logits/rejected": -1.2871553897857666, + "logps/chosen": -395.02471923828125, + "logps/rejected": -297.695068359375, + "loss": -2.9446, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -10.007207870483398, + "rewards/margins": 2.725371837615967, + "rewards/rejected": -12.732580184936523, + "step": 1860 + }, + { + "epoch": 1.93, + "learning_rate": 1.9820336391437308e-07, + "logits/chosen": -1.310064673423767, + "logits/rejected": -1.3744899034500122, + "logps/chosen": -344.2419738769531, + "logps/rejected": -346.10430908203125, + "loss": -2.1029, + "rewards/accuracies": 0.6875, + "rewards/chosen": -9.30712604522705, + "rewards/margins": 4.580069541931152, + "rewards/rejected": -13.88719654083252, + "step": 1870 + }, + { + "epoch": 1.94, + "learning_rate": 1.9629204892966362e-07, + "logits/chosen": -1.4468410015106201, + "logits/rejected": -1.3818161487579346, + "logps/chosen": -382.7313232421875, + "logps/rejected": -351.123291015625, + "loss": -3.6235, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -10.61854362487793, + "rewards/margins": 3.927306652069092, + "rewards/rejected": -14.545849800109863, + "step": 1880 + }, + { + "epoch": 1.95, + "learning_rate": 1.943807339449541e-07, + "logits/chosen": -1.280505895614624, + "logits/rejected": -1.126263976097107, + "logps/chosen": -361.83258056640625, + "logps/rejected": -395.9088439941406, + "loss": -2.9208, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -10.34544849395752, + "rewards/margins": 5.857726097106934, + "rewards/rejected": -16.20317268371582, + "step": 1890 + }, + { + "epoch": 1.96, + "learning_rate": 1.9246941896024463e-07, + "logits/chosen": -1.300330400466919, + "logits/rejected": -1.2540452480316162, + "logps/chosen": -374.8466796875, + "logps/rejected": -399.75933837890625, + "loss": -2.3092, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -12.081557273864746, + "rewards/margins": 3.7616398334503174, + "rewards/rejected": -15.8431978225708, + "step": 1900 + }, + { + "epoch": 1.97, + "learning_rate": 1.9055810397553516e-07, + "logits/chosen": -1.3129085302352905, + "logits/rejected": -1.266152262687683, + "logps/chosen": -376.607666015625, + "logps/rejected": -378.1551513671875, + "loss": -2.5453, + "rewards/accuracies": 0.625, + "rewards/chosen": -12.030994415283203, + "rewards/margins": 3.7824196815490723, + "rewards/rejected": -15.813413619995117, + "step": 1910 + }, + { + "epoch": 1.98, + "learning_rate": 1.8864678899082566e-07, + "logits/chosen": -1.3077343702316284, + "logits/rejected": -1.3403010368347168, + "logps/chosen": -395.0058288574219, + "logps/rejected": -348.19683837890625, + "loss": -2.6208, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -12.53403091430664, + "rewards/margins": 2.729905605316162, + "rewards/rejected": -15.263936042785645, + "step": 1920 + }, + { + "epoch": 1.99, + "learning_rate": 1.867354740061162e-07, + "logits/chosen": -1.2000598907470703, + "logits/rejected": -1.1860705614089966, + "logps/chosen": -377.95538330078125, + "logps/rejected": -394.8722229003906, + "loss": -2.3739, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -12.864456176757812, + "rewards/margins": 4.33025598526001, + "rewards/rejected": -17.194711685180664, + "step": 1930 + }, + { + "epoch": 2.0, + "eval_logits/chosen": -1.5908974409103394, + "eval_logits/rejected": -1.4396800994873047, + "eval_logps/chosen": -418.98797607421875, + "eval_logps/rejected": -413.8171691894531, + "eval_loss": -3.0139997005462646, + "eval_rewards/accuracies": 0.658730149269104, + "eval_rewards/chosen": -12.91853141784668, + "eval_rewards/margins": 4.969918251037598, + "eval_rewards/rejected": -17.888450622558594, + "eval_runtime": 237.8807, + "eval_samples_per_second": 8.408, + "eval_steps_per_second": 0.265, + "step": 1938 + }, + { + "epoch": 2.0, + "learning_rate": 1.8482415902140673e-07, + "logits/chosen": -1.3271663188934326, + "logits/rejected": -1.3011524677276611, + "logps/chosen": -392.82757568359375, + "logps/rejected": -392.5997314453125, + "loss": -2.8487, + "rewards/accuracies": 0.625, + "rewards/chosen": -13.525858879089355, + "rewards/margins": 4.043676376342773, + "rewards/rejected": -17.569536209106445, + "step": 1940 + }, + { + "epoch": 2.01, + "learning_rate": 1.8291284403669723e-07, + "logits/chosen": -1.2705609798431396, + "logits/rejected": -1.3038126230239868, + "logps/chosen": -373.14031982421875, + "logps/rejected": -386.13677978515625, + "loss": -3.4669, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -12.79985237121582, + "rewards/margins": 3.9696388244628906, + "rewards/rejected": -16.76949119567871, + "step": 1950 + }, + { + "epoch": 2.02, + "learning_rate": 1.8100152905198777e-07, + "logits/chosen": -1.061727523803711, + "logits/rejected": -1.110812783241272, + "logps/chosen": -381.92529296875, + "logps/rejected": -418.325927734375, + "loss": -2.5351, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -12.154891967773438, + "rewards/margins": 2.9344310760498047, + "rewards/rejected": -15.089323043823242, + "step": 1960 + }, + { + "epoch": 2.03, + "learning_rate": 1.7909021406727827e-07, + "logits/chosen": -1.107367992401123, + "logits/rejected": -1.453131914138794, + "logps/chosen": -374.98504638671875, + "logps/rejected": -365.43121337890625, + "loss": -3.2083, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -11.676986694335938, + "rewards/margins": 4.082896709442139, + "rewards/rejected": -15.75988483428955, + "step": 1970 + }, + { + "epoch": 2.04, + "learning_rate": 1.771788990825688e-07, + "logits/chosen": -1.3608293533325195, + "logits/rejected": -1.06710684299469, + "logps/chosen": -423.4046325683594, + "logps/rejected": -425.17681884765625, + "loss": -2.8719, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -14.271319389343262, + "rewards/margins": 4.651618957519531, + "rewards/rejected": -18.92293930053711, + "step": 1980 + }, + { + "epoch": 2.05, + "learning_rate": 1.7526758409785934e-07, + "logits/chosen": -1.4061094522476196, + "logits/rejected": -1.2359219789505005, + "logps/chosen": -473.62841796875, + "logps/rejected": -462.7413024902344, + "loss": -3.7745, + "rewards/accuracies": 0.5625, + "rewards/chosen": -17.31951332092285, + "rewards/margins": 4.680100440979004, + "rewards/rejected": -21.99961280822754, + "step": 1990 + }, + { + "epoch": 2.06, + "learning_rate": 1.7335626911314984e-07, + "logits/chosen": -1.2134959697723389, + "logits/rejected": -0.9500266909599304, + "logps/chosen": -372.77880859375, + "logps/rejected": -378.95703125, + "loss": -3.7006, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -15.280550003051758, + "rewards/margins": 3.6492176055908203, + "rewards/rejected": -18.929767608642578, + "step": 2000 + }, + { + "epoch": 2.07, + "learning_rate": 1.7144495412844037e-07, + "logits/chosen": -1.3106526136398315, + "logits/rejected": -1.1043154001235962, + "logps/chosen": -463.447021484375, + "logps/rejected": -458.7850036621094, + "loss": -2.2585, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -16.96882438659668, + "rewards/margins": 5.767825603485107, + "rewards/rejected": -22.736652374267578, + "step": 2010 + }, + { + "epoch": 2.08, + "learning_rate": 1.6953363914373088e-07, + "logits/chosen": -1.373578667640686, + "logits/rejected": -1.0678809881210327, + "logps/chosen": -457.8382873535156, + "logps/rejected": -414.07177734375, + "loss": -2.567, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -16.41115951538086, + "rewards/margins": 3.8120930194854736, + "rewards/rejected": -20.223255157470703, + "step": 2020 + }, + { + "epoch": 2.09, + "learning_rate": 1.6762232415902138e-07, + "logits/chosen": -1.1989551782608032, + "logits/rejected": -1.2645841836929321, + "logps/chosen": -400.55767822265625, + "logps/rejected": -395.439208984375, + "loss": -2.5975, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -14.613690376281738, + "rewards/margins": 3.9228732585906982, + "rewards/rejected": -18.536563873291016, + "step": 2030 + }, + { + "epoch": 2.11, + "learning_rate": 1.6571100917431192e-07, + "logits/chosen": -1.3490411043167114, + "logits/rejected": -1.1970597505569458, + "logps/chosen": -399.5980224609375, + "logps/rejected": -373.1912841796875, + "loss": -2.902, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -13.886739730834961, + "rewards/margins": 3.5066256523132324, + "rewards/rejected": -17.39336585998535, + "step": 2040 + }, + { + "epoch": 2.12, + "learning_rate": 1.6379969418960242e-07, + "logits/chosen": -1.2504427433013916, + "logits/rejected": -1.1676826477050781, + "logps/chosen": -489.55914306640625, + "logps/rejected": -456.7439880371094, + "loss": -3.6402, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -17.439693450927734, + "rewards/margins": 2.561140537261963, + "rewards/rejected": -20.000835418701172, + "step": 2050 + }, + { + "epoch": 2.13, + "learning_rate": 1.6188837920489295e-07, + "logits/chosen": -1.4094626903533936, + "logits/rejected": -0.9949380159378052, + "logps/chosen": -461.00286865234375, + "logps/rejected": -403.776123046875, + "loss": -3.295, + "rewards/accuracies": 0.5625, + "rewards/chosen": -16.868534088134766, + "rewards/margins": 2.4951887130737305, + "rewards/rejected": -19.36372184753418, + "step": 2060 + }, + { + "epoch": 2.14, + "learning_rate": 1.5997706422018349e-07, + "logits/chosen": -1.5532751083374023, + "logits/rejected": -1.1935482025146484, + "logps/chosen": -410.2151794433594, + "logps/rejected": -497.067626953125, + "loss": -3.6557, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -14.347163200378418, + "rewards/margins": 6.878281593322754, + "rewards/rejected": -21.225446701049805, + "step": 2070 + }, + { + "epoch": 2.15, + "learning_rate": 1.58065749235474e-07, + "logits/chosen": -1.2084702253341675, + "logits/rejected": -1.1066341400146484, + "logps/chosen": -507.550537109375, + "logps/rejected": -450.25341796875, + "loss": -3.175, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -17.20359992980957, + "rewards/margins": 2.5353291034698486, + "rewards/rejected": -19.738927841186523, + "step": 2080 + }, + { + "epoch": 2.16, + "learning_rate": 1.5615443425076452e-07, + "logits/chosen": -1.0724289417266846, + "logits/rejected": -1.0922993421554565, + "logps/chosen": -402.9586486816406, + "logps/rejected": -448.19830322265625, + "loss": -3.6034, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -14.504638671875, + "rewards/margins": 8.439682006835938, + "rewards/rejected": -22.944318771362305, + "step": 2090 + }, + { + "epoch": 2.17, + "learning_rate": 1.5424311926605506e-07, + "logits/chosen": -0.9919137954711914, + "logits/rejected": -0.9210994839668274, + "logps/chosen": -369.6788024902344, + "logps/rejected": -405.4874572753906, + "loss": -4.0197, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -14.22148323059082, + "rewards/margins": 8.295201301574707, + "rewards/rejected": -22.51668357849121, + "step": 2100 + }, + { + "epoch": 2.18, + "learning_rate": 1.5233180428134556e-07, + "logits/chosen": -1.2452681064605713, + "logits/rejected": -1.1532270908355713, + "logps/chosen": -423.7035217285156, + "logps/rejected": -444.843017578125, + "loss": -4.0177, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -15.260812759399414, + "rewards/margins": 7.597034454345703, + "rewards/rejected": -22.857845306396484, + "step": 2110 + }, + { + "epoch": 2.19, + "learning_rate": 1.504204892966361e-07, + "logits/chosen": -1.2690461874008179, + "logits/rejected": -1.036007046699524, + "logps/chosen": -481.8981018066406, + "logps/rejected": -458.9730529785156, + "loss": -4.7473, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -18.875347137451172, + "rewards/margins": 3.536944627761841, + "rewards/rejected": -22.412288665771484, + "step": 2120 + }, + { + "epoch": 2.2, + "learning_rate": 1.485091743119266e-07, + "logits/chosen": -1.2708927392959595, + "logits/rejected": -0.9568039178848267, + "logps/chosen": -407.5409851074219, + "logps/rejected": -469.5133361816406, + "loss": -4.6317, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -15.849174499511719, + "rewards/margins": 7.314669609069824, + "rewards/rejected": -23.16384506225586, + "step": 2130 + }, + { + "epoch": 2.21, + "learning_rate": 1.465978593272171e-07, + "logits/chosen": -0.9912996292114258, + "logits/rejected": -0.8085284233093262, + "logps/chosen": -353.92413330078125, + "logps/rejected": -355.982666015625, + "loss": -4.0689, + "rewards/accuracies": 0.625, + "rewards/chosen": -15.852828979492188, + "rewards/margins": 5.281838417053223, + "rewards/rejected": -21.134668350219727, + "step": 2140 + }, + { + "epoch": 2.22, + "learning_rate": 1.4468654434250764e-07, + "logits/chosen": -1.2438325881958008, + "logits/rejected": -1.1221582889556885, + "logps/chosen": -519.1368408203125, + "logps/rejected": -489.5020446777344, + "loss": -2.5507, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.783428192138672, + "rewards/margins": 1.9117088317871094, + "rewards/rejected": -21.69513702392578, + "step": 2150 + }, + { + "epoch": 2.23, + "learning_rate": 1.4277522935779814e-07, + "logits/chosen": -1.0609955787658691, + "logits/rejected": -1.2507171630859375, + "logps/chosen": -440.3553161621094, + "logps/rejected": -453.50323486328125, + "loss": -3.4898, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -16.776865005493164, + "rewards/margins": 4.883880615234375, + "rewards/rejected": -21.660743713378906, + "step": 2160 + }, + { + "epoch": 2.24, + "learning_rate": 1.4086391437308867e-07, + "logits/chosen": -1.274261474609375, + "logits/rejected": -1.1783298254013062, + "logps/chosen": -472.8345642089844, + "logps/rejected": -521.444580078125, + "loss": -4.8646, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -17.88382339477539, + "rewards/margins": 8.282121658325195, + "rewards/rejected": -26.165943145751953, + "step": 2170 + }, + { + "epoch": 2.25, + "learning_rate": 1.389525993883792e-07, + "logits/chosen": -1.1477124691009521, + "logits/rejected": -0.8995282053947449, + "logps/chosen": -398.76043701171875, + "logps/rejected": -418.26507568359375, + "loss": -3.7138, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -17.033363342285156, + "rewards/margins": 5.318907737731934, + "rewards/rejected": -22.352272033691406, + "step": 2180 + }, + { + "epoch": 2.26, + "learning_rate": 1.370412844036697e-07, + "logits/chosen": -1.067368507385254, + "logits/rejected": -1.2181618213653564, + "logps/chosen": -437.30010986328125, + "logps/rejected": -469.89056396484375, + "loss": -3.5077, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -19.17410659790039, + "rewards/margins": 4.917706489562988, + "rewards/rejected": -24.091812133789062, + "step": 2190 + }, + { + "epoch": 2.27, + "learning_rate": 1.3512996941896024e-07, + "logits/chosen": -1.0004960298538208, + "logits/rejected": -0.8722925186157227, + "logps/chosen": -415.56719970703125, + "logps/rejected": -430.51934814453125, + "loss": -4.4207, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -19.313739776611328, + "rewards/margins": 4.984239101409912, + "rewards/rejected": -24.297977447509766, + "step": 2200 + }, + { + "epoch": 2.28, + "learning_rate": 1.3321865443425075e-07, + "logits/chosen": -1.0728847980499268, + "logits/rejected": -0.7316335439682007, + "logps/chosen": -489.6312561035156, + "logps/rejected": -451.337158203125, + "loss": -4.1332, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -22.230623245239258, + "rewards/margins": 0.4879101812839508, + "rewards/rejected": -22.71853256225586, + "step": 2210 + }, + { + "epoch": 2.29, + "learning_rate": 1.3130733944954128e-07, + "logits/chosen": -1.31141197681427, + "logits/rejected": -1.0552115440368652, + "logps/chosen": -473.9842834472656, + "logps/rejected": -515.6341552734375, + "loss": -3.8823, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -18.287981033325195, + "rewards/margins": 8.8409423828125, + "rewards/rejected": -27.128925323486328, + "step": 2220 + }, + { + "epoch": 2.3, + "learning_rate": 1.293960244648318e-07, + "logits/chosen": -1.1148552894592285, + "logits/rejected": -1.010851502418518, + "logps/chosen": -501.09588623046875, + "logps/rejected": -478.259033203125, + "loss": -4.2715, + "rewards/accuracies": 0.625, + "rewards/chosen": -20.409494400024414, + "rewards/margins": 5.208123683929443, + "rewards/rejected": -25.617618560791016, + "step": 2230 + }, + { + "epoch": 2.31, + "learning_rate": 1.2748470948012232e-07, + "logits/chosen": -1.1082009077072144, + "logits/rejected": -0.9660416841506958, + "logps/chosen": -469.67169189453125, + "logps/rejected": -531.001708984375, + "loss": -5.0927, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -20.520383834838867, + "rewards/margins": 7.39895486831665, + "rewards/rejected": -27.91933822631836, + "step": 2240 + }, + { + "epoch": 2.32, + "learning_rate": 1.2557339449541285e-07, + "logits/chosen": -1.1804434061050415, + "logits/rejected": -1.013977289199829, + "logps/chosen": -464.8408203125, + "logps/rejected": -479.40301513671875, + "loss": -5.3276, + "rewards/accuracies": 0.5625, + "rewards/chosen": -18.352712631225586, + "rewards/margins": 6.4782609939575195, + "rewards/rejected": -24.830974578857422, + "step": 2250 + }, + { + "epoch": 2.33, + "learning_rate": 1.2366207951070336e-07, + "logits/chosen": -1.1036585569381714, + "logits/rejected": -1.1067498922348022, + "logps/chosen": -482.696533203125, + "logps/rejected": -480.6744689941406, + "loss": -3.2585, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -21.47313117980957, + "rewards/margins": 3.0878939628601074, + "rewards/rejected": -24.561023712158203, + "step": 2260 + }, + { + "epoch": 2.34, + "learning_rate": 1.217507645259939e-07, + "logits/chosen": -1.1785879135131836, + "logits/rejected": -0.8433502912521362, + "logps/chosen": -448.43017578125, + "logps/rejected": -455.5037536621094, + "loss": -3.4348, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -17.98015594482422, + "rewards/margins": 5.601017475128174, + "rewards/rejected": -23.581167221069336, + "step": 2270 + }, + { + "epoch": 2.35, + "learning_rate": 1.198394495412844e-07, + "logits/chosen": -0.9618834257125854, + "logits/rejected": -0.9488929510116577, + "logps/chosen": -486.79248046875, + "logps/rejected": -497.04498291015625, + "loss": -5.1327, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -20.99383544921875, + "rewards/margins": 5.7790398597717285, + "rewards/rejected": -26.772876739501953, + "step": 2280 + }, + { + "epoch": 2.36, + "learning_rate": 1.1792813455657493e-07, + "logits/chosen": -1.175091028213501, + "logits/rejected": -0.9602692723274231, + "logps/chosen": -455.3914489746094, + "logps/rejected": -423.8038024902344, + "loss": -5.0517, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -19.302087783813477, + "rewards/margins": 2.1102347373962402, + "rewards/rejected": -21.412322998046875, + "step": 2290 + }, + { + "epoch": 2.37, + "learning_rate": 1.1601681957186543e-07, + "logits/chosen": -1.0566465854644775, + "logits/rejected": -0.9404910802841187, + "logps/chosen": -462.96478271484375, + "logps/rejected": -497.4542541503906, + "loss": -5.7946, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -18.82391357421875, + "rewards/margins": 9.018692016601562, + "rewards/rejected": -27.842605590820312, + "step": 2300 + }, + { + "epoch": 2.38, + "learning_rate": 1.1410550458715595e-07, + "logits/chosen": -0.9795541763305664, + "logits/rejected": -1.0114076137542725, + "logps/chosen": -471.488037109375, + "logps/rejected": -525.1002807617188, + "loss": -5.3356, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.668289184570312, + "rewards/margins": 7.146371364593506, + "rewards/rejected": -28.814661026000977, + "step": 2310 + }, + { + "epoch": 2.39, + "learning_rate": 1.1219418960244648e-07, + "logits/chosen": -1.0043725967407227, + "logits/rejected": -0.814845085144043, + "logps/chosen": -471.0302734375, + "logps/rejected": -553.7667846679688, + "loss": -7.8293, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.013063430786133, + "rewards/margins": 5.8318376541137695, + "rewards/rejected": -30.84490394592285, + "step": 2320 + }, + { + "epoch": 2.4, + "learning_rate": 1.10282874617737e-07, + "logits/chosen": -1.04874587059021, + "logits/rejected": -0.8525232076644897, + "logps/chosen": -503.905517578125, + "logps/rejected": -522.6575927734375, + "loss": -5.5289, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -27.28557777404785, + "rewards/margins": 4.0987653732299805, + "rewards/rejected": -31.38434410095215, + "step": 2330 + }, + { + "epoch": 2.41, + "learning_rate": 1.0837155963302752e-07, + "logits/chosen": -0.9908869862556458, + "logits/rejected": -0.810786247253418, + "logps/chosen": -424.4300842285156, + "logps/rejected": -541.6786499023438, + "loss": -4.8461, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -21.23228645324707, + "rewards/margins": 9.21428394317627, + "rewards/rejected": -30.446569442749023, + "step": 2340 + }, + { + "epoch": 2.43, + "learning_rate": 1.0646024464831804e-07, + "logits/chosen": -0.9769414067268372, + "logits/rejected": -0.7807801961898804, + "logps/chosen": -508.21539306640625, + "logps/rejected": -505.65118408203125, + "loss": -4.3914, + "rewards/accuracies": 0.5625, + "rewards/chosen": -23.276477813720703, + "rewards/margins": 5.2235870361328125, + "rewards/rejected": -28.500064849853516, + "step": 2350 + }, + { + "epoch": 2.44, + "learning_rate": 1.0454892966360856e-07, + "logits/chosen": -0.9564868211746216, + "logits/rejected": -0.8420296907424927, + "logps/chosen": -462.8448791503906, + "logps/rejected": -527.154541015625, + "loss": -4.464, + "rewards/accuracies": 0.625, + "rewards/chosen": -20.870206832885742, + "rewards/margins": 10.262295722961426, + "rewards/rejected": -31.13250160217285, + "step": 2360 + }, + { + "epoch": 2.45, + "learning_rate": 1.0263761467889908e-07, + "logits/chosen": -0.8812466859817505, + "logits/rejected": -0.8560865521430969, + "logps/chosen": -485.3046875, + "logps/rejected": -493.06463623046875, + "loss": -4.3681, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -22.31712532043457, + "rewards/margins": 3.8712782859802246, + "rewards/rejected": -26.188400268554688, + "step": 2370 + }, + { + "epoch": 2.46, + "learning_rate": 1.007262996941896e-07, + "logits/chosen": -0.9851228594779968, + "logits/rejected": -0.6262848973274231, + "logps/chosen": -522.0337524414062, + "logps/rejected": -517.8556518554688, + "loss": -3.7686, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -26.0426025390625, + "rewards/margins": 6.041103839874268, + "rewards/rejected": -32.083702087402344, + "step": 2380 + }, + { + "epoch": 2.47, + "learning_rate": 9.881498470948011e-08, + "logits/chosen": -0.9182702898979187, + "logits/rejected": -0.8262530565261841, + "logps/chosen": -526.509521484375, + "logps/rejected": -501.44390869140625, + "loss": -4.2044, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.09255599975586, + "rewards/margins": 4.793452739715576, + "rewards/rejected": -29.88600730895996, + "step": 2390 + }, + { + "epoch": 2.48, + "learning_rate": 9.690366972477065e-08, + "logits/chosen": -1.020527958869934, + "logits/rejected": -0.9827292561531067, + "logps/chosen": -535.2392578125, + "logps/rejected": -474.393310546875, + "loss": -4.1335, + "rewards/accuracies": 0.4375, + "rewards/chosen": -26.45871925354004, + "rewards/margins": -0.03600626066327095, + "rewards/rejected": -26.42270851135254, + "step": 2400 + }, + { + "epoch": 2.49, + "learning_rate": 9.499235474006116e-08, + "logits/chosen": -1.0558453798294067, + "logits/rejected": -0.9320958256721497, + "logps/chosen": -537.9478759765625, + "logps/rejected": -526.4865112304688, + "loss": -5.638, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.561458587646484, + "rewards/margins": 8.904586791992188, + "rewards/rejected": -31.466039657592773, + "step": 2410 + }, + { + "epoch": 2.5, + "learning_rate": 9.308103975535168e-08, + "logits/chosen": -0.9652633666992188, + "logits/rejected": -0.9594426155090332, + "logps/chosen": -547.8624877929688, + "logps/rejected": -563.6146240234375, + "loss": -5.9731, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -25.674243927001953, + "rewards/margins": 5.594969749450684, + "rewards/rejected": -31.269210815429688, + "step": 2420 + }, + { + "epoch": 2.51, + "learning_rate": 9.116972477064219e-08, + "logits/chosen": -0.9045840501785278, + "logits/rejected": -0.9294489622116089, + "logps/chosen": -545.46923828125, + "logps/rejected": -560.5089111328125, + "loss": -4.2549, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -26.388031005859375, + "rewards/margins": 3.495013475418091, + "rewards/rejected": -29.883047103881836, + "step": 2430 + }, + { + "epoch": 2.52, + "learning_rate": 8.925840978593272e-08, + "logits/chosen": -1.0906932353973389, + "logits/rejected": -0.8586239814758301, + "logps/chosen": -426.898193359375, + "logps/rejected": -520.27099609375, + "loss": -5.0199, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -21.479185104370117, + "rewards/margins": 6.4797844886779785, + "rewards/rejected": -27.958969116210938, + "step": 2440 + }, + { + "epoch": 2.53, + "learning_rate": 8.734709480122324e-08, + "logits/chosen": -0.8275815844535828, + "logits/rejected": -0.8221632242202759, + "logps/chosen": -514.3942260742188, + "logps/rejected": -599.9034423828125, + "loss": -5.2968, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -26.718358993530273, + "rewards/margins": 10.353950500488281, + "rewards/rejected": -37.07230758666992, + "step": 2450 + }, + { + "epoch": 2.54, + "learning_rate": 8.543577981651376e-08, + "logits/chosen": -1.0432653427124023, + "logits/rejected": -1.0401585102081299, + "logps/chosen": -591.7424926757812, + "logps/rejected": -566.1804809570312, + "loss": -5.6217, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -23.220060348510742, + "rewards/margins": 7.772289276123047, + "rewards/rejected": -30.99234962463379, + "step": 2460 + }, + { + "epoch": 2.55, + "learning_rate": 8.352446483180428e-08, + "logits/chosen": -0.9903294444084167, + "logits/rejected": -0.9184169769287109, + "logps/chosen": -531.7313842773438, + "logps/rejected": -534.6137084960938, + "loss": -3.808, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -26.629840850830078, + "rewards/margins": 6.476201057434082, + "rewards/rejected": -33.106040954589844, + "step": 2470 + }, + { + "epoch": 2.56, + "learning_rate": 8.161314984709481e-08, + "logits/chosen": -1.09735107421875, + "logits/rejected": -0.9943147897720337, + "logps/chosen": -521.5565795898438, + "logps/rejected": -527.12744140625, + "loss": -4.6607, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -24.24032211303711, + "rewards/margins": 6.096768379211426, + "rewards/rejected": -30.33709144592285, + "step": 2480 + }, + { + "epoch": 2.57, + "learning_rate": 7.970183486238531e-08, + "logits/chosen": -0.9941670298576355, + "logits/rejected": -0.8651212453842163, + "logps/chosen": -524.2615356445312, + "logps/rejected": -511.6361389160156, + "loss": -5.4135, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -26.943328857421875, + "rewards/margins": 3.2720611095428467, + "rewards/rejected": -30.21539306640625, + "step": 2490 + }, + { + "epoch": 2.58, + "learning_rate": 7.779051987767583e-08, + "logits/chosen": -1.145651936531067, + "logits/rejected": -1.0183385610580444, + "logps/chosen": -548.2122192382812, + "logps/rejected": -548.8924560546875, + "loss": -7.1154, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -24.76950454711914, + "rewards/margins": 7.515495300292969, + "rewards/rejected": -32.284996032714844, + "step": 2500 + }, + { + "epoch": 2.59, + "learning_rate": 7.587920489296635e-08, + "logits/chosen": -1.0196164846420288, + "logits/rejected": -1.0715177059173584, + "logps/chosen": -547.984375, + "logps/rejected": -582.05126953125, + "loss": -6.2112, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -26.091842651367188, + "rewards/margins": 9.326883316040039, + "rewards/rejected": -35.41872787475586, + "step": 2510 + }, + { + "epoch": 2.6, + "learning_rate": 7.396788990825688e-08, + "logits/chosen": -1.106105923652649, + "logits/rejected": -0.9713128209114075, + "logps/chosen": -485.42083740234375, + "logps/rejected": -594.6617431640625, + "loss": -6.3354, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.46695327758789, + "rewards/margins": 16.13218879699707, + "rewards/rejected": -38.599143981933594, + "step": 2520 + }, + { + "epoch": 2.61, + "learning_rate": 7.20565749235474e-08, + "logits/chosen": -0.9968252182006836, + "logits/rejected": -0.8192588090896606, + "logps/chosen": -515.4625854492188, + "logps/rejected": -511.76849365234375, + "loss": -5.7957, + "rewards/accuracies": 0.5625, + "rewards/chosen": -26.7257022857666, + "rewards/margins": 5.563019752502441, + "rewards/rejected": -32.28872299194336, + "step": 2530 + }, + { + "epoch": 2.62, + "learning_rate": 7.014525993883792e-08, + "logits/chosen": -0.9310768246650696, + "logits/rejected": -0.9076264500617981, + "logps/chosen": -532.5977783203125, + "logps/rejected": -534.395263671875, + "loss": -6.54, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -25.927881240844727, + "rewards/margins": 5.213220119476318, + "rewards/rejected": -31.141101837158203, + "step": 2540 + }, + { + "epoch": 2.63, + "learning_rate": 6.823394495412843e-08, + "logits/chosen": -0.8654760122299194, + "logits/rejected": -0.9172045588493347, + "logps/chosen": -494.37005615234375, + "logps/rejected": -517.6791381835938, + "loss": -5.034, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -22.28108787536621, + "rewards/margins": 8.954570770263672, + "rewards/rejected": -31.235660552978516, + "step": 2550 + }, + { + "epoch": 2.64, + "learning_rate": 6.632262996941895e-08, + "logits/chosen": -0.8583803176879883, + "logits/rejected": -1.0703628063201904, + "logps/chosen": -545.6281127929688, + "logps/rejected": -576.9746704101562, + "loss": -6.1075, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -29.19540023803711, + "rewards/margins": 6.0790910720825195, + "rewards/rejected": -35.27449035644531, + "step": 2560 + }, + { + "epoch": 2.65, + "learning_rate": 6.441131498470948e-08, + "logits/chosen": -0.9567610621452332, + "logits/rejected": -0.7643688917160034, + "logps/chosen": -534.2616577148438, + "logps/rejected": -558.9248657226562, + "loss": -6.139, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -28.329565048217773, + "rewards/margins": 6.173336982727051, + "rewards/rejected": -34.50290298461914, + "step": 2570 + }, + { + "epoch": 2.66, + "learning_rate": 6.25e-08, + "logits/chosen": -0.973718523979187, + "logits/rejected": -0.9818305969238281, + "logps/chosen": -575.3428955078125, + "logps/rejected": -579.9434814453125, + "loss": -5.6596, + "rewards/accuracies": 0.5625, + "rewards/chosen": -26.38543701171875, + "rewards/margins": 7.383551120758057, + "rewards/rejected": -33.76898956298828, + "step": 2580 + }, + { + "epoch": 2.67, + "learning_rate": 6.058868501529052e-08, + "logits/chosen": -1.0347440242767334, + "logits/rejected": -1.049570083618164, + "logps/chosen": -574.5046997070312, + "logps/rejected": -509.9305114746094, + "loss": -5.3415, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -28.74808692932129, + "rewards/margins": -0.5448096394538879, + "rewards/rejected": -28.203277587890625, + "step": 2590 + }, + { + "epoch": 2.68, + "learning_rate": 5.8677370030581035e-08, + "logits/chosen": -1.0313562154769897, + "logits/rejected": -0.7958860397338867, + "logps/chosen": -527.613525390625, + "logps/rejected": -525.542724609375, + "loss": -6.3272, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -26.725732803344727, + "rewards/margins": 5.203427314758301, + "rewards/rejected": -31.92915916442871, + "step": 2600 + }, + { + "epoch": 2.69, + "learning_rate": 5.6766055045871554e-08, + "logits/chosen": -0.9836112260818481, + "logits/rejected": -0.9386361241340637, + "logps/chosen": -525.5948486328125, + "logps/rejected": -588.316162109375, + "loss": -7.7597, + "rewards/accuracies": 0.5625, + "rewards/chosen": -26.01352882385254, + "rewards/margins": 9.247634887695312, + "rewards/rejected": -35.26116180419922, + "step": 2610 + }, + { + "epoch": 2.7, + "learning_rate": 5.485474006116208e-08, + "logits/chosen": -1.163663387298584, + "logits/rejected": -0.8112475275993347, + "logps/chosen": -533.2066040039062, + "logps/rejected": -540.348388671875, + "loss": -5.5581, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -26.54342269897461, + "rewards/margins": 6.053246021270752, + "rewards/rejected": -32.59667205810547, + "step": 2620 + }, + { + "epoch": 2.71, + "learning_rate": 5.294342507645259e-08, + "logits/chosen": -0.9945865869522095, + "logits/rejected": -0.8910030126571655, + "logps/chosen": -593.0062866210938, + "logps/rejected": -556.135009765625, + "loss": -4.082, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -28.179473876953125, + "rewards/margins": 6.279976844787598, + "rewards/rejected": -34.459449768066406, + "step": 2630 + }, + { + "epoch": 2.72, + "learning_rate": 5.1032110091743117e-08, + "logits/chosen": -0.6196568012237549, + "logits/rejected": -0.8046578168869019, + "logps/chosen": -520.0942993164062, + "logps/rejected": -537.9401245117188, + "loss": -4.2575, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -30.394145965576172, + "rewards/margins": 2.2538414001464844, + "rewards/rejected": -32.64798355102539, + "step": 2640 + }, + { + "epoch": 2.73, + "learning_rate": 4.9120795107033635e-08, + "logits/chosen": -0.9934478998184204, + "logits/rejected": -0.9696424603462219, + "logps/chosen": -542.2866821289062, + "logps/rejected": -604.997314453125, + "loss": -7.0483, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -26.58272361755371, + "rewards/margins": 10.307984352111816, + "rewards/rejected": -36.890708923339844, + "step": 2650 + }, + { + "epoch": 2.75, + "learning_rate": 4.7209480122324154e-08, + "logits/chosen": -1.0938358306884766, + "logits/rejected": -0.9137821197509766, + "logps/chosen": -557.5704345703125, + "logps/rejected": -692.9163818359375, + "loss": -4.9773, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -29.691476821899414, + "rewards/margins": 14.579462051391602, + "rewards/rejected": -44.270938873291016, + "step": 2660 + }, + { + "epoch": 2.76, + "learning_rate": 4.529816513761467e-08, + "logits/chosen": -0.9120942950248718, + "logits/rejected": -0.9001785516738892, + "logps/chosen": -560.4894409179688, + "logps/rejected": -654.8059692382812, + "loss": -8.4192, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -25.794815063476562, + "rewards/margins": 12.174165725708008, + "rewards/rejected": -37.9689826965332, + "step": 2670 + }, + { + "epoch": 2.77, + "learning_rate": 4.33868501529052e-08, + "logits/chosen": -1.0795204639434814, + "logits/rejected": -0.798618733882904, + "logps/chosen": -502.0621643066406, + "logps/rejected": -547.533203125, + "loss": -6.4279, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -27.192279815673828, + "rewards/margins": 8.032793998718262, + "rewards/rejected": -35.225074768066406, + "step": 2680 + }, + { + "epoch": 2.78, + "learning_rate": 4.147553516819572e-08, + "logits/chosen": -0.8620451092720032, + "logits/rejected": -0.8486016988754272, + "logps/chosen": -493.64447021484375, + "logps/rejected": -515.5201416015625, + "loss": -7.8372, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -26.775470733642578, + "rewards/margins": 3.5873656272888184, + "rewards/rejected": -30.362834930419922, + "step": 2690 + }, + { + "epoch": 2.79, + "learning_rate": 3.9564220183486236e-08, + "logits/chosen": -0.9343770146369934, + "logits/rejected": -0.7297960519790649, + "logps/chosen": -554.4147338867188, + "logps/rejected": -525.2438354492188, + "loss": -9.0144, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -29.26637840270996, + "rewards/margins": 4.40227746963501, + "rewards/rejected": -33.668663024902344, + "step": 2700 + }, + { + "epoch": 2.8, + "learning_rate": 3.7652905198776755e-08, + "logits/chosen": -0.9890888333320618, + "logits/rejected": -0.8765937685966492, + "logps/chosen": -600.9437255859375, + "logps/rejected": -544.9365234375, + "loss": -5.8302, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -30.97121238708496, + "rewards/margins": 1.8391153812408447, + "rewards/rejected": -32.810325622558594, + "step": 2710 + }, + { + "epoch": 2.81, + "learning_rate": 3.574159021406728e-08, + "logits/chosen": -0.885094165802002, + "logits/rejected": -1.0295897722244263, + "logps/chosen": -576.9892578125, + "logps/rejected": -676.5888671875, + "loss": -3.6913, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -30.517303466796875, + "rewards/margins": 8.666280746459961, + "rewards/rejected": -39.1835823059082, + "step": 2720 + }, + { + "epoch": 2.82, + "learning_rate": 3.383027522935779e-08, + "logits/chosen": -0.9966446161270142, + "logits/rejected": -0.8098469972610474, + "logps/chosen": -550.4933471679688, + "logps/rejected": -582.1474609375, + "loss": -5.0871, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -28.010635375976562, + "rewards/margins": 6.666192054748535, + "rewards/rejected": -34.67682647705078, + "step": 2730 + }, + { + "epoch": 2.83, + "learning_rate": 3.191896024464832e-08, + "logits/chosen": -0.9043565988540649, + "logits/rejected": -0.7680369019508362, + "logps/chosen": -609.8352661132812, + "logps/rejected": -591.0311279296875, + "loss": -7.8269, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -32.64293670654297, + "rewards/margins": 3.9754230976104736, + "rewards/rejected": -36.618350982666016, + "step": 2740 + }, + { + "epoch": 2.84, + "learning_rate": 3.0007645259938836e-08, + "logits/chosen": -0.9044156074523926, + "logits/rejected": -0.7647527456283569, + "logps/chosen": -504.7789611816406, + "logps/rejected": -613.888671875, + "loss": -8.5506, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -28.10544204711914, + "rewards/margins": 12.60815715789795, + "rewards/rejected": -40.713600158691406, + "step": 2750 + }, + { + "epoch": 2.85, + "learning_rate": 2.809633027522936e-08, + "logits/chosen": -0.7312633395195007, + "logits/rejected": -0.8145195245742798, + "logps/chosen": -568.981689453125, + "logps/rejected": -555.5551147460938, + "loss": -8.1692, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -32.20295715332031, + "rewards/margins": 2.2871875762939453, + "rewards/rejected": -34.490150451660156, + "step": 2760 + }, + { + "epoch": 2.86, + "learning_rate": 2.6185015290519877e-08, + "logits/chosen": -0.692935049533844, + "logits/rejected": -0.9012505412101746, + "logps/chosen": -530.6353759765625, + "logps/rejected": -661.496826171875, + "loss": -7.2198, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -26.056262969970703, + "rewards/margins": 15.568461418151855, + "rewards/rejected": -41.62472152709961, + "step": 2770 + }, + { + "epoch": 2.87, + "learning_rate": 2.4273700305810396e-08, + "logits/chosen": -1.0233699083328247, + "logits/rejected": -0.7373823523521423, + "logps/chosen": -615.1844482421875, + "logps/rejected": -660.0513305664062, + "loss": -4.8984, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -30.922901153564453, + "rewards/margins": 10.078287124633789, + "rewards/rejected": -41.001190185546875, + "step": 2780 + }, + { + "epoch": 2.88, + "learning_rate": 2.2362385321100918e-08, + "logits/chosen": -1.081015944480896, + "logits/rejected": -0.7994831800460815, + "logps/chosen": -571.29150390625, + "logps/rejected": -558.0635375976562, + "loss": -7.6276, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -28.834667205810547, + "rewards/margins": 5.563540458679199, + "rewards/rejected": -34.39820861816406, + "step": 2790 + }, + { + "epoch": 2.89, + "learning_rate": 2.0451070336391437e-08, + "logits/chosen": -0.8437451124191284, + "logits/rejected": -0.8097723722457886, + "logps/chosen": -579.05615234375, + "logps/rejected": -576.0593872070312, + "loss": -7.1202, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -30.914493560791016, + "rewards/margins": 7.180275917053223, + "rewards/rejected": -38.094764709472656, + "step": 2800 + }, + { + "epoch": 2.9, + "learning_rate": 1.8539755351681956e-08, + "logits/chosen": -0.816103458404541, + "logits/rejected": -0.7335480451583862, + "logps/chosen": -546.3377685546875, + "logps/rejected": -556.5323486328125, + "loss": -9.8674, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -28.328052520751953, + "rewards/margins": 3.8435778617858887, + "rewards/rejected": -32.171630859375, + "step": 2810 + }, + { + "epoch": 2.91, + "learning_rate": 1.6628440366972478e-08, + "logits/chosen": -0.994927704334259, + "logits/rejected": -0.8127029538154602, + "logps/chosen": -540.6372680664062, + "logps/rejected": -544.7251586914062, + "loss": -4.5174, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -25.344158172607422, + "rewards/margins": 6.365922451019287, + "rewards/rejected": -31.710086822509766, + "step": 2820 + }, + { + "epoch": 2.92, + "learning_rate": 1.4717125382262997e-08, + "logits/chosen": -0.900621235370636, + "logits/rejected": -0.9584394693374634, + "logps/chosen": -547.0264892578125, + "logps/rejected": -650.3952026367188, + "loss": -7.3594, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -27.896032333374023, + "rewards/margins": 12.844169616699219, + "rewards/rejected": -40.740196228027344, + "step": 2830 + }, + { + "epoch": 2.93, + "learning_rate": 1.2805810397553517e-08, + "logits/chosen": -0.9609657526016235, + "logits/rejected": -0.9351837038993835, + "logps/chosen": -620.4771118164062, + "logps/rejected": -574.4600830078125, + "loss": -4.3283, + "rewards/accuracies": 0.5, + "rewards/chosen": -34.61457061767578, + "rewards/margins": 0.11583442986011505, + "rewards/rejected": -34.730403900146484, + "step": 2840 + }, + { + "epoch": 2.94, + "learning_rate": 1.0894495412844038e-08, + "logits/chosen": -0.9170868992805481, + "logits/rejected": -0.9585624933242798, + "logps/chosen": -573.9271240234375, + "logps/rejected": -615.289794921875, + "loss": -9.5317, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -31.822189331054688, + "rewards/margins": 6.956101894378662, + "rewards/rejected": -38.77829360961914, + "step": 2850 + }, + { + "epoch": 2.95, + "learning_rate": 8.983180428134555e-09, + "logits/chosen": -0.9001976847648621, + "logits/rejected": -0.7435283660888672, + "logps/chosen": -546.4014892578125, + "logps/rejected": -503.437255859375, + "loss": -7.6183, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -28.65180015563965, + "rewards/margins": 3.4450135231018066, + "rewards/rejected": -32.0968132019043, + "step": 2860 + }, + { + "epoch": 2.96, + "learning_rate": 7.071865443425076e-09, + "logits/chosen": -0.9058972597122192, + "logits/rejected": -0.6541125774383545, + "logps/chosen": -654.9110107421875, + "logps/rejected": -610.533447265625, + "loss": -7.5332, + "rewards/accuracies": 0.5625, + "rewards/chosen": -37.84512710571289, + "rewards/margins": 0.6990224123001099, + "rewards/rejected": -38.544151306152344, + "step": 2870 + }, + { + "epoch": 2.97, + "learning_rate": 5.1605504587155965e-09, + "logits/chosen": -1.0033214092254639, + "logits/rejected": -1.143033504486084, + "logps/chosen": -586.54833984375, + "logps/rejected": -587.2291259765625, + "loss": -8.2379, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -30.552661895751953, + "rewards/margins": 5.087256908416748, + "rewards/rejected": -35.63991928100586, + "step": 2880 + }, + { + "epoch": 2.98, + "learning_rate": 3.249235474006116e-09, + "logits/chosen": -0.9559615850448608, + "logits/rejected": -0.6920489072799683, + "logps/chosen": -566.3262329101562, + "logps/rejected": -614.7679443359375, + "loss": -6.6734, + "rewards/accuracies": 0.625, + "rewards/chosen": -29.60939598083496, + "rewards/margins": 8.876012802124023, + "rewards/rejected": -38.48540496826172, + "step": 2890 + }, + { + "epoch": 2.99, + "learning_rate": 1.3379204892966359e-09, + "logits/chosen": -0.7593010663986206, + "logits/rejected": -0.7966611981391907, + "logps/chosen": -579.7311401367188, + "logps/rejected": -572.9598999023438, + "loss": -5.7169, + "rewards/accuracies": 0.625, + "rewards/chosen": -32.345096588134766, + "rewards/margins": 4.745226860046387, + "rewards/rejected": -37.09032440185547, + "step": 2900 + }, + { + "epoch": 3.0, + "eval_logits/chosen": -1.2629982233047485, + "eval_logits/rejected": -1.075066089630127, + "eval_logps/chosen": -588.9970092773438, + "eval_logps/rejected": -633.47216796875, + "eval_loss": -7.541553497314453, + "eval_rewards/accuracies": 0.6150793433189392, + "eval_rewards/chosen": -29.919435501098633, + "eval_rewards/margins": 9.934508323669434, + "eval_rewards/rejected": -39.853946685791016, + "eval_runtime": 238.4669, + "eval_samples_per_second": 8.387, + "eval_steps_per_second": 0.264, + "step": 2907 + }, + { + "epoch": 3.0, + "step": 2907, + "total_flos": 0.0, + "train_loss": -1.932115889119454, + "train_runtime": 45081.596, + "train_samples_per_second": 4.124, + "train_steps_per_second": 0.064 + } + ], + "logging_steps": 10, + "max_steps": 2907, + "num_train_epochs": 3, + "save_steps": 500, + "total_flos": 0.0, + "trial_name": null, + "trial_params": null +}