{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 10000, "global_step": 1250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0008, "grad_norm": 21.581922928906643, "learning_rate": 2.4e-08, "logits/chosen": 58.004119873046875, "logits/rejected": 46.01157760620117, "logps/chosen": -68.83617401123047, "logps/rejected": -57.57984924316406, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.008, "grad_norm": 20.4688268613938, "learning_rate": 2.4000000000000003e-07, "logits/chosen": 59.60597229003906, "logits/rejected": 55.70063781738281, "logps/chosen": -57.55007553100586, "logps/rejected": -64.12437438964844, "loss": 0.6943, "rewards/accuracies": 0.5277777910232544, "rewards/chosen": 0.0031741515267640352, "rewards/margins": 0.004799725487828255, "rewards/rejected": -0.0016255751252174377, "step": 10 }, { "epoch": 0.016, "grad_norm": 24.0481332048613, "learning_rate": 4.800000000000001e-07, "logits/chosen": 58.38694381713867, "logits/rejected": 57.88800811767578, "logps/chosen": -61.777549743652344, "logps/rejected": -76.17720031738281, "loss": 0.6889, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.016177240759134293, "rewards/margins": -0.016559984534978867, "rewards/rejected": 0.0003827471227850765, "step": 20 }, { "epoch": 0.024, "grad_norm": 18.48062338133383, "learning_rate": 7.2e-07, "logits/chosen": 57.91182327270508, "logits/rejected": 55.304176330566406, "logps/chosen": -64.21894073486328, "logps/rejected": -70.75691986083984, "loss": 0.6969, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.016543073579669, "rewards/margins": -0.02169782668352127, "rewards/rejected": 0.005154752172529697, "step": 30 }, { "epoch": 0.032, "grad_norm": 23.114053471975932, "learning_rate": 9.600000000000001e-07, "logits/chosen": 56.66730880737305, "logits/rejected": 58.83241653442383, "logps/chosen": -55.49696731567383, "logps/rejected": -65.36714172363281, "loss": 0.6852, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.06515228003263474, "rewards/margins": -0.007973963394761086, "rewards/rejected": -0.0571783110499382, "step": 40 }, { "epoch": 0.04, "grad_norm": 17.75345076566636, "learning_rate": 1.2000000000000002e-06, "logits/chosen": 56.38788986206055, "logits/rejected": 56.39375686645508, "logps/chosen": -56.10227584838867, "logps/rejected": -68.6869125366211, "loss": 0.658, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.12126936763525009, "rewards/margins": 0.07603181153535843, "rewards/rejected": -0.19730117917060852, "step": 50 }, { "epoch": 0.048, "grad_norm": 19.63702779072802, "learning_rate": 1.44e-06, "logits/chosen": 55.504783630371094, "logits/rejected": 58.70302200317383, "logps/chosen": -57.737709045410156, "logps/rejected": -81.11849975585938, "loss": 0.604, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.26448559761047363, "rewards/margins": 0.15866820514202118, "rewards/rejected": -0.4231537878513336, "step": 60 }, { "epoch": 0.056, "grad_norm": 18.137331310839922, "learning_rate": 1.6800000000000002e-06, "logits/chosen": 57.54669952392578, "logits/rejected": 54.60467529296875, "logps/chosen": -67.18961334228516, "logps/rejected": -77.70694732666016, "loss": 0.5428, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.612514317035675, "rewards/margins": 0.23503276705741882, "rewards/rejected": -0.847547173500061, "step": 70 }, { "epoch": 0.064, "grad_norm": 18.231347330767957, "learning_rate": 1.9200000000000003e-06, "logits/chosen": 54.9367561340332, "logits/rejected": 54.68426513671875, "logps/chosen": -62.68987274169922, "logps/rejected": -77.52642059326172, "loss": 0.5303, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0836379528045654, "rewards/margins": 0.46581095457077026, "rewards/rejected": -1.5494489669799805, "step": 80 }, { "epoch": 0.072, "grad_norm": 17.73877224591989, "learning_rate": 2.16e-06, "logits/chosen": 53.1925163269043, "logits/rejected": 54.84343338012695, "logps/chosen": -75.93465423583984, "logps/rejected": -97.27320861816406, "loss": 0.4118, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.2428396940231323, "rewards/margins": 1.071587324142456, "rewards/rejected": -2.314426898956299, "step": 90 }, { "epoch": 0.08, "grad_norm": 19.99662339162552, "learning_rate": 2.4000000000000003e-06, "logits/chosen": 53.5159797668457, "logits/rejected": 50.565223693847656, "logps/chosen": -77.43242645263672, "logps/rejected": -100.10637664794922, "loss": 0.4154, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.2108376026153564, "rewards/margins": 1.270476222038269, "rewards/rejected": -3.481314182281494, "step": 100 }, { "epoch": 0.088, "grad_norm": 21.75426718419849, "learning_rate": 2.64e-06, "logits/chosen": 52.46582794189453, "logits/rejected": 51.50734329223633, "logps/chosen": -81.02312469482422, "logps/rejected": -109.15274810791016, "loss": 0.3175, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.748387336730957, "rewards/margins": 1.5973384380340576, "rewards/rejected": -4.3457255363464355, "step": 110 }, { "epoch": 0.096, "grad_norm": 16.929821828111294, "learning_rate": 2.88e-06, "logits/chosen": 49.478553771972656, "logits/rejected": 47.64826583862305, "logps/chosen": -77.54215240478516, "logps/rejected": -115.6530990600586, "loss": 0.3657, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.67846941947937, "rewards/margins": 2.1004786491394043, "rewards/rejected": -4.778947830200195, "step": 120 }, { "epoch": 0.104, "grad_norm": 18.372474710811762, "learning_rate": 2.9998537860139563e-06, "logits/chosen": 48.161476135253906, "logits/rejected": 49.45183563232422, "logps/chosen": -99.06620025634766, "logps/rejected": -118.9909439086914, "loss": 0.3465, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.597534656524658, "rewards/margins": 1.4235634803771973, "rewards/rejected": -5.021098613739014, "step": 130 }, { "epoch": 0.112, "grad_norm": 11.846677373240393, "learning_rate": 2.9986842451482876e-06, "logits/chosen": 47.42317199707031, "logits/rejected": 47.59636306762695, "logps/chosen": -87.74755859375, "logps/rejected": -126.07084655761719, "loss": 0.285, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.148099422454834, "rewards/margins": 2.943894863128662, "rewards/rejected": -6.091994285583496, "step": 140 }, { "epoch": 0.12, "grad_norm": 36.496040238235864, "learning_rate": 2.9963460753897363e-06, "logits/chosen": 45.3277473449707, "logits/rejected": 49.25115203857422, "logps/chosen": -95.30039978027344, "logps/rejected": -133.84915161132812, "loss": 0.3391, "rewards/accuracies": 0.75, "rewards/chosen": -3.688962459564209, "rewards/margins": 2.5285627841949463, "rewards/rejected": -6.217525482177734, "step": 150 }, { "epoch": 0.128, "grad_norm": 19.59471062026771, "learning_rate": 2.9928410999727467e-06, "logits/chosen": 48.085975646972656, "logits/rejected": 47.303443908691406, "logps/chosen": -94.16909790039062, "logps/rejected": -124.5986557006836, "loss": 0.2205, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.4897732734680176, "rewards/margins": 2.562234401702881, "rewards/rejected": -6.05200719833374, "step": 160 }, { "epoch": 0.136, "grad_norm": 12.523322391428595, "learning_rate": 2.988172051971717e-06, "logits/chosen": 45.95378112792969, "logits/rejected": 47.941734313964844, "logps/chosen": -95.47936248779297, "logps/rejected": -132.12608337402344, "loss": 0.2479, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.5358726978302, "rewards/margins": 2.7687032222747803, "rewards/rejected": -6.3045759201049805, "step": 170 }, { "epoch": 0.144, "grad_norm": 22.507905467341416, "learning_rate": 2.9823425721698293e-06, "logits/chosen": 47.45555877685547, "logits/rejected": 45.6512451171875, "logps/chosen": -88.78406524658203, "logps/rejected": -130.03347778320312, "loss": 0.2046, "rewards/accuracies": 0.875, "rewards/chosen": -3.3888843059539795, "rewards/margins": 3.680863857269287, "rewards/rejected": -7.0697479248046875, "step": 180 }, { "epoch": 0.152, "grad_norm": 20.747003935771083, "learning_rate": 2.975357206220079e-06, "logits/chosen": 43.333560943603516, "logits/rejected": 47.98616409301758, "logps/chosen": -103.88460540771484, "logps/rejected": -156.9280242919922, "loss": 0.1485, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.8503522872924805, "rewards/margins": 3.576326370239258, "rewards/rejected": -8.426677703857422, "step": 190 }, { "epoch": 0.16, "grad_norm": 17.041744365225764, "learning_rate": 2.9672214011007086e-06, "logits/chosen": 39.760108947753906, "logits/rejected": 39.248085021972656, "logps/chosen": -108.03792572021484, "logps/rejected": -156.3753204345703, "loss": 0.1781, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.024051189422607, "rewards/margins": 4.045407295227051, "rewards/rejected": -9.0694580078125, "step": 200 }, { "epoch": 0.168, "grad_norm": 36.70520837669934, "learning_rate": 2.95794150086782e-06, "logits/chosen": 37.43067169189453, "logits/rejected": 37.81218719482422, "logps/chosen": -122.63470458984375, "logps/rejected": -180.85458374023438, "loss": 0.1757, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -6.044098854064941, "rewards/margins": 4.934633255004883, "rewards/rejected": -10.978731155395508, "step": 210 }, { "epoch": 0.176, "grad_norm": 9.451569716987374, "learning_rate": 2.9475247417084673e-06, "logits/chosen": 38.666202545166016, "logits/rejected": 39.08936309814453, "logps/chosen": -123.2136001586914, "logps/rejected": -177.7836456298828, "loss": 0.1577, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.986439228057861, "rewards/margins": 4.965152263641357, "rewards/rejected": -10.951591491699219, "step": 220 }, { "epoch": 0.184, "grad_norm": 4.732586023204334, "learning_rate": 2.9359792462981008e-06, "logits/chosen": 37.19866180419922, "logits/rejected": 39.60905075073242, "logps/chosen": -121.33009338378906, "logps/rejected": -176.44203186035156, "loss": 0.271, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -6.191902160644531, "rewards/margins": 4.402517318725586, "rewards/rejected": -10.594419479370117, "step": 230 }, { "epoch": 0.192, "grad_norm": 46.75079216151069, "learning_rate": 2.9233140174667447e-06, "logits/chosen": 35.31947326660156, "logits/rejected": 39.77277374267578, "logps/chosen": -115.7451400756836, "logps/rejected": -193.0277557373047, "loss": 0.1112, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.50314998626709, "rewards/margins": 6.3514556884765625, "rewards/rejected": -11.854605674743652, "step": 240 }, { "epoch": 0.2, "grad_norm": 56.466288047816285, "learning_rate": 2.9095389311788626e-06, "logits/chosen": 33.861488342285156, "logits/rejected": 37.11942672729492, "logps/chosen": -114.38687896728516, "logps/rejected": -182.79983520507812, "loss": 0.1955, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.901637077331543, "rewards/margins": 5.603636741638184, "rewards/rejected": -11.505274772644043, "step": 250 }, { "epoch": 0.208, "grad_norm": 11.256333883583846, "learning_rate": 2.894664728832377e-06, "logits/chosen": 38.68649673461914, "logits/rejected": 38.391868591308594, "logps/chosen": -125.4790267944336, "logps/rejected": -184.40927124023438, "loss": 0.1671, "rewards/accuracies": 0.875, "rewards/chosen": -5.836948871612549, "rewards/margins": 5.325386047363281, "rewards/rejected": -11.162334442138672, "step": 260 }, { "epoch": 0.216, "grad_norm": 26.666948743704236, "learning_rate": 2.878703008882852e-06, "logits/chosen": 36.955894470214844, "logits/rejected": 39.17523193359375, "logps/chosen": -107.0689468383789, "logps/rejected": -169.14031982421875, "loss": 0.21, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.4622063636779785, "rewards/margins": 4.83941125869751, "rewards/rejected": -10.301618576049805, "step": 270 }, { "epoch": 0.224, "grad_norm": 14.65567488371947, "learning_rate": 2.861666217799363e-06, "logits/chosen": 38.019309997558594, "logits/rejected": 41.11383056640625, "logps/chosen": -110.4853515625, "logps/rejected": -187.29208374023438, "loss": 0.2127, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.756478309631348, "rewards/margins": 6.638999938964844, "rewards/rejected": -11.395478248596191, "step": 280 }, { "epoch": 0.232, "grad_norm": 4.610753920962251, "learning_rate": 2.8435676403591196e-06, "logits/chosen": 38.20881271362305, "logits/rejected": 35.70587921142578, "logps/chosen": -113.76185607910156, "logps/rejected": -169.40895080566406, "loss": 0.1187, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.58528995513916, "rewards/margins": 5.651334285736084, "rewards/rejected": -11.236624717712402, "step": 290 }, { "epoch": 0.24, "grad_norm": 16.792649400961125, "learning_rate": 2.8244213892883906e-06, "logits/chosen": 37.90871810913086, "logits/rejected": 41.139713287353516, "logps/chosen": -115.63899993896484, "logps/rejected": -188.9228973388672, "loss": 0.0712, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.2915940284729, "rewards/margins": 5.684304237365723, "rewards/rejected": -10.975896835327148, "step": 300 }, { "epoch": 0.248, "grad_norm": 3.4109197371862177, "learning_rate": 2.8042423942578284e-06, "logits/chosen": 32.42366027832031, "logits/rejected": 37.181610107421875, "logps/chosen": -111.64329528808594, "logps/rejected": -204.35745239257812, "loss": 0.1115, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.966713905334473, "rewards/margins": 7.201415061950684, "rewards/rejected": -13.168128967285156, "step": 310 }, { "epoch": 0.256, "grad_norm": 34.61265439136052, "learning_rate": 2.78304639024076e-06, "logits/chosen": 33.11461639404297, "logits/rejected": 34.98590087890625, "logps/chosen": -129.79061889648438, "logps/rejected": -214.4392547607422, "loss": 0.1245, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -7.288665771484375, "rewards/margins": 6.769256591796875, "rewards/rejected": -14.05792236328125, "step": 320 }, { "epoch": 0.264, "grad_norm": 40.604661066881384, "learning_rate": 2.7608499052435266e-06, "logits/chosen": 30.30582046508789, "logits/rejected": 34.23260498046875, "logps/chosen": -120.01988220214844, "logps/rejected": -213.86123657226562, "loss": 0.1025, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.6563215255737305, "rewards/margins": 7.752140998840332, "rewards/rejected": -14.408462524414062, "step": 330 }, { "epoch": 0.272, "grad_norm": 14.452169107352786, "learning_rate": 2.7376702474174426e-06, "logits/chosen": 33.35188293457031, "logits/rejected": 31.421377182006836, "logps/chosen": -136.9534912109375, "logps/rejected": -193.56581115722656, "loss": 0.121, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -7.1245574951171875, "rewards/margins": 5.618683338165283, "rewards/rejected": -12.743240356445312, "step": 340 }, { "epoch": 0.28, "grad_norm": 0.6616288825225272, "learning_rate": 2.713525491562421e-06, "logits/chosen": 35.41117858886719, "logits/rejected": 35.81652069091797, "logps/chosen": -142.27957153320312, "logps/rejected": -206.56591796875, "loss": 0.1007, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -7.426796913146973, "rewards/margins": 6.085718154907227, "rewards/rejected": -13.5125150680542, "step": 350 }, { "epoch": 0.288, "grad_norm": 17.254991119966604, "learning_rate": 2.688434465032786e-06, "logits/chosen": 27.68739891052246, "logits/rejected": 32.76261520385742, "logps/chosen": -134.08424377441406, "logps/rejected": -243.40560913085938, "loss": 0.147, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -7.258644104003906, "rewards/margins": 9.23260498046875, "rewards/rejected": -16.491247177124023, "step": 360 }, { "epoch": 0.296, "grad_norm": 4.261139222241878, "learning_rate": 2.6624167330562694e-06, "logits/chosen": 29.108882904052734, "logits/rejected": 31.266714096069336, "logps/chosen": -134.42970275878906, "logps/rejected": -220.4041748046875, "loss": 0.0617, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -7.372394561767578, "rewards/margins": 8.266263008117676, "rewards/rejected": -15.63865852355957, "step": 370 }, { "epoch": 0.304, "grad_norm": 1.262878357266055, "learning_rate": 2.6354925834776346e-06, "logits/chosen": 27.302906036376953, "logits/rejected": 31.609981536865234, "logps/chosen": -149.15396118164062, "logps/rejected": -241.90420532226562, "loss": 0.1221, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -8.688291549682617, "rewards/margins": 8.208234786987305, "rewards/rejected": -16.896526336669922, "step": 380 }, { "epoch": 0.312, "grad_norm": 43.62912594375019, "learning_rate": 2.607683010938826e-06, "logits/chosen": 27.132095336914062, "logits/rejected": 28.30078125, "logps/chosen": -155.2683563232422, "logps/rejected": -240.5152587890625, "loss": 0.1539, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -9.518204689025879, "rewards/margins": 7.853548526763916, "rewards/rejected": -17.371753692626953, "step": 390 }, { "epoch": 0.32, "grad_norm": 31.97742985431739, "learning_rate": 2.5790097005079765e-06, "logits/chosen": 27.687297821044922, "logits/rejected": 30.38442611694336, "logps/chosen": -152.99664306640625, "logps/rejected": -259.11627197265625, "loss": 0.0747, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -9.019024848937988, "rewards/margins": 9.441879272460938, "rewards/rejected": -18.46090316772461, "step": 400 }, { "epoch": 0.328, "grad_norm": 0.11384820974453211, "learning_rate": 2.549495010770048e-06, "logits/chosen": 27.36895751953125, "logits/rejected": 28.704010009765625, "logps/chosen": -144.52304077148438, "logps/rejected": -247.2997589111328, "loss": 0.1256, "rewards/accuracies": 1.0, "rewards/chosen": -8.224719047546387, "rewards/margins": 9.54539680480957, "rewards/rejected": -17.77011489868164, "step": 410 }, { "epoch": 0.336, "grad_norm": 7.534465189988814, "learning_rate": 2.519161956392275e-06, "logits/chosen": 30.696910858154297, "logits/rejected": 30.624948501586914, "logps/chosen": -144.02389526367188, "logps/rejected": -232.913818359375, "loss": 0.1559, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -8.057051658630371, "rewards/margins": 8.463793754577637, "rewards/rejected": -16.520845413208008, "step": 420 }, { "epoch": 0.344, "grad_norm": 23.270592204413173, "learning_rate": 2.4880341901780208e-06, "logits/chosen": 31.727802276611328, "logits/rejected": 34.599830627441406, "logps/chosen": -146.5982208251953, "logps/rejected": -236.9903106689453, "loss": 0.101, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -8.658833503723145, "rewards/margins": 7.560339450836182, "rewards/rejected": -16.219173431396484, "step": 430 }, { "epoch": 0.352, "grad_norm": 18.404989253529585, "learning_rate": 2.456135984623035e-06, "logits/chosen": 31.22390365600586, "logits/rejected": 34.494503021240234, "logps/chosen": -131.6576385498047, "logps/rejected": -224.7453155517578, "loss": 0.0794, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -8.250141143798828, "rewards/margins": 7.884213447570801, "rewards/rejected": -16.134353637695312, "step": 440 }, { "epoch": 0.36, "grad_norm": 5.835930327262679, "learning_rate": 2.4234922129884873e-06, "logits/chosen": 31.049453735351562, "logits/rejected": 32.644561767578125, "logps/chosen": -136.99464416503906, "logps/rejected": -229.085693359375, "loss": 0.0925, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -7.603506565093994, "rewards/margins": 8.22996997833252, "rewards/rejected": -15.833475112915039, "step": 450 }, { "epoch": 0.368, "grad_norm": 83.38465938371176, "learning_rate": 2.3901283299055523e-06, "logits/chosen": 29.210134506225586, "logits/rejected": 33.91218185424805, "logps/chosen": -133.32357788085938, "logps/rejected": -240.31906127929688, "loss": 0.0892, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -7.826475620269775, "rewards/margins": 8.995865821838379, "rewards/rejected": -16.82234001159668, "step": 460 }, { "epoch": 0.376, "grad_norm": 2.502030553529166, "learning_rate": 2.356070351526648e-06, "logits/chosen": 30.35089111328125, "logits/rejected": 29.741165161132812, "logps/chosen": -145.46237182617188, "logps/rejected": -234.22525024414062, "loss": 0.0886, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -8.746350288391113, "rewards/margins": 8.287931442260742, "rewards/rejected": -17.034282684326172, "step": 470 }, { "epoch": 0.384, "grad_norm": 14.091730335356385, "learning_rate": 2.3213448352388254e-06, "logits/chosen": 27.40460777282715, "logits/rejected": 31.037439346313477, "logps/chosen": -142.2796173095703, "logps/rejected": -239.9733428955078, "loss": 0.064, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -9.121622085571289, "rewards/margins": 8.313209533691406, "rewards/rejected": -17.434831619262695, "step": 480 }, { "epoch": 0.392, "grad_norm": 11.107798849175124, "learning_rate": 2.285978858955119e-06, "logits/chosen": 31.397411346435547, "logits/rejected": 31.57785987854004, "logps/chosen": -137.85682678222656, "logps/rejected": -237.64340209960938, "loss": 0.0405, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -7.711313724517822, "rewards/margins": 8.816143035888672, "rewards/rejected": -16.527456283569336, "step": 490 }, { "epoch": 0.4, "grad_norm": 0.7122560435001203, "learning_rate": 2.25e-06, "logits/chosen": 29.285724639892578, "logits/rejected": 33.19956970214844, "logps/chosen": -150.15484619140625, "logps/rejected": -258.1266784667969, "loss": 0.0857, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -8.766794204711914, "rewards/margins": 9.55189323425293, "rewards/rejected": -18.31868553161621, "step": 500 }, { "epoch": 0.408, "grad_norm": 1.1236023709740472, "learning_rate": 2.213436313605413e-06, "logits/chosen": 27.258758544921875, "logits/rejected": 32.38069152832031, "logps/chosen": -140.72256469726562, "logps/rejected": -257.39410400390625, "loss": 0.0529, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -8.840039253234863, "rewards/margins": 10.10526180267334, "rewards/rejected": -18.945301055908203, "step": 510 }, { "epoch": 0.416, "grad_norm": 42.2808961940555, "learning_rate": 2.1763163110341462e-06, "logits/chosen": 26.64520263671875, "logits/rejected": 29.211254119873047, "logps/chosen": -143.02182006835938, "logps/rejected": -262.0480041503906, "loss": 0.0919, "rewards/accuracies": 1.0, "rewards/chosen": -8.261417388916016, "rewards/margins": 10.782800674438477, "rewards/rejected": -19.044218063354492, "step": 520 }, { "epoch": 0.424, "grad_norm": 5.223417864951368, "learning_rate": 2.138668937347609e-06, "logits/chosen": 25.72537612915039, "logits/rejected": 28.29427146911621, "logps/chosen": -153.34121704101562, "logps/rejected": -274.3043212890625, "loss": 0.1288, "rewards/accuracies": 1.0, "rewards/chosen": -9.276151657104492, "rewards/margins": 11.748897552490234, "rewards/rejected": -21.02505111694336, "step": 530 }, { "epoch": 0.432, "grad_norm": 13.59100794053642, "learning_rate": 2.100523548835343e-06, "logits/chosen": 26.498031616210938, "logits/rejected": 29.164413452148438, "logps/chosen": -168.49325561523438, "logps/rejected": -274.753173828125, "loss": 0.0632, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -11.204986572265625, "rewards/margins": 9.171854972839355, "rewards/rejected": -20.376840591430664, "step": 540 }, { "epoch": 0.44, "grad_norm": 6.137158026956021, "learning_rate": 2.061909890123868e-06, "logits/chosen": 23.507408142089844, "logits/rejected": 27.2824764251709, "logps/chosen": -153.0961456298828, "logps/rejected": -278.5215148925781, "loss": 0.0752, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -9.87879753112793, "rewards/margins": 10.660184860229492, "rewards/rejected": -20.538986206054688, "step": 550 }, { "epoch": 0.448, "grad_norm": 3.456576564579184, "learning_rate": 2.022858070982723e-06, "logits/chosen": 24.48337745666504, "logits/rejected": 27.21124839782715, "logps/chosen": -173.62545776367188, "logps/rejected": -283.33575439453125, "loss": 0.0325, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -10.958311080932617, "rewards/margins": 10.133281707763672, "rewards/rejected": -21.09159278869629, "step": 560 }, { "epoch": 0.456, "grad_norm": 40.06033927173472, "learning_rate": 1.983398542845767e-06, "logits/chosen": 23.440534591674805, "logits/rejected": 24.654071807861328, "logps/chosen": -153.51531982421875, "logps/rejected": -272.8458251953125, "loss": 0.0788, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -9.767447471618652, "rewards/margins": 11.361773490905762, "rewards/rejected": -21.129222869873047, "step": 570 }, { "epoch": 0.464, "grad_norm": 0.1813350915194513, "learning_rate": 1.9435620750660703e-06, "logits/chosen": 20.848228454589844, "logits/rejected": 24.540836334228516, "logps/chosen": -139.9002227783203, "logps/rejected": -272.963623046875, "loss": 0.0687, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -8.545764923095703, "rewards/margins": 12.114400863647461, "rewards/rejected": -20.660165786743164, "step": 580 }, { "epoch": 0.472, "grad_norm": 45.254112699453046, "learning_rate": 1.9033797309228985e-06, "logits/chosen": 19.232898712158203, "logits/rejected": 23.08200454711914, "logps/chosen": -163.60186767578125, "logps/rejected": -292.57952880859375, "loss": 0.3063, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -10.544285774230957, "rewards/margins": 11.296300888061523, "rewards/rejected": -21.840587615966797, "step": 590 }, { "epoch": 0.48, "grad_norm": 2.147234675582369, "learning_rate": 1.8628828433995015e-06, "logits/chosen": 24.288915634155273, "logits/rejected": 26.083560943603516, "logps/chosen": -165.03009033203125, "logps/rejected": -274.15594482421875, "loss": 0.0344, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -9.776161193847656, "rewards/margins": 10.559597969055176, "rewards/rejected": -20.335758209228516, "step": 600 }, { "epoch": 0.488, "grad_norm": 32.88041793576487, "learning_rate": 1.822102990750595e-06, "logits/chosen": 21.8018798828125, "logits/rejected": 26.132715225219727, "logps/chosen": -159.8308563232422, "logps/rejected": -299.31707763671875, "loss": 0.059, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -9.872952461242676, "rewards/margins": 12.362961769104004, "rewards/rejected": -22.235912322998047, "step": 610 }, { "epoch": 0.496, "grad_norm": 27.101516320021904, "learning_rate": 1.7810719718785873e-06, "logits/chosen": 22.754619598388672, "logits/rejected": 26.49776268005371, "logps/chosen": -159.6096954345703, "logps/rejected": -283.2123718261719, "loss": 0.0772, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -10.500214576721191, "rewards/margins": 10.847715377807617, "rewards/rejected": -21.34792709350586, "step": 620 }, { "epoch": 0.504, "grad_norm": 12.207017549903293, "learning_rate": 1.7398217815377524e-06, "logits/chosen": 23.825117111206055, "logits/rejected": 24.585908889770508, "logps/chosen": -167.65447998046875, "logps/rejected": -288.53851318359375, "loss": 0.1162, "rewards/accuracies": 1.0, "rewards/chosen": -10.316202163696289, "rewards/margins": 12.006393432617188, "rewards/rejected": -22.322595596313477, "step": 630 }, { "epoch": 0.512, "grad_norm": 5.163064270145757, "learning_rate": 1.698384585385684e-06, "logits/chosen": 23.767126083374023, "logits/rejected": 23.51525115966797, "logps/chosen": -180.27285766601562, "logps/rejected": -302.7817077636719, "loss": 0.0808, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -10.770536422729492, "rewards/margins": 12.786243438720703, "rewards/rejected": -23.556777954101562, "step": 640 }, { "epoch": 0.52, "grad_norm": 19.70083585367663, "learning_rate": 1.6567926949014804e-06, "logits/chosen": 21.79136085510254, "logits/rejected": 26.325298309326172, "logps/chosen": -160.727294921875, "logps/rejected": -299.2230224609375, "loss": 0.0831, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -10.32923698425293, "rewards/margins": 12.3799467086792, "rewards/rejected": -22.709184646606445, "step": 650 }, { "epoch": 0.528, "grad_norm": 58.109980325157714, "learning_rate": 1.615078542190228e-06, "logits/chosen": 18.619295120239258, "logits/rejected": 23.427764892578125, "logps/chosen": -153.34585571289062, "logps/rejected": -298.84844970703125, "loss": 0.0532, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -9.80422306060791, "rewards/margins": 12.987719535827637, "rewards/rejected": -22.791942596435547, "step": 660 }, { "epoch": 0.536, "grad_norm": 12.579940547510622, "learning_rate": 1.5732746546934201e-06, "logits/chosen": 18.508384704589844, "logits/rejected": 23.189361572265625, "logps/chosen": -153.68649291992188, "logps/rejected": -281.5841979980469, "loss": 0.0459, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -10.447421073913574, "rewards/margins": 11.287437438964844, "rewards/rejected": -21.7348575592041, "step": 670 }, { "epoch": 0.544, "grad_norm": 3.9836905566738907, "learning_rate": 1.5314136298250356e-06, "logits/chosen": 19.70537567138672, "logits/rejected": 21.990859985351562, "logps/chosen": -177.63992309570312, "logps/rejected": -290.6033630371094, "loss": 0.0641, "rewards/accuracies": 1.0, "rewards/chosen": -11.616503715515137, "rewards/margins": 10.820137023925781, "rewards/rejected": -22.436641693115234, "step": 680 }, { "epoch": 0.552, "grad_norm": 0.03655701442918885, "learning_rate": 1.4895281095530578e-06, "logits/chosen": 19.422595977783203, "logits/rejected": 21.195425033569336, "logps/chosen": -176.49839782714844, "logps/rejected": -313.53497314453125, "loss": 0.0229, "rewards/accuracies": 1.0, "rewards/chosen": -11.55458927154541, "rewards/margins": 13.00189208984375, "rewards/rejected": -24.556480407714844, "step": 690 }, { "epoch": 0.56, "grad_norm": 14.189469953880016, "learning_rate": 1.4476507549462489e-06, "logits/chosen": 19.599300384521484, "logits/rejected": 22.10235023498535, "logps/chosen": -175.69180297851562, "logps/rejected": -306.8517150878906, "loss": 0.0487, "rewards/accuracies": 1.0, "rewards/chosen": -11.599604606628418, "rewards/margins": 12.865476608276367, "rewards/rejected": -24.46508026123047, "step": 700 }, { "epoch": 0.568, "grad_norm": 16.687171595570835, "learning_rate": 1.40581422070603e-06, "logits/chosen": 18.665790557861328, "logits/rejected": 22.680692672729492, "logps/chosen": -139.86900329589844, "logps/rejected": -305.75372314453125, "loss": 0.0259, "rewards/accuracies": 1.0, "rewards/chosen": -9.141308784484863, "rewards/margins": 14.639094352722168, "rewards/rejected": -23.78040313720703, "step": 710 }, { "epoch": 0.576, "grad_norm": 30.88419300499099, "learning_rate": 1.36405112970333e-06, "logits/chosen": 20.21243667602539, "logits/rejected": 22.893695831298828, "logps/chosen": -158.63064575195312, "logps/rejected": -314.7591247558594, "loss": 0.0607, "rewards/accuracies": 1.0, "rewards/chosen": -9.554333686828613, "rewards/margins": 14.731111526489258, "rewards/rejected": -24.285442352294922, "step": 720 }, { "epoch": 0.584, "grad_norm": 0.5428652197520796, "learning_rate": 1.3223940475402486e-06, "logits/chosen": 18.03298568725586, "logits/rejected": 20.249217987060547, "logps/chosen": -156.5733642578125, "logps/rejected": -343.110107421875, "loss": 0.0561, "rewards/accuracies": 1.0, "rewards/chosen": -9.43940258026123, "rewards/margins": 17.574020385742188, "rewards/rejected": -27.0134220123291, "step": 730 }, { "epoch": 0.592, "grad_norm": 1.2508849959109967, "learning_rate": 1.2808754571563827e-06, "logits/chosen": 20.323490142822266, "logits/rejected": 20.145631790161133, "logps/chosen": -172.84413146972656, "logps/rejected": -311.8101501464844, "loss": 0.0922, "rewards/accuracies": 1.0, "rewards/chosen": -11.189407348632812, "rewards/margins": 13.641016960144043, "rewards/rejected": -24.830425262451172, "step": 740 }, { "epoch": 0.6, "grad_norm": 2.991663806982443, "learning_rate": 1.2395277334996047e-06, "logits/chosen": 18.46148681640625, "logits/rejected": 19.215234756469727, "logps/chosen": -161.73716735839844, "logps/rejected": -280.82550048828125, "loss": 0.0407, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -9.929094314575195, "rewards/margins": 11.809922218322754, "rewards/rejected": -21.739017486572266, "step": 750 }, { "epoch": 0.608, "grad_norm": 20.4365745548988, "learning_rate": 1.1983831182810534e-06, "logits/chosen": 18.275842666625977, "logits/rejected": 21.843782424926758, "logps/chosen": -164.38780212402344, "logps/rejected": -306.2947998046875, "loss": 0.0164, "rewards/accuracies": 1.0, "rewards/chosen": -11.01513957977295, "rewards/margins": 12.679012298583984, "rewards/rejected": -23.694150924682617, "step": 760 }, { "epoch": 0.616, "grad_norm": 55.8762128914233, "learning_rate": 1.1574736948340164e-06, "logits/chosen": 17.70195960998535, "logits/rejected": 19.546524047851562, "logps/chosen": -178.3878173828125, "logps/rejected": -305.541259765625, "loss": 0.0884, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -11.889364242553711, "rewards/margins": 11.622591972351074, "rewards/rejected": -23.5119571685791, "step": 770 }, { "epoch": 0.624, "grad_norm": 16.837136970797925, "learning_rate": 1.1168313630963144e-06, "logits/chosen": 14.999873161315918, "logits/rejected": 17.94775390625, "logps/chosen": -173.88241577148438, "logps/rejected": -343.7441101074219, "loss": 0.0886, "rewards/accuracies": 1.0, "rewards/chosen": -11.541296005249023, "rewards/margins": 15.598657608032227, "rewards/rejected": -27.13995361328125, "step": 780 }, { "epoch": 0.632, "grad_norm": 20.598469671539753, "learning_rate": 1.0764878147356852e-06, "logits/chosen": 16.952680587768555, "logits/rejected": 19.94651222229004, "logps/chosen": -156.38262939453125, "logps/rejected": -301.9223327636719, "loss": 0.0413, "rewards/accuracies": 1.0, "rewards/chosen": -10.413267135620117, "rewards/margins": 13.252962112426758, "rewards/rejected": -23.66622543334961, "step": 790 }, { "epoch": 0.64, "grad_norm": 0.7362208350991403, "learning_rate": 1.036474508437579e-06, "logits/chosen": 19.5678653717041, "logits/rejected": 21.027923583984375, "logps/chosen": -173.03187561035156, "logps/rejected": -347.1485900878906, "loss": 0.0423, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -10.36100959777832, "rewards/margins": 16.25722885131836, "rewards/rejected": -26.618236541748047, "step": 800 }, { "epoch": 0.648, "grad_norm": 0.46643139845120424, "learning_rate": 9.968226453746177e-07, "logits/chosen": 14.546850204467773, "logits/rejected": 18.3260555267334, "logps/chosen": -182.55184936523438, "logps/rejected": -322.73321533203125, "loss": 0.0687, "rewards/accuracies": 1.0, "rewards/chosen": -12.82792854309082, "rewards/margins": 12.780898094177246, "rewards/rejected": -25.60882568359375, "step": 810 }, { "epoch": 0.656, "grad_norm": 0.007677469812290264, "learning_rate": 9.575631448768617e-07, "logits/chosen": 15.806520462036133, "logits/rejected": 19.548625946044922, "logps/chosen": -189.52545166015625, "logps/rejected": -327.7802429199219, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": -12.922433853149414, "rewards/margins": 12.425726890563965, "rewards/rejected": -25.348161697387695, "step": 820 }, { "epoch": 0.664, "grad_norm": 2.0558318532148774, "learning_rate": 9.187266203218456e-07, "logits/chosen": 17.683448791503906, "logits/rejected": 20.042156219482422, "logps/chosen": -159.53077697753906, "logps/rejected": -312.9973449707031, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": -10.393224716186523, "rewards/margins": 14.232980728149414, "rewards/rejected": -24.626201629638672, "step": 830 }, { "epoch": 0.672, "grad_norm": 17.198595889469797, "learning_rate": 8.803433552631875e-07, "logits/chosen": 17.60666275024414, "logits/rejected": 17.99045181274414, "logps/chosen": -170.3730926513672, "logps/rejected": -328.09185791015625, "loss": 0.0873, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -10.795900344848633, "rewards/margins": 15.03393840789795, "rewards/rejected": -25.8298397064209, "step": 840 }, { "epoch": 0.68, "grad_norm": 1.3072581313287024, "learning_rate": 8.424432798163837e-07, "logits/chosen": 18.38207244873047, "logits/rejected": 19.747913360595703, "logps/chosen": -165.44735717773438, "logps/rejected": -309.8100891113281, "loss": 0.0274, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -10.688611030578613, "rewards/margins": 13.613398551940918, "rewards/rejected": -24.302011489868164, "step": 850 }, { "epoch": 0.688, "grad_norm": 0.4582264747297489, "learning_rate": 8.050559473202078e-07, "logits/chosen": 14.035835266113281, "logits/rejected": 18.153425216674805, "logps/chosen": -167.19036865234375, "logps/rejected": -312.13775634765625, "loss": 0.0353, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -11.431414604187012, "rewards/margins": 12.939321517944336, "rewards/rejected": -24.370737075805664, "step": 860 }, { "epoch": 0.696, "grad_norm": 3.136356793214795, "learning_rate": 7.682105112919007e-07, "logits/chosen": 19.952112197875977, "logits/rejected": 21.141132354736328, "logps/chosen": -154.23995971679688, "logps/rejected": -319.5997009277344, "loss": 0.0262, "rewards/accuracies": 1.0, "rewards/chosen": -9.864435195922852, "rewards/margins": 15.159929275512695, "rewards/rejected": -25.024364471435547, "step": 870 }, { "epoch": 0.704, "grad_norm": 3.519769878341393, "learning_rate": 7.319357026941429e-07, "logits/chosen": 19.199859619140625, "logits/rejected": 19.918563842773438, "logps/chosen": -194.5305938720703, "logps/rejected": -317.95941162109375, "loss": 0.0509, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -13.087428092956543, "rewards/margins": 12.253093719482422, "rewards/rejected": -25.34052276611328, "step": 880 }, { "epoch": 0.712, "grad_norm": 24.857735299978447, "learning_rate": 6.962598075315047e-07, "logits/chosen": 17.146230697631836, "logits/rejected": 20.964147567749023, "logps/chosen": -182.15980529785156, "logps/rejected": -345.76788330078125, "loss": 0.0118, "rewards/accuracies": 1.0, "rewards/chosen": -12.32874584197998, "rewards/margins": 14.388389587402344, "rewards/rejected": -26.71713638305664, "step": 890 }, { "epoch": 0.72, "grad_norm": 0.318353767436957, "learning_rate": 6.6121064479388e-07, "logits/chosen": 17.372299194335938, "logits/rejected": 20.967824935913086, "logps/chosen": -164.0208282470703, "logps/rejected": -318.7355651855469, "loss": 0.0267, "rewards/accuracies": 1.0, "rewards/chosen": -11.150172233581543, "rewards/margins": 13.733154296875, "rewards/rejected": -24.88332748413086, "step": 900 }, { "epoch": 0.728, "grad_norm": 3.8936657076963193, "learning_rate": 6.268155447640661e-07, "logits/chosen": 18.852811813354492, "logits/rejected": 21.377389907836914, "logps/chosen": -168.42544555664062, "logps/rejected": -329.688232421875, "loss": 0.0379, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -11.377861976623535, "rewards/margins": 15.148188591003418, "rewards/rejected": -26.526050567626953, "step": 910 }, { "epoch": 0.736, "grad_norm": 8.372162670348306, "learning_rate": 5.931013277064378e-07, "logits/chosen": 15.737344741821289, "logits/rejected": 17.821592330932617, "logps/chosen": -168.76051330566406, "logps/rejected": -316.5263366699219, "loss": 0.0359, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -11.1880521774292, "rewards/margins": 13.797701835632324, "rewards/rejected": -24.985754013061523, "step": 920 }, { "epoch": 0.744, "grad_norm": 19.290631890698684, "learning_rate": 5.600942829533097e-07, "logits/chosen": 16.14108657836914, "logits/rejected": 18.526290893554688, "logps/chosen": -183.9233856201172, "logps/rejected": -345.7073669433594, "loss": 0.0664, "rewards/accuracies": 1.0, "rewards/chosen": -12.947591781616211, "rewards/margins": 14.908193588256836, "rewards/rejected": -27.855789184570312, "step": 930 }, { "epoch": 0.752, "grad_norm": 28.437105122450188, "learning_rate": 5.278201484053037e-07, "logits/chosen": 14.238430976867676, "logits/rejected": 15.927212715148926, "logps/chosen": -175.82205200195312, "logps/rejected": -357.60577392578125, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/chosen": -11.766983032226562, "rewards/margins": 17.238004684448242, "rewards/rejected": -29.004989624023438, "step": 940 }, { "epoch": 0.76, "grad_norm": 47.398111998155244, "learning_rate": 4.963040904617131e-07, "logits/chosen": 14.369181632995605, "logits/rejected": 16.830841064453125, "logps/chosen": -195.03958129882812, "logps/rejected": -361.30755615234375, "loss": 0.0405, "rewards/accuracies": 1.0, "rewards/chosen": -13.507547378540039, "rewards/margins": 15.785783767700195, "rewards/rejected": -29.2933292388916, "step": 950 }, { "epoch": 0.768, "grad_norm": 18.929417271431557, "learning_rate": 4.6557068439649533e-07, "logits/chosen": 12.442909240722656, "logits/rejected": 15.81037712097168, "logps/chosen": -172.2083740234375, "logps/rejected": -346.9830627441406, "loss": 0.0164, "rewards/accuracies": 1.0, "rewards/chosen": -11.788956642150879, "rewards/margins": 15.845266342163086, "rewards/rejected": -27.63422203063965, "step": 960 }, { "epoch": 0.776, "grad_norm": 0.010537786096251401, "learning_rate": 4.3564389519521896e-07, "logits/chosen": 11.128179550170898, "logits/rejected": 15.002403259277344, "logps/chosen": -179.82070922851562, "logps/rejected": -359.905517578125, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": -12.77198314666748, "rewards/margins": 16.542465209960938, "rewards/rejected": -29.3144474029541, "step": 970 }, { "epoch": 0.784, "grad_norm": 3.2903365603646555, "learning_rate": 4.06547058867883e-07, "logits/chosen": 13.022871017456055, "logits/rejected": 17.119640350341797, "logps/chosen": -183.79476928710938, "logps/rejected": -351.65728759765625, "loss": 0.0185, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -12.853482246398926, "rewards/margins": 15.206174850463867, "rewards/rejected": -28.059656143188477, "step": 980 }, { "epoch": 0.792, "grad_norm": 8.653995083366096, "learning_rate": 3.7830286425220237e-07, "logits/chosen": 14.448740005493164, "logits/rejected": 16.771413803100586, "logps/chosen": -181.69955444335938, "logps/rejected": -360.3311767578125, "loss": 0.0337, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -12.328413009643555, "rewards/margins": 16.78951644897461, "rewards/rejected": -29.117929458618164, "step": 990 }, { "epoch": 0.8, "grad_norm": 0.028161198779737273, "learning_rate": 3.5093333532153313e-07, "logits/chosen": 12.604635238647461, "logits/rejected": 16.513111114501953, "logps/chosen": -170.95950317382812, "logps/rejected": -356.91705322265625, "loss": 0.0152, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -11.3618745803833, "rewards/margins": 17.1263370513916, "rewards/rejected": -28.488210678100586, "step": 1000 }, { "epoch": 0.808, "grad_norm": 1.7845230113365977, "learning_rate": 3.2445981401124044e-07, "logits/chosen": 12.725648880004883, "logits/rejected": 15.18701457977295, "logps/chosen": -182.36656188964844, "logps/rejected": -355.4485778808594, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -12.828389167785645, "rewards/margins": 16.320749282836914, "rewards/rejected": -29.149139404296875, "step": 1010 }, { "epoch": 0.816, "grad_norm": 0.034587439255628315, "learning_rate": 2.9890294357689994e-07, "logits/chosen": 14.864664077758789, "logits/rejected": 18.851364135742188, "logps/chosen": -199.5797576904297, "logps/rejected": -365.7235412597656, "loss": 0.0171, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -13.424786567687988, "rewards/margins": 15.109170913696289, "rewards/rejected": -28.533960342407227, "step": 1020 }, { "epoch": 0.824, "grad_norm": 0.03435180433087091, "learning_rate": 2.7428265249730726e-07, "logits/chosen": 12.820098876953125, "logits/rejected": 15.737815856933594, "logps/chosen": -192.89608764648438, "logps/rejected": -351.27911376953125, "loss": 0.0322, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -13.299699783325195, "rewards/margins": 14.958763122558594, "rewards/rejected": -28.258464813232422, "step": 1030 }, { "epoch": 0.832, "grad_norm": 3.4002892431899436, "learning_rate": 2.5061813893485086e-07, "logits/chosen": 14.156018257141113, "logits/rejected": 17.030376434326172, "logps/chosen": -179.10986328125, "logps/rejected": -335.97186279296875, "loss": 0.0141, "rewards/accuracies": 1.0, "rewards/chosen": -11.678332328796387, "rewards/margins": 14.767659187316895, "rewards/rejected": -26.44598960876465, "step": 1040 }, { "epoch": 0.84, "grad_norm": 0.020870062099409628, "learning_rate": 2.2792785576536108e-07, "logits/chosen": 12.851526260375977, "logits/rejected": 17.06875228881836, "logps/chosen": -163.72132873535156, "logps/rejected": -354.77117919921875, "loss": 0.0303, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -10.758419036865234, "rewards/margins": 16.93534278869629, "rewards/rejected": -27.693761825561523, "step": 1050 }, { "epoch": 0.848, "grad_norm": 0.12145713143714221, "learning_rate": 2.062294961891138e-07, "logits/chosen": 13.231893539428711, "logits/rejected": 16.38172149658203, "logps/chosen": -175.502685546875, "logps/rejected": -341.85650634765625, "loss": 0.0067, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -11.950915336608887, "rewards/margins": 15.844401359558105, "rewards/rejected": -27.795318603515625, "step": 1060 }, { "epoch": 0.856, "grad_norm": 43.872269822135024, "learning_rate": 1.8553997993420495e-07, "logits/chosen": 11.042947769165039, "logits/rejected": 14.6314697265625, "logps/chosen": -167.84786987304688, "logps/rejected": -359.76239013671875, "loss": 0.0365, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -11.415273666381836, "rewards/margins": 17.310102462768555, "rewards/rejected": -28.72537612915039, "step": 1070 }, { "epoch": 0.864, "grad_norm": 16.613133726739267, "learning_rate": 1.6587544006305372e-07, "logits/chosen": 14.675074577331543, "logits/rejected": 14.268391609191895, "logps/chosen": -200.9607391357422, "logps/rejected": -337.6705627441406, "loss": 0.0206, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -13.507387161254883, "rewards/margins": 13.960055351257324, "rewards/rejected": -27.46744155883789, "step": 1080 }, { "epoch": 0.872, "grad_norm": 0.10757306008079003, "learning_rate": 1.4725121039232948e-07, "logits/chosen": 14.780688285827637, "logits/rejected": 16.79671287536621, "logps/chosen": -167.3707733154297, "logps/rejected": -333.1465759277344, "loss": 0.0274, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -10.258435249328613, "rewards/margins": 16.166662216186523, "rewards/rejected": -26.425098419189453, "step": 1090 }, { "epoch": 0.88, "grad_norm": 53.565796368395205, "learning_rate": 1.2968181353609853e-07, "logits/chosen": 13.000600814819336, "logits/rejected": 15.201478958129883, "logps/chosen": -158.11436462402344, "logps/rejected": -326.98077392578125, "loss": 0.0248, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -10.076480865478516, "rewards/margins": 16.441701889038086, "rewards/rejected": -26.518178939819336, "step": 1100 }, { "epoch": 0.888, "grad_norm": 0.4931850288780545, "learning_rate": 1.1318094958153047e-07, "logits/chosen": 15.825027465820312, "logits/rejected": 18.127300262451172, "logps/chosen": -175.34991455078125, "logps/rejected": -349.7121276855469, "loss": 0.0138, "rewards/accuracies": 1.0, "rewards/chosen": -11.070804595947266, "rewards/margins": 16.37197494506836, "rewards/rejected": -27.44277572631836, "step": 1110 }, { "epoch": 0.896, "grad_norm": 0.12431725759551986, "learning_rate": 9.776148540597835e-08, "logits/chosen": 14.717602729797363, "logits/rejected": 16.464252471923828, "logps/chosen": -171.41806030273438, "logps/rejected": -341.33636474609375, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -11.874577522277832, "rewards/margins": 15.349774360656738, "rewards/rejected": -27.224353790283203, "step": 1120 }, { "epoch": 0.904, "grad_norm": 44.43625690363336, "learning_rate": 8.34354446437785e-08, "logits/chosen": 13.284322738647461, "logits/rejected": 14.527437210083008, "logps/chosen": -176.1920166015625, "logps/rejected": -344.534912109375, "loss": 0.0616, "rewards/accuracies": 1.0, "rewards/chosen": -11.812251091003418, "rewards/margins": 16.68192481994629, "rewards/rejected": -28.49417495727539, "step": 1130 }, { "epoch": 0.912, "grad_norm": 0.3325545642344925, "learning_rate": 7.021399831057961e-08, "logits/chosen": 14.616548538208008, "logits/rejected": 18.06045913696289, "logps/chosen": -175.67092895507812, "logps/rejected": -326.31268310546875, "loss": 0.0184, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -11.880111694335938, "rewards/margins": 14.142997741699219, "rewards/rejected": -26.023107528686523, "step": 1140 }, { "epoch": 0.92, "grad_norm": 9.535643407043993, "learning_rate": 5.810745609252166e-08, "logits/chosen": 15.016606330871582, "logits/rejected": 19.4204044342041, "logps/chosen": -171.228515625, "logps/rejected": -339.8002014160156, "loss": 0.016, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -11.794087409973145, "rewards/margins": 14.61567497253418, "rewards/rejected": -26.40976333618164, "step": 1150 }, { "epoch": 0.928, "grad_norm": 0.1185713320229559, "learning_rate": 4.712525830705339e-08, "logits/chosen": 14.084701538085938, "logits/rejected": 15.02185344696045, "logps/chosen": -175.77389526367188, "logps/rejected": -327.8313903808594, "loss": 0.0058, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -11.838964462280273, "rewards/margins": 15.071843147277832, "rewards/rejected": -26.91080665588379, "step": 1160 }, { "epoch": 0.936, "grad_norm": 49.630514506359575, "learning_rate": 3.72759685416551e-08, "logits/chosen": 12.91785717010498, "logits/rejected": 16.28598403930664, "logps/chosen": -193.53079223632812, "logps/rejected": -346.7781982421875, "loss": 0.0213, "rewards/accuracies": 1.0, "rewards/chosen": -13.229898452758789, "rewards/margins": 14.539329528808594, "rewards/rejected": -27.76923179626465, "step": 1170 }, { "epoch": 0.944, "grad_norm": 2.386421051511265, "learning_rate": 2.8567266976212704e-08, "logits/chosen": 13.009920120239258, "logits/rejected": 16.784725189208984, "logps/chosen": -174.90267944335938, "logps/rejected": -360.4608154296875, "loss": 0.0274, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -11.592721939086914, "rewards/margins": 16.40024185180664, "rewards/rejected": -27.992965698242188, "step": 1180 }, { "epoch": 0.952, "grad_norm": 0.09581452353121599, "learning_rate": 2.1005944394242692e-08, "logits/chosen": 14.088732719421387, "logits/rejected": 14.811144828796387, "logps/chosen": -190.23629760742188, "logps/rejected": -340.3313903808594, "loss": 0.0103, "rewards/accuracies": 1.0, "rewards/chosen": -12.186540603637695, "rewards/margins": 15.273702621459961, "rewards/rejected": -27.460241317749023, "step": 1190 }, { "epoch": 0.96, "grad_norm": 2.5596226732992444, "learning_rate": 1.4597896887644457e-08, "logits/chosen": 14.2628812789917, "logits/rejected": 15.2882661819458, "logps/chosen": -183.11183166503906, "logps/rejected": -330.42401123046875, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -11.977328300476074, "rewards/margins": 14.7813720703125, "rewards/rejected": -26.75870132446289, "step": 1200 }, { "epoch": 0.968, "grad_norm": 8.966198799351712, "learning_rate": 9.348121259105447e-09, "logits/chosen": 13.10435962677002, "logits/rejected": 15.16510009765625, "logps/chosen": -194.73561096191406, "logps/rejected": -350.19671630859375, "loss": 0.0139, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -13.223831176757812, "rewards/margins": 14.7427978515625, "rewards/rejected": -27.966629028320312, "step": 1210 }, { "epoch": 0.976, "grad_norm": 2.1731183890345145, "learning_rate": 5.260711125743445e-09, "logits/chosen": 11.423822402954102, "logits/rejected": 15.956392288208008, "logps/chosen": -157.79640197753906, "logps/rejected": -352.1459655761719, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": -10.58418083190918, "rewards/margins": 17.188343048095703, "rewards/rejected": -27.772525787353516, "step": 1220 }, { "epoch": 0.984, "grad_norm": 0.000975641347440435, "learning_rate": 2.3388537270284673e-09, "logits/chosen": 14.252192497253418, "logits/rejected": 16.61027717590332, "logps/chosen": -179.70510864257812, "logps/rejected": -353.10711669921875, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -12.220856666564941, "rewards/margins": 16.520856857299805, "rewards/rejected": -28.741714477539062, "step": 1230 }, { "epoch": 0.992, "grad_norm": 0.5744272246950218, "learning_rate": 5.848274394684716e-10, "logits/chosen": 14.274118423461914, "logits/rejected": 17.041919708251953, "logps/chosen": -176.65518188476562, "logps/rejected": -355.2174377441406, "loss": 0.0222, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -12.072566032409668, "rewards/margins": 16.45151710510254, "rewards/rejected": -28.524078369140625, "step": 1240 }, { "epoch": 1.0, "grad_norm": 0.016349832310295288, "learning_rate": 0.0, "logits/chosen": 12.271159172058105, "logits/rejected": 15.676365852355957, "logps/chosen": -191.03286743164062, "logps/rejected": -355.4150390625, "loss": 0.0361, "rewards/accuracies": 1.0, "rewards/chosen": -13.313619613647461, "rewards/margins": 14.883234024047852, "rewards/rejected": -28.196853637695312, "step": 1250 }, { "epoch": 1.0, "step": 1250, "total_flos": 0.0, "train_loss": 0.12455665428638459, "train_runtime": 20866.0985, "train_samples_per_second": 0.958, "train_steps_per_second": 0.06 } ], "logging_steps": 10, "max_steps": 1250, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }