File size: 77,691 Bytes
a57b801
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 100,
  "global_step": 91,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "debug/policy_chosen_logits": 22.537443161010742,
      "debug/policy_chosen_logps": -454.7864685058594,
      "debug/policy_rejected_logits": 24.08443260192871,
      "debug/policy_rejected_logps": -485.6905517578125,
      "debug/reference_chosen_logps": -454.7864685058594,
      "debug/reference_rejected_logps": -485.6905517578125,
      "epoch": 0.01098901098901099,
      "grad_norm": 6.833481499698497,
      "learning_rate": 1e-06,
      "logits/chosen": 22.537443161010742,
      "logits/rejected": 24.08443260192871,
      "logps/chosen": -454.7864685058594,
      "logps/rejected": -485.6905517578125,
      "loss": 0.5,
      "rewards/accuracies": 0.0,
      "rewards/chosen": 0.0,
      "rewards/margins": 0.0,
      "rewards/rejected": 0.0,
      "step": 1
    },
    {
      "debug/policy_chosen_logits": 25.2491512298584,
      "debug/policy_chosen_logps": -426.63800048828125,
      "debug/policy_rejected_logits": 25.061065673828125,
      "debug/policy_rejected_logps": -446.8348083496094,
      "debug/reference_chosen_logps": -426.80908203125,
      "debug/reference_rejected_logps": -446.17877197265625,
      "epoch": 0.02197802197802198,
      "grad_norm": 8.518360012009484,
      "learning_rate": 1e-06,
      "logits/chosen": 25.2491512298584,
      "logits/rejected": 25.061065673828125,
      "logps/chosen": -426.63800048828125,
      "logps/rejected": -446.8348083496094,
      "loss": 0.5003,
      "rewards/accuracies": 0.625,
      "rewards/chosen": 0.0017108535394072533,
      "rewards/margins": 0.008270950056612492,
      "rewards/rejected": -0.006560096517205238,
      "step": 2
    },
    {
      "debug/policy_chosen_logits": 23.04501724243164,
      "debug/policy_chosen_logps": -418.5323791503906,
      "debug/policy_rejected_logits": 26.21137809753418,
      "debug/policy_rejected_logps": -445.2972412109375,
      "debug/reference_chosen_logps": -418.4830627441406,
      "debug/reference_rejected_logps": -445.174560546875,
      "epoch": 0.03296703296703297,
      "grad_norm": 8.234274066877381,
      "learning_rate": 1e-06,
      "logits/chosen": 23.04501724243164,
      "logits/rejected": 26.21137809753418,
      "logps/chosen": -418.5323791503906,
      "logps/rejected": -445.2972412109375,
      "loss": 0.497,
      "rewards/accuracies": 0.5,
      "rewards/chosen": -0.0004932782612740993,
      "rewards/margins": 0.0007336426060646772,
      "rewards/rejected": -0.001226921333000064,
      "step": 3
    },
    {
      "debug/policy_chosen_logits": 22.864124298095703,
      "debug/policy_chosen_logps": -424.31500244140625,
      "debug/policy_rejected_logits": 24.804826736450195,
      "debug/policy_rejected_logps": -440.4050598144531,
      "debug/reference_chosen_logps": -424.2529296875,
      "debug/reference_rejected_logps": -440.0723876953125,
      "epoch": 0.04395604395604396,
      "grad_norm": 8.267600116778256,
      "learning_rate": 1e-06,
      "logits/chosen": 22.864124298095703,
      "logits/rejected": 24.804826736450195,
      "logps/chosen": -424.31500244140625,
      "logps/rejected": -440.4050598144531,
      "loss": 0.4948,
      "rewards/accuracies": 0.625,
      "rewards/chosen": -0.000620613107457757,
      "rewards/margins": 0.0027059551794081926,
      "rewards/rejected": -0.0033265682868659496,
      "step": 4
    },
    {
      "debug/policy_chosen_logits": 23.74032211303711,
      "debug/policy_chosen_logps": -425.2148132324219,
      "debug/policy_rejected_logits": 24.66983985900879,
      "debug/policy_rejected_logps": -428.9949951171875,
      "debug/reference_chosen_logps": -425.707763671875,
      "debug/reference_rejected_logps": -427.61578369140625,
      "epoch": 0.054945054945054944,
      "grad_norm": 7.1377920370536465,
      "learning_rate": 1e-06,
      "logits/chosen": 23.74032211303711,
      "logits/rejected": 24.66983985900879,
      "logps/chosen": -425.2148132324219,
      "logps/rejected": -428.9949951171875,
      "loss": 0.4915,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.004929542541503906,
      "rewards/margins": 0.018722152337431908,
      "rewards/rejected": -0.013792609795928001,
      "step": 5
    },
    {
      "debug/policy_chosen_logits": 22.908384323120117,
      "debug/policy_chosen_logps": -436.1788635253906,
      "debug/policy_rejected_logits": 26.066926956176758,
      "debug/policy_rejected_logps": -458.21295166015625,
      "debug/reference_chosen_logps": -437.0372619628906,
      "debug/reference_rejected_logps": -455.97064208984375,
      "epoch": 0.06593406593406594,
      "grad_norm": 6.840813228978009,
      "learning_rate": 1e-06,
      "logits/chosen": 22.908384323120117,
      "logits/rejected": 26.066926956176758,
      "logps/chosen": -436.1788635253906,
      "logps/rejected": -458.21295166015625,
      "loss": 0.4897,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.008583908900618553,
      "rewards/margins": 0.031006699427962303,
      "rewards/rejected": -0.02242279052734375,
      "step": 6
    },
    {
      "debug/policy_chosen_logits": 27.81578826904297,
      "debug/policy_chosen_logps": -456.6412353515625,
      "debug/policy_rejected_logits": 22.0266170501709,
      "debug/policy_rejected_logps": -461.65155029296875,
      "debug/reference_chosen_logps": -457.8715515136719,
      "debug/reference_rejected_logps": -459.9510498046875,
      "epoch": 0.07692307692307693,
      "grad_norm": 7.815334550310639,
      "learning_rate": 1e-06,
      "logits/chosen": 27.81578826904297,
      "logits/rejected": 22.0266170501709,
      "logps/chosen": -456.6412353515625,
      "logps/rejected": -461.65155029296875,
      "loss": 0.4758,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.012303275987505913,
      "rewards/margins": 0.029308240860700607,
      "rewards/rejected": -0.017004966735839844,
      "step": 7
    },
    {
      "debug/policy_chosen_logits": 21.05910301208496,
      "debug/policy_chosen_logps": -420.5210876464844,
      "debug/policy_rejected_logits": 21.690637588500977,
      "debug/policy_rejected_logps": -461.24737548828125,
      "debug/reference_chosen_logps": -421.00885009765625,
      "debug/reference_rejected_logps": -459.6142578125,
      "epoch": 0.08791208791208792,
      "grad_norm": 6.940663555739121,
      "learning_rate": 1e-06,
      "logits/chosen": 21.05910301208496,
      "logits/rejected": 21.690637588500977,
      "logps/chosen": -420.5210876464844,
      "logps/rejected": -461.24737548828125,
      "loss": 0.4748,
      "rewards/accuracies": 0.5,
      "rewards/chosen": 0.004877815023064613,
      "rewards/margins": 0.02120864763855934,
      "rewards/rejected": -0.016330832615494728,
      "step": 8
    },
    {
      "debug/policy_chosen_logits": 25.456287384033203,
      "debug/policy_chosen_logps": -440.22540283203125,
      "debug/policy_rejected_logits": 26.438932418823242,
      "debug/policy_rejected_logps": -419.1558837890625,
      "debug/reference_chosen_logps": -439.4281921386719,
      "debug/reference_rejected_logps": -417.99652099609375,
      "epoch": 0.0989010989010989,
      "grad_norm": 5.7812586478558865,
      "learning_rate": 1e-06,
      "logits/chosen": 25.456287384033203,
      "logits/rejected": 26.438932418823242,
      "logps/chosen": -440.22540283203125,
      "logps/rejected": -419.1558837890625,
      "loss": 0.4769,
      "rewards/accuracies": 0.5,
      "rewards/chosen": -0.007972106337547302,
      "rewards/margins": 0.003621369134634733,
      "rewards/rejected": -0.011593475937843323,
      "step": 9
    },
    {
      "debug/policy_chosen_logits": 22.315250396728516,
      "debug/policy_chosen_logps": -425.8302001953125,
      "debug/policy_rejected_logits": 28.707416534423828,
      "debug/policy_rejected_logps": -453.5276184082031,
      "debug/reference_chosen_logps": -427.7785339355469,
      "debug/reference_rejected_logps": -450.7994079589844,
      "epoch": 0.10989010989010989,
      "grad_norm": 6.765369287722621,
      "learning_rate": 1e-06,
      "logits/chosen": 22.315250396728516,
      "logits/rejected": 28.707416534423828,
      "logps/chosen": -425.8302001953125,
      "logps/rejected": -453.5276184082031,
      "loss": 0.4488,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.019483529031276703,
      "rewards/margins": 0.04676567018032074,
      "rewards/rejected": -0.027282143011689186,
      "step": 10
    },
    {
      "debug/policy_chosen_logits": 23.35513687133789,
      "debug/policy_chosen_logps": -409.0987548828125,
      "debug/policy_rejected_logits": 25.19911003112793,
      "debug/policy_rejected_logps": -457.9080810546875,
      "debug/reference_chosen_logps": -411.0341796875,
      "debug/reference_rejected_logps": -452.00286865234375,
      "epoch": 0.12087912087912088,
      "grad_norm": 6.194229784289876,
      "learning_rate": 1e-06,
      "logits/chosen": 23.35513687133789,
      "logits/rejected": 25.19911003112793,
      "logps/chosen": -409.0987548828125,
      "logps/rejected": -457.9080810546875,
      "loss": 0.4512,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.01935398206114769,
      "rewards/margins": 0.07840606570243835,
      "rewards/rejected": -0.059052083641290665,
      "step": 11
    },
    {
      "debug/policy_chosen_logits": 24.74183464050293,
      "debug/policy_chosen_logps": -443.5227355957031,
      "debug/policy_rejected_logits": 23.557279586791992,
      "debug/policy_rejected_logps": -438.05023193359375,
      "debug/reference_chosen_logps": -444.56549072265625,
      "debug/reference_rejected_logps": -433.29443359375,
      "epoch": 0.13186813186813187,
      "grad_norm": 6.295588890883081,
      "learning_rate": 1e-06,
      "logits/chosen": 24.74183464050293,
      "logits/rejected": 23.557279586791992,
      "logps/chosen": -443.5227355957031,
      "logps/rejected": -438.05023193359375,
      "loss": 0.444,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.010427666828036308,
      "rewards/margins": 0.057986069470644,
      "rewards/rejected": -0.04755840077996254,
      "step": 12
    },
    {
      "debug/policy_chosen_logits": 25.26060676574707,
      "debug/policy_chosen_logps": -401.9018249511719,
      "debug/policy_rejected_logits": 25.899250030517578,
      "debug/policy_rejected_logps": -447.271240234375,
      "debug/reference_chosen_logps": -407.1501159667969,
      "debug/reference_rejected_logps": -443.3197326660156,
      "epoch": 0.14285714285714285,
      "grad_norm": 6.651428295339092,
      "learning_rate": 1e-06,
      "logits/chosen": 25.26060676574707,
      "logits/rejected": 25.899250030517578,
      "logps/chosen": -401.9018249511719,
      "logps/rejected": -447.271240234375,
      "loss": 0.4438,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.05248305946588516,
      "rewards/margins": 0.0919981375336647,
      "rewards/rejected": -0.03951507434248924,
      "step": 13
    },
    {
      "debug/policy_chosen_logits": 27.350528717041016,
      "debug/policy_chosen_logps": -431.6693115234375,
      "debug/policy_rejected_logits": 27.774742126464844,
      "debug/policy_rejected_logps": -437.4266052246094,
      "debug/reference_chosen_logps": -437.1637268066406,
      "debug/reference_rejected_logps": -433.93536376953125,
      "epoch": 0.15384615384615385,
      "grad_norm": 6.44432847940225,
      "learning_rate": 1e-06,
      "logits/chosen": 27.350528717041016,
      "logits/rejected": 27.774742126464844,
      "logps/chosen": -431.6693115234375,
      "logps/rejected": -437.4266052246094,
      "loss": 0.4203,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.05494399741292,
      "rewards/margins": 0.08985621482133865,
      "rewards/rejected": -0.034912221133708954,
      "step": 14
    },
    {
      "debug/policy_chosen_logits": 26.464439392089844,
      "debug/policy_chosen_logps": -441.0960998535156,
      "debug/policy_rejected_logits": 22.606828689575195,
      "debug/policy_rejected_logps": -450.34637451171875,
      "debug/reference_chosen_logps": -443.2272033691406,
      "debug/reference_rejected_logps": -445.57098388671875,
      "epoch": 0.16483516483516483,
      "grad_norm": 6.034773405123565,
      "learning_rate": 1e-06,
      "logits/chosen": 26.464439392089844,
      "logits/rejected": 22.606828689575195,
      "logps/chosen": -441.0960998535156,
      "logps/rejected": -450.34637451171875,
      "loss": 0.4508,
      "rewards/accuracies": 0.5,
      "rewards/chosen": 0.021311109885573387,
      "rewards/margins": 0.06906504929065704,
      "rewards/rejected": -0.047753941267728806,
      "step": 15
    },
    {
      "debug/policy_chosen_logits": 23.063968658447266,
      "debug/policy_chosen_logps": -430.00677490234375,
      "debug/policy_rejected_logits": 30.106121063232422,
      "debug/policy_rejected_logps": -459.87164306640625,
      "debug/reference_chosen_logps": -435.9080810546875,
      "debug/reference_rejected_logps": -454.621337890625,
      "epoch": 0.17582417582417584,
      "grad_norm": 5.694131268125543,
      "learning_rate": 1e-06,
      "logits/chosen": 23.063968658447266,
      "logits/rejected": 30.106121063232422,
      "logps/chosen": -430.00677490234375,
      "logps/rejected": -459.87164306640625,
      "loss": 0.4059,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.05901290848851204,
      "rewards/margins": 0.11151611059904099,
      "rewards/rejected": -0.05250319838523865,
      "step": 16
    },
    {
      "debug/policy_chosen_logits": 29.839580535888672,
      "debug/policy_chosen_logps": -436.97039794921875,
      "debug/policy_rejected_logits": 28.98643684387207,
      "debug/policy_rejected_logps": -476.59881591796875,
      "debug/reference_chosen_logps": -443.8123779296875,
      "debug/reference_rejected_logps": -466.94873046875,
      "epoch": 0.18681318681318682,
      "grad_norm": 6.738937580645961,
      "learning_rate": 1e-06,
      "logits/chosen": 29.839580535888672,
      "logits/rejected": 28.98643684387207,
      "logps/chosen": -436.97039794921875,
      "logps/rejected": -476.59881591796875,
      "loss": 0.4118,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.06841972470283508,
      "rewards/margins": 0.16492080688476562,
      "rewards/rejected": -0.09650108218193054,
      "step": 17
    },
    {
      "debug/policy_chosen_logits": 29.84876823425293,
      "debug/policy_chosen_logps": -441.1398620605469,
      "debug/policy_rejected_logits": 21.603878021240234,
      "debug/policy_rejected_logps": -443.14556884765625,
      "debug/reference_chosen_logps": -448.20770263671875,
      "debug/reference_rejected_logps": -441.1372375488281,
      "epoch": 0.1978021978021978,
      "grad_norm": 5.137388346436403,
      "learning_rate": 1e-06,
      "logits/chosen": 29.84876823425293,
      "logits/rejected": 21.603878021240234,
      "logps/chosen": -441.1398620605469,
      "logps/rejected": -443.14556884765625,
      "loss": 0.4207,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.07067855447530746,
      "rewards/margins": 0.09076160192489624,
      "rewards/rejected": -0.020083043724298477,
      "step": 18
    },
    {
      "debug/policy_chosen_logits": 26.974803924560547,
      "debug/policy_chosen_logps": -453.483642578125,
      "debug/policy_rejected_logits": 26.663101196289062,
      "debug/policy_rejected_logps": -489.6366271972656,
      "debug/reference_chosen_logps": -460.9517822265625,
      "debug/reference_rejected_logps": -470.0023193359375,
      "epoch": 0.2087912087912088,
      "grad_norm": 7.17158785388267,
      "learning_rate": 1e-06,
      "logits/chosen": 26.974803924560547,
      "logits/rejected": 26.663101196289062,
      "logps/chosen": -453.483642578125,
      "logps/rejected": -489.6366271972656,
      "loss": 0.3364,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.07468143105506897,
      "rewards/margins": 0.271024614572525,
      "rewards/rejected": -0.19634319841861725,
      "step": 19
    },
    {
      "debug/policy_chosen_logits": 25.026630401611328,
      "debug/policy_chosen_logps": -433.65960693359375,
      "debug/policy_rejected_logits": 25.77982521057129,
      "debug/policy_rejected_logps": -471.0029296875,
      "debug/reference_chosen_logps": -436.43359375,
      "debug/reference_rejected_logps": -460.9761962890625,
      "epoch": 0.21978021978021978,
      "grad_norm": 4.898911355829737,
      "learning_rate": 1e-06,
      "logits/chosen": 25.026630401611328,
      "logits/rejected": 25.77982521057129,
      "logps/chosen": -433.65960693359375,
      "logps/rejected": -471.0029296875,
      "loss": 0.3395,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.027739638462662697,
      "rewards/margins": 0.12800678610801697,
      "rewards/rejected": -0.10026714205741882,
      "step": 20
    },
    {
      "debug/policy_chosen_logits": 25.650062561035156,
      "debug/policy_chosen_logps": -435.83905029296875,
      "debug/policy_rejected_logits": 24.800756454467773,
      "debug/policy_rejected_logps": -481.5117492675781,
      "debug/reference_chosen_logps": -443.297607421875,
      "debug/reference_rejected_logps": -455.4820556640625,
      "epoch": 0.23076923076923078,
      "grad_norm": 5.689326640738984,
      "learning_rate": 1e-06,
      "logits/chosen": 25.650062561035156,
      "logits/rejected": 24.800756454467773,
      "logps/chosen": -435.83905029296875,
      "logps/rejected": -481.5117492675781,
      "loss": 0.3623,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.07458514720201492,
      "rewards/margins": 0.3348817229270935,
      "rewards/rejected": -0.2602965533733368,
      "step": 21
    },
    {
      "debug/policy_chosen_logits": 22.519716262817383,
      "debug/policy_chosen_logps": -405.88226318359375,
      "debug/policy_rejected_logits": 27.31670570373535,
      "debug/policy_rejected_logps": -470.7935791015625,
      "debug/reference_chosen_logps": -416.4732360839844,
      "debug/reference_rejected_logps": -464.93170166015625,
      "epoch": 0.24175824175824176,
      "grad_norm": 4.9841849550976765,
      "learning_rate": 1e-06,
      "logits/chosen": 22.519716262817383,
      "logits/rejected": 27.31670570373535,
      "logps/chosen": -405.88226318359375,
      "logps/rejected": -470.7935791015625,
      "loss": 0.3153,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.10590964555740356,
      "rewards/margins": 0.16452865302562714,
      "rewards/rejected": -0.058619000017642975,
      "step": 22
    },
    {
      "debug/policy_chosen_logits": 25.54558753967285,
      "debug/policy_chosen_logps": -427.68450927734375,
      "debug/policy_rejected_logits": 28.938701629638672,
      "debug/policy_rejected_logps": -456.0726623535156,
      "debug/reference_chosen_logps": -445.5054626464844,
      "debug/reference_rejected_logps": -441.0499267578125,
      "epoch": 0.25274725274725274,
      "grad_norm": 4.453741538704258,
      "learning_rate": 1e-06,
      "logits/chosen": 25.54558753967285,
      "logits/rejected": 28.938701629638672,
      "logps/chosen": -427.68450927734375,
      "logps/rejected": -456.0726623535156,
      "loss": 0.3437,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.17820952832698822,
      "rewards/margins": 0.328436940908432,
      "rewards/rejected": -0.1502273976802826,
      "step": 23
    },
    {
      "debug/policy_chosen_logits": 24.3718318939209,
      "debug/policy_chosen_logps": -407.55877685546875,
      "debug/policy_rejected_logits": 25.362825393676758,
      "debug/policy_rejected_logps": -428.3117370605469,
      "debug/reference_chosen_logps": -422.75518798828125,
      "debug/reference_rejected_logps": -419.3153381347656,
      "epoch": 0.26373626373626374,
      "grad_norm": 5.345678599763925,
      "learning_rate": 1e-06,
      "logits/chosen": 24.3718318939209,
      "logits/rejected": 25.362825393676758,
      "logps/chosen": -407.55877685546875,
      "logps/rejected": -428.3117370605469,
      "loss": 0.3313,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.1519637256860733,
      "rewards/margins": 0.24192774295806885,
      "rewards/rejected": -0.08996403217315674,
      "step": 24
    },
    {
      "debug/policy_chosen_logits": 22.36195945739746,
      "debug/policy_chosen_logps": -407.60107421875,
      "debug/policy_rejected_logits": 21.827068328857422,
      "debug/policy_rejected_logps": -436.4155578613281,
      "debug/reference_chosen_logps": -428.3585205078125,
      "debug/reference_rejected_logps": -423.5442199707031,
      "epoch": 0.27472527472527475,
      "grad_norm": 6.983328118149161,
      "learning_rate": 1e-06,
      "logits/chosen": 22.36195945739746,
      "logits/rejected": 21.827068328857422,
      "logps/chosen": -407.60107421875,
      "logps/rejected": -436.4155578613281,
      "loss": 0.3456,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.20757438242435455,
      "rewards/margins": 0.3362876772880554,
      "rewards/rejected": -0.12871329486370087,
      "step": 25
    },
    {
      "debug/policy_chosen_logits": 24.60111427307129,
      "debug/policy_chosen_logps": -424.568115234375,
      "debug/policy_rejected_logits": 27.44658088684082,
      "debug/policy_rejected_logps": -500.7226867675781,
      "debug/reference_chosen_logps": -442.6761779785156,
      "debug/reference_rejected_logps": -472.3526916503906,
      "epoch": 0.2857142857142857,
      "grad_norm": 4.621624916905314,
      "learning_rate": 1e-06,
      "logits/chosen": 24.60111427307129,
      "logits/rejected": 27.44658088684082,
      "logps/chosen": -424.568115234375,
      "logps/rejected": -500.7226867675781,
      "loss": 0.3364,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.18108057975769043,
      "rewards/margins": 0.46478039026260376,
      "rewards/rejected": -0.28369981050491333,
      "step": 26
    },
    {
      "debug/policy_chosen_logits": 24.04281234741211,
      "debug/policy_chosen_logps": -420.4295349121094,
      "debug/policy_rejected_logits": 24.854686737060547,
      "debug/policy_rejected_logps": -453.5174560546875,
      "debug/reference_chosen_logps": -440.0169677734375,
      "debug/reference_rejected_logps": -441.5909118652344,
      "epoch": 0.2967032967032967,
      "grad_norm": 4.722287417720129,
      "learning_rate": 1e-06,
      "logits/chosen": 24.04281234741211,
      "logits/rejected": 24.854686737060547,
      "logps/chosen": -420.4295349121094,
      "logps/rejected": -453.5174560546875,
      "loss": 0.3288,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.19587397575378418,
      "rewards/margins": 0.3151397109031677,
      "rewards/rejected": -0.11926574259996414,
      "step": 27
    },
    {
      "debug/policy_chosen_logits": 26.117721557617188,
      "debug/policy_chosen_logps": -437.7601318359375,
      "debug/policy_rejected_logits": 28.745031356811523,
      "debug/policy_rejected_logps": -477.95904541015625,
      "debug/reference_chosen_logps": -448.17889404296875,
      "debug/reference_rejected_logps": -449.2839050292969,
      "epoch": 0.3076923076923077,
      "grad_norm": 4.51629611753504,
      "learning_rate": 1e-06,
      "logits/chosen": 26.117721557617188,
      "logits/rejected": 28.745031356811523,
      "logps/chosen": -437.7601318359375,
      "logps/rejected": -477.95904541015625,
      "loss": 0.3783,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.10418765991926193,
      "rewards/margins": 0.3909391462802887,
      "rewards/rejected": -0.28675150871276855,
      "step": 28
    },
    {
      "debug/policy_chosen_logits": 28.643272399902344,
      "debug/policy_chosen_logps": -445.78875732421875,
      "debug/policy_rejected_logits": 30.509065628051758,
      "debug/policy_rejected_logps": -490.1319274902344,
      "debug/reference_chosen_logps": -458.77532958984375,
      "debug/reference_rejected_logps": -467.02667236328125,
      "epoch": 0.31868131868131866,
      "grad_norm": 5.726900857769083,
      "learning_rate": 1e-06,
      "logits/chosen": 28.643272399902344,
      "logits/rejected": 30.509065628051758,
      "logps/chosen": -445.78875732421875,
      "logps/rejected": -490.1319274902344,
      "loss": 0.3291,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.1298656463623047,
      "rewards/margins": 0.3609181046485901,
      "rewards/rejected": -0.2310524582862854,
      "step": 29
    },
    {
      "debug/policy_chosen_logits": 23.9057674407959,
      "debug/policy_chosen_logps": -429.0777587890625,
      "debug/policy_rejected_logits": 27.041988372802734,
      "debug/policy_rejected_logps": -489.99102783203125,
      "debug/reference_chosen_logps": -435.3116149902344,
      "debug/reference_rejected_logps": -468.8659973144531,
      "epoch": 0.32967032967032966,
      "grad_norm": 5.589091335127157,
      "learning_rate": 1e-06,
      "logits/chosen": 23.9057674407959,
      "logits/rejected": 27.041988372802734,
      "logps/chosen": -429.0777587890625,
      "logps/rejected": -489.99102783203125,
      "loss": 0.3149,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.06233878806233406,
      "rewards/margins": 0.2735891342163086,
      "rewards/rejected": -0.21125034987926483,
      "step": 30
    },
    {
      "debug/policy_chosen_logits": 20.52339744567871,
      "debug/policy_chosen_logps": -417.83367919921875,
      "debug/policy_rejected_logits": 25.475711822509766,
      "debug/policy_rejected_logps": -477.3974914550781,
      "debug/reference_chosen_logps": -429.0983581542969,
      "debug/reference_rejected_logps": -454.0728759765625,
      "epoch": 0.34065934065934067,
      "grad_norm": 5.919614811999328,
      "learning_rate": 1e-06,
      "logits/chosen": 20.52339744567871,
      "logits/rejected": 25.475711822509766,
      "logps/chosen": -417.83367919921875,
      "logps/rejected": -477.3974914550781,
      "loss": 0.2959,
      "rewards/accuracies": 0.625,
      "rewards/chosen": 0.11264674365520477,
      "rewards/margins": 0.34589269757270813,
      "rewards/rejected": -0.23324593901634216,
      "step": 31
    },
    {
      "debug/policy_chosen_logits": 21.121295928955078,
      "debug/policy_chosen_logps": -410.53643798828125,
      "debug/policy_rejected_logits": 27.618637084960938,
      "debug/policy_rejected_logps": -471.409912109375,
      "debug/reference_chosen_logps": -427.6445007324219,
      "debug/reference_rejected_logps": -452.9000244140625,
      "epoch": 0.3516483516483517,
      "grad_norm": 6.235065911009125,
      "learning_rate": 1e-06,
      "logits/chosen": 21.121295928955078,
      "logits/rejected": 27.618637084960938,
      "logps/chosen": -410.53643798828125,
      "logps/rejected": -471.409912109375,
      "loss": 0.3322,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.1710808128118515,
      "rewards/margins": 0.35617977380752563,
      "rewards/rejected": -0.18509894609451294,
      "step": 32
    },
    {
      "debug/policy_chosen_logits": 25.020137786865234,
      "debug/policy_chosen_logps": -421.2449951171875,
      "debug/policy_rejected_logits": 26.06194305419922,
      "debug/policy_rejected_logps": -455.33154296875,
      "debug/reference_chosen_logps": -445.73223876953125,
      "debug/reference_rejected_logps": -430.2513427734375,
      "epoch": 0.3626373626373626,
      "grad_norm": 4.220132361607984,
      "learning_rate": 1e-06,
      "logits/chosen": 25.020137786865234,
      "logits/rejected": 26.06194305419922,
      "logps/chosen": -421.2449951171875,
      "logps/rejected": -455.33154296875,
      "loss": 0.2825,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.24487201869487762,
      "rewards/margins": 0.4956740140914917,
      "rewards/rejected": -0.25080201029777527,
      "step": 33
    },
    {
      "debug/policy_chosen_logits": 28.49104881286621,
      "debug/policy_chosen_logps": -432.42578125,
      "debug/policy_rejected_logits": 26.045143127441406,
      "debug/policy_rejected_logps": -500.81494140625,
      "debug/reference_chosen_logps": -455.1983642578125,
      "debug/reference_rejected_logps": -481.32403564453125,
      "epoch": 0.37362637362637363,
      "grad_norm": 4.102855456579325,
      "learning_rate": 1e-06,
      "logits/chosen": 28.49104881286621,
      "logits/rejected": 26.045143127441406,
      "logps/chosen": -432.42578125,
      "logps/rejected": -500.81494140625,
      "loss": 0.2778,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.22772569954395294,
      "rewards/margins": 0.422635018825531,
      "rewards/rejected": -0.19490931928157806,
      "step": 34
    },
    {
      "debug/policy_chosen_logits": 23.481491088867188,
      "debug/policy_chosen_logps": -410.7614440917969,
      "debug/policy_rejected_logits": 26.54258155822754,
      "debug/policy_rejected_logps": -461.57659912109375,
      "debug/reference_chosen_logps": -431.06304931640625,
      "debug/reference_rejected_logps": -428.4720458984375,
      "epoch": 0.38461538461538464,
      "grad_norm": 3.930918282446488,
      "learning_rate": 1e-06,
      "logits/chosen": 23.481491088867188,
      "logits/rejected": 26.54258155822754,
      "logps/chosen": -410.7614440917969,
      "logps/rejected": -461.57659912109375,
      "loss": 0.2688,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.20301616191864014,
      "rewards/margins": 0.5340613126754761,
      "rewards/rejected": -0.33104515075683594,
      "step": 35
    },
    {
      "debug/policy_chosen_logits": 24.361486434936523,
      "debug/policy_chosen_logps": -412.67987060546875,
      "debug/policy_rejected_logits": 27.827251434326172,
      "debug/policy_rejected_logps": -452.28857421875,
      "debug/reference_chosen_logps": -430.6239318847656,
      "debug/reference_rejected_logps": -440.8873596191406,
      "epoch": 0.3956043956043956,
      "grad_norm": 5.7094820058187645,
      "learning_rate": 1e-06,
      "logits/chosen": 24.361486434936523,
      "logits/rejected": 27.827251434326172,
      "logps/chosen": -412.67987060546875,
      "logps/rejected": -452.28857421875,
      "loss": 0.3088,
      "rewards/accuracies": 0.625,
      "rewards/chosen": 0.1794406771659851,
      "rewards/margins": 0.29345306754112244,
      "rewards/rejected": -0.11401237547397614,
      "step": 36
    },
    {
      "debug/policy_chosen_logits": 26.789180755615234,
      "debug/policy_chosen_logps": -419.154296875,
      "debug/policy_rejected_logits": 24.381994247436523,
      "debug/policy_rejected_logps": -443.3766174316406,
      "debug/reference_chosen_logps": -440.4947509765625,
      "debug/reference_rejected_logps": -457.65887451171875,
      "epoch": 0.4065934065934066,
      "grad_norm": 3.962286836465319,
      "learning_rate": 1e-06,
      "logits/chosen": 26.789180755615234,
      "logits/rejected": 24.381994247436523,
      "logps/chosen": -419.154296875,
      "logps/rejected": -443.3766174316406,
      "loss": 0.2823,
      "rewards/accuracies": 0.625,
      "rewards/chosen": 0.21340444684028625,
      "rewards/margins": 0.07058170437812805,
      "rewards/rejected": 0.1428227573633194,
      "step": 37
    },
    {
      "debug/policy_chosen_logits": 24.922195434570312,
      "debug/policy_chosen_logps": -399.93096923828125,
      "debug/policy_rejected_logits": 27.783912658691406,
      "debug/policy_rejected_logps": -472.2608947753906,
      "debug/reference_chosen_logps": -427.5108642578125,
      "debug/reference_rejected_logps": -462.4599914550781,
      "epoch": 0.4175824175824176,
      "grad_norm": 4.150324748851986,
      "learning_rate": 1e-06,
      "logits/chosen": 24.922195434570312,
      "logits/rejected": 27.783912658691406,
      "logps/chosen": -399.93096923828125,
      "logps/rejected": -472.2608947753906,
      "loss": 0.2398,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.27579906582832336,
      "rewards/margins": 0.37380802631378174,
      "rewards/rejected": -0.09800895303487778,
      "step": 38
    },
    {
      "debug/policy_chosen_logits": 22.96780014038086,
      "debug/policy_chosen_logps": -420.74261474609375,
      "debug/policy_rejected_logits": 23.919336318969727,
      "debug/policy_rejected_logps": -439.653076171875,
      "debug/reference_chosen_logps": -433.999755859375,
      "debug/reference_rejected_logps": -421.92962646484375,
      "epoch": 0.42857142857142855,
      "grad_norm": 4.365666660792002,
      "learning_rate": 1e-06,
      "logits/chosen": 22.96780014038086,
      "logits/rejected": 23.919336318969727,
      "logps/chosen": -420.74261474609375,
      "logps/rejected": -439.653076171875,
      "loss": 0.2833,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.13257166743278503,
      "rewards/margins": 0.3098059296607971,
      "rewards/rejected": -0.17723426222801208,
      "step": 39
    },
    {
      "debug/policy_chosen_logits": 27.75818634033203,
      "debug/policy_chosen_logps": -455.0806884765625,
      "debug/policy_rejected_logits": 27.270263671875,
      "debug/policy_rejected_logps": -468.50927734375,
      "debug/reference_chosen_logps": -462.73138427734375,
      "debug/reference_rejected_logps": -473.1136474609375,
      "epoch": 0.43956043956043955,
      "grad_norm": 4.570887002489462,
      "learning_rate": 1e-06,
      "logits/chosen": 27.75818634033203,
      "logits/rejected": 27.270263671875,
      "logps/chosen": -455.0806884765625,
      "logps/rejected": -468.50927734375,
      "loss": 0.3436,
      "rewards/accuracies": 0.375,
      "rewards/chosen": 0.07650664448738098,
      "rewards/margins": 0.030462883412837982,
      "rewards/rejected": 0.04604377597570419,
      "step": 40
    },
    {
      "debug/policy_chosen_logits": 25.347915649414062,
      "debug/policy_chosen_logps": -406.87591552734375,
      "debug/policy_rejected_logits": 28.548446655273438,
      "debug/policy_rejected_logps": -464.4383544921875,
      "debug/reference_chosen_logps": -433.459716796875,
      "debug/reference_rejected_logps": -448.8690185546875,
      "epoch": 0.45054945054945056,
      "grad_norm": 3.677831065051595,
      "learning_rate": 1e-06,
      "logits/chosen": 25.347915649414062,
      "logits/rejected": 28.548446655273438,
      "logps/chosen": -406.87591552734375,
      "logps/rejected": -464.4383544921875,
      "loss": 0.2758,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.26583802700042725,
      "rewards/margins": 0.4215315878391266,
      "rewards/rejected": -0.15569356083869934,
      "step": 41
    },
    {
      "debug/policy_chosen_logits": 22.77235221862793,
      "debug/policy_chosen_logps": -402.58416748046875,
      "debug/policy_rejected_logits": 26.393054962158203,
      "debug/policy_rejected_logps": -471.61260986328125,
      "debug/reference_chosen_logps": -426.26690673828125,
      "debug/reference_rejected_logps": -453.03125,
      "epoch": 0.46153846153846156,
      "grad_norm": 3.808742477450644,
      "learning_rate": 1e-06,
      "logits/chosen": 22.77235221862793,
      "logits/rejected": 26.393054962158203,
      "logps/chosen": -402.58416748046875,
      "logps/rejected": -471.61260986328125,
      "loss": 0.2739,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.23682719469070435,
      "rewards/margins": 0.4226408302783966,
      "rewards/rejected": -0.18581363558769226,
      "step": 42
    },
    {
      "debug/policy_chosen_logits": 22.165273666381836,
      "debug/policy_chosen_logps": -409.06304931640625,
      "debug/policy_rejected_logits": 26.427730560302734,
      "debug/policy_rejected_logps": -448.95489501953125,
      "debug/reference_chosen_logps": -434.5325927734375,
      "debug/reference_rejected_logps": -412.31060791015625,
      "epoch": 0.4725274725274725,
      "grad_norm": 7.512415554484962,
      "learning_rate": 1e-06,
      "logits/chosen": 22.165273666381836,
      "logits/rejected": 26.427730560302734,
      "logps/chosen": -409.06304931640625,
      "logps/rejected": -448.95489501953125,
      "loss": 0.3186,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.25469571352005005,
      "rewards/margins": 0.6211386322975159,
      "rewards/rejected": -0.36644288897514343,
      "step": 43
    },
    {
      "debug/policy_chosen_logits": 25.82545280456543,
      "debug/policy_chosen_logps": -409.12420654296875,
      "debug/policy_rejected_logits": 23.266063690185547,
      "debug/policy_rejected_logps": -493.0911865234375,
      "debug/reference_chosen_logps": -437.0023193359375,
      "debug/reference_rejected_logps": -456.59796142578125,
      "epoch": 0.4835164835164835,
      "grad_norm": 5.005310078975177,
      "learning_rate": 1e-06,
      "logits/chosen": 25.82545280456543,
      "logits/rejected": 23.266063690185547,
      "logps/chosen": -409.12420654296875,
      "logps/rejected": -493.0911865234375,
      "loss": 0.2517,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.278780996799469,
      "rewards/margins": 0.6437131762504578,
      "rewards/rejected": -0.36493217945098877,
      "step": 44
    },
    {
      "debug/policy_chosen_logits": 28.779375076293945,
      "debug/policy_chosen_logps": -419.06585693359375,
      "debug/policy_rejected_logits": 25.911027908325195,
      "debug/policy_rejected_logps": -493.4896240234375,
      "debug/reference_chosen_logps": -444.6504211425781,
      "debug/reference_rejected_logps": -467.607177734375,
      "epoch": 0.4945054945054945,
      "grad_norm": 5.276153180435082,
      "learning_rate": 1e-06,
      "logits/chosen": 28.779375076293945,
      "logits/rejected": 25.911027908325195,
      "logps/chosen": -419.06585693359375,
      "logps/rejected": -493.4896240234375,
      "loss": 0.3159,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.2558458745479584,
      "rewards/margins": 0.5146701335906982,
      "rewards/rejected": -0.25882428884506226,
      "step": 45
    },
    {
      "debug/policy_chosen_logits": 27.05695152282715,
      "debug/policy_chosen_logps": -428.0582275390625,
      "debug/policy_rejected_logits": 27.517444610595703,
      "debug/policy_rejected_logps": -499.21636962890625,
      "debug/reference_chosen_logps": -452.95599365234375,
      "debug/reference_rejected_logps": -468.61138916015625,
      "epoch": 0.5054945054945055,
      "grad_norm": 4.904417802927379,
      "learning_rate": 1e-06,
      "logits/chosen": 27.05695152282715,
      "logits/rejected": 27.517444610595703,
      "logps/chosen": -428.0582275390625,
      "logps/rejected": -499.21636962890625,
      "loss": 0.242,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.2489778846502304,
      "rewards/margins": 0.5550275444984436,
      "rewards/rejected": -0.306049644947052,
      "step": 46
    },
    {
      "debug/policy_chosen_logits": 24.79572105407715,
      "debug/policy_chosen_logps": -412.3216552734375,
      "debug/policy_rejected_logits": 26.50978660583496,
      "debug/policy_rejected_logps": -466.15557861328125,
      "debug/reference_chosen_logps": -430.951416015625,
      "debug/reference_rejected_logps": -448.00433349609375,
      "epoch": 0.5164835164835165,
      "grad_norm": 5.334062642354243,
      "learning_rate": 1e-06,
      "logits/chosen": 24.79572105407715,
      "logits/rejected": 26.50978660583496,
      "logps/chosen": -412.3216552734375,
      "logps/rejected": -466.15557861328125,
      "loss": 0.3266,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.18629732728004456,
      "rewards/margins": 0.3678096830844879,
      "rewards/rejected": -0.18151238560676575,
      "step": 47
    },
    {
      "debug/policy_chosen_logits": 24.24400520324707,
      "debug/policy_chosen_logps": -392.45751953125,
      "debug/policy_rejected_logits": 27.10297393798828,
      "debug/policy_rejected_logps": -463.1430969238281,
      "debug/reference_chosen_logps": -417.45147705078125,
      "debug/reference_rejected_logps": -438.20220947265625,
      "epoch": 0.5274725274725275,
      "grad_norm": 7.066668036628892,
      "learning_rate": 1e-06,
      "logits/chosen": 24.24400520324707,
      "logits/rejected": 27.10297393798828,
      "logps/chosen": -392.45751953125,
      "logps/rejected": -463.1430969238281,
      "loss": 0.3255,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.24993950128555298,
      "rewards/margins": 0.49934816360473633,
      "rewards/rejected": -0.24940869212150574,
      "step": 48
    },
    {
      "debug/policy_chosen_logits": 22.664649963378906,
      "debug/policy_chosen_logps": -394.15032958984375,
      "debug/policy_rejected_logits": 25.70164680480957,
      "debug/policy_rejected_logps": -452.9449768066406,
      "debug/reference_chosen_logps": -416.6987609863281,
      "debug/reference_rejected_logps": -439.2945251464844,
      "epoch": 0.5384615384615384,
      "grad_norm": 5.135410370061012,
      "learning_rate": 1e-06,
      "logits/chosen": 22.664649963378906,
      "logits/rejected": 25.70164680480957,
      "logps/chosen": -394.15032958984375,
      "logps/rejected": -452.9449768066406,
      "loss": 0.2879,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.22548410296440125,
      "rewards/margins": 0.3619886040687561,
      "rewards/rejected": -0.13650450110435486,
      "step": 49
    },
    {
      "debug/policy_chosen_logits": 22.666519165039062,
      "debug/policy_chosen_logps": -396.6944885253906,
      "debug/policy_rejected_logits": 21.13740348815918,
      "debug/policy_rejected_logps": -453.7715148925781,
      "debug/reference_chosen_logps": -424.0322265625,
      "debug/reference_rejected_logps": -440.89862060546875,
      "epoch": 0.5494505494505495,
      "grad_norm": 3.9756803394109825,
      "learning_rate": 1e-06,
      "logits/chosen": 22.666519165039062,
      "logits/rejected": 21.13740348815918,
      "logps/chosen": -396.6944885253906,
      "logps/rejected": -453.7715148925781,
      "loss": 0.2293,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.27337703108787537,
      "rewards/margins": 0.40210601687431335,
      "rewards/rejected": -0.1287289708852768,
      "step": 50
    },
    {
      "debug/policy_chosen_logits": 23.19964027404785,
      "debug/policy_chosen_logps": -397.15765380859375,
      "debug/policy_rejected_logits": 24.41975212097168,
      "debug/policy_rejected_logps": -475.76104736328125,
      "debug/reference_chosen_logps": -424.8116760253906,
      "debug/reference_rejected_logps": -434.97528076171875,
      "epoch": 0.5604395604395604,
      "grad_norm": 4.600442889147922,
      "learning_rate": 1e-06,
      "logits/chosen": 23.19964027404785,
      "logits/rejected": 24.41975212097168,
      "logps/chosen": -397.15765380859375,
      "logps/rejected": -475.76104736328125,
      "loss": 0.2753,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.2765401601791382,
      "rewards/margins": 0.6843976974487305,
      "rewards/rejected": -0.4078575372695923,
      "step": 51
    },
    {
      "debug/policy_chosen_logits": 27.027416229248047,
      "debug/policy_chosen_logps": -433.012451171875,
      "debug/policy_rejected_logits": 24.729970932006836,
      "debug/policy_rejected_logps": -465.9927978515625,
      "debug/reference_chosen_logps": -459.7962951660156,
      "debug/reference_rejected_logps": -439.83599853515625,
      "epoch": 0.5714285714285714,
      "grad_norm": 3.8308230544471087,
      "learning_rate": 1e-06,
      "logits/chosen": 27.027416229248047,
      "logits/rejected": 24.729970932006836,
      "logps/chosen": -433.012451171875,
      "logps/rejected": -465.9927978515625,
      "loss": 0.2846,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.26783838868141174,
      "rewards/margins": 0.529405951499939,
      "rewards/rejected": -0.26156753301620483,
      "step": 52
    },
    {
      "debug/policy_chosen_logits": 25.013721466064453,
      "debug/policy_chosen_logps": -409.18426513671875,
      "debug/policy_rejected_logits": 29.04503059387207,
      "debug/policy_rejected_logps": -455.4191589355469,
      "debug/reference_chosen_logps": -434.70855712890625,
      "debug/reference_rejected_logps": -437.6434020996094,
      "epoch": 0.5824175824175825,
      "grad_norm": 5.361663918449101,
      "learning_rate": 1e-06,
      "logits/chosen": 25.013721466064453,
      "logits/rejected": 29.04503059387207,
      "logps/chosen": -409.18426513671875,
      "logps/rejected": -455.4191589355469,
      "loss": 0.3145,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.2552429437637329,
      "rewards/margins": 0.433000773191452,
      "rewards/rejected": -0.17775781452655792,
      "step": 53
    },
    {
      "debug/policy_chosen_logits": 26.417736053466797,
      "debug/policy_chosen_logps": -419.6263427734375,
      "debug/policy_rejected_logits": 28.78829002380371,
      "debug/policy_rejected_logps": -475.8312683105469,
      "debug/reference_chosen_logps": -440.78143310546875,
      "debug/reference_rejected_logps": -462.00067138671875,
      "epoch": 0.5934065934065934,
      "grad_norm": 4.416192021518405,
      "learning_rate": 1e-06,
      "logits/chosen": 26.417736053466797,
      "logits/rejected": 28.78829002380371,
      "logps/chosen": -419.6263427734375,
      "logps/rejected": -475.8312683105469,
      "loss": 0.3234,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.2115507870912552,
      "rewards/margins": 0.34985676407814026,
      "rewards/rejected": -0.13830597698688507,
      "step": 54
    },
    {
      "debug/policy_chosen_logits": 28.22123908996582,
      "debug/policy_chosen_logps": -429.47418212890625,
      "debug/policy_rejected_logits": 25.79034996032715,
      "debug/policy_rejected_logps": -471.1182861328125,
      "debug/reference_chosen_logps": -445.8591003417969,
      "debug/reference_rejected_logps": -455.14202880859375,
      "epoch": 0.6043956043956044,
      "grad_norm": 6.671276859718192,
      "learning_rate": 1e-06,
      "logits/chosen": 28.22123908996582,
      "logits/rejected": 25.79034996032715,
      "logps/chosen": -429.47418212890625,
      "logps/rejected": -471.1182861328125,
      "loss": 0.2846,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.16384944319725037,
      "rewards/margins": 0.32361170649528503,
      "rewards/rejected": -0.15976226329803467,
      "step": 55
    },
    {
      "debug/policy_chosen_logits": 26.506807327270508,
      "debug/policy_chosen_logps": -420.41082763671875,
      "debug/policy_rejected_logits": 27.579261779785156,
      "debug/policy_rejected_logps": -452.91162109375,
      "debug/reference_chosen_logps": -448.4418029785156,
      "debug/reference_rejected_logps": -432.2170104980469,
      "epoch": 0.6153846153846154,
      "grad_norm": 5.601125759934195,
      "learning_rate": 1e-06,
      "logits/chosen": 26.506807327270508,
      "logits/rejected": 27.579261779785156,
      "logps/chosen": -420.41082763671875,
      "logps/rejected": -452.91162109375,
      "loss": 0.2563,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.2803099453449249,
      "rewards/margins": 0.48725610971450806,
      "rewards/rejected": -0.20694613456726074,
      "step": 56
    },
    {
      "debug/policy_chosen_logits": 22.477113723754883,
      "debug/policy_chosen_logps": -410.53009033203125,
      "debug/policy_rejected_logits": 24.827415466308594,
      "debug/policy_rejected_logps": -463.63214111328125,
      "debug/reference_chosen_logps": -430.95086669921875,
      "debug/reference_rejected_logps": -428.8712158203125,
      "epoch": 0.6263736263736264,
      "grad_norm": 4.278326605774298,
      "learning_rate": 1e-06,
      "logits/chosen": 22.477113723754883,
      "logits/rejected": 24.827415466308594,
      "logps/chosen": -410.53009033203125,
      "logps/rejected": -463.63214111328125,
      "loss": 0.2727,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.20420792698860168,
      "rewards/margins": 0.5518174171447754,
      "rewards/rejected": -0.3476094603538513,
      "step": 57
    },
    {
      "debug/policy_chosen_logits": 24.6320858001709,
      "debug/policy_chosen_logps": -408.2926025390625,
      "debug/policy_rejected_logits": 27.864049911499023,
      "debug/policy_rejected_logps": -461.57208251953125,
      "debug/reference_chosen_logps": -426.1760559082031,
      "debug/reference_rejected_logps": -447.0111083984375,
      "epoch": 0.6373626373626373,
      "grad_norm": 4.643648875549545,
      "learning_rate": 1e-06,
      "logits/chosen": 24.6320858001709,
      "logits/rejected": 27.864049911499023,
      "logps/chosen": -408.2926025390625,
      "logps/rejected": -461.57208251953125,
      "loss": 0.2516,
      "rewards/accuracies": 0.625,
      "rewards/chosen": 0.17883440852165222,
      "rewards/margins": 0.3244439363479614,
      "rewards/rejected": -0.1456095427274704,
      "step": 58
    },
    {
      "debug/policy_chosen_logits": 20.885883331298828,
      "debug/policy_chosen_logps": -412.9990539550781,
      "debug/policy_rejected_logits": 21.050661087036133,
      "debug/policy_rejected_logps": -522.6400756835938,
      "debug/reference_chosen_logps": -440.10931396484375,
      "debug/reference_rejected_logps": -487.7521667480469,
      "epoch": 0.6483516483516484,
      "grad_norm": 4.05657059697319,
      "learning_rate": 1e-06,
      "logits/chosen": 20.885883331298828,
      "logits/rejected": 21.050661087036133,
      "logps/chosen": -412.9990539550781,
      "logps/rejected": -522.6400756835938,
      "loss": 0.2678,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.2711024880409241,
      "rewards/margins": 0.6199814081192017,
      "rewards/rejected": -0.3488789200782776,
      "step": 59
    },
    {
      "debug/policy_chosen_logits": 24.035409927368164,
      "debug/policy_chosen_logps": -399.28765869140625,
      "debug/policy_rejected_logits": 24.84585952758789,
      "debug/policy_rejected_logps": -453.340576171875,
      "debug/reference_chosen_logps": -420.63153076171875,
      "debug/reference_rejected_logps": -434.2669677734375,
      "epoch": 0.6593406593406593,
      "grad_norm": 3.784225958996139,
      "learning_rate": 1e-06,
      "logits/chosen": 24.035409927368164,
      "logits/rejected": 24.84585952758789,
      "logps/chosen": -399.28765869140625,
      "logps/rejected": -453.340576171875,
      "loss": 0.2552,
      "rewards/accuracies": 0.625,
      "rewards/chosen": 0.2134389877319336,
      "rewards/margins": 0.4041747748851776,
      "rewards/rejected": -0.19073577225208282,
      "step": 60
    },
    {
      "debug/policy_chosen_logits": 21.771154403686523,
      "debug/policy_chosen_logps": -403.99932861328125,
      "debug/policy_rejected_logits": 25.54006576538086,
      "debug/policy_rejected_logps": -431.0410461425781,
      "debug/reference_chosen_logps": -427.13372802734375,
      "debug/reference_rejected_logps": -426.09619140625,
      "epoch": 0.6703296703296703,
      "grad_norm": 4.4444525496723815,
      "learning_rate": 1e-06,
      "logits/chosen": 21.771154403686523,
      "logits/rejected": 25.54006576538086,
      "logps/chosen": -403.99932861328125,
      "logps/rejected": -431.0410461425781,
      "loss": 0.2411,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.2313441038131714,
      "rewards/margins": 0.2807927131652832,
      "rewards/rejected": -0.04944861680269241,
      "step": 61
    },
    {
      "debug/policy_chosen_logits": 21.56890869140625,
      "debug/policy_chosen_logps": -418.62939453125,
      "debug/policy_rejected_logits": 26.892208099365234,
      "debug/policy_rejected_logps": -432.6386413574219,
      "debug/reference_chosen_logps": -446.4366149902344,
      "debug/reference_rejected_logps": -437.1725769042969,
      "epoch": 0.6813186813186813,
      "grad_norm": 6.207177159902563,
      "learning_rate": 1e-06,
      "logits/chosen": 21.56890869140625,
      "logits/rejected": 26.892208099365234,
      "logps/chosen": -418.62939453125,
      "logps/rejected": -432.6386413574219,
      "loss": 0.2765,
      "rewards/accuracies": 0.5,
      "rewards/chosen": 0.27807170152664185,
      "rewards/margins": 0.23273253440856934,
      "rewards/rejected": 0.04533915966749191,
      "step": 62
    },
    {
      "debug/policy_chosen_logits": 28.32245445251465,
      "debug/policy_chosen_logps": -440.50726318359375,
      "debug/policy_rejected_logits": 25.365680694580078,
      "debug/policy_rejected_logps": -464.0654296875,
      "debug/reference_chosen_logps": -475.3460998535156,
      "debug/reference_rejected_logps": -457.081787109375,
      "epoch": 0.6923076923076923,
      "grad_norm": 5.48574215281406,
      "learning_rate": 1e-06,
      "logits/chosen": 28.32245445251465,
      "logits/rejected": 25.365680694580078,
      "logps/chosen": -440.50726318359375,
      "logps/rejected": -464.0654296875,
      "loss": 0.2435,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.34838855266571045,
      "rewards/margins": 0.4182246923446655,
      "rewards/rejected": -0.06983615458011627,
      "step": 63
    },
    {
      "debug/policy_chosen_logits": 25.090259552001953,
      "debug/policy_chosen_logps": -396.6990051269531,
      "debug/policy_rejected_logits": 27.255340576171875,
      "debug/policy_rejected_logps": -454.5365295410156,
      "debug/reference_chosen_logps": -436.3953552246094,
      "debug/reference_rejected_logps": -430.6047058105469,
      "epoch": 0.7032967032967034,
      "grad_norm": 8.801285897449953,
      "learning_rate": 1e-06,
      "logits/chosen": 25.090259552001953,
      "logits/rejected": 27.255340576171875,
      "logps/chosen": -396.6990051269531,
      "logps/rejected": -454.5365295410156,
      "loss": 0.3064,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.3969634771347046,
      "rewards/margins": 0.6362816095352173,
      "rewards/rejected": -0.23931819200515747,
      "step": 64
    },
    {
      "debug/policy_chosen_logits": 28.422582626342773,
      "debug/policy_chosen_logps": -421.810302734375,
      "debug/policy_rejected_logits": 22.969980239868164,
      "debug/policy_rejected_logps": -464.9712219238281,
      "debug/reference_chosen_logps": -445.0280456542969,
      "debug/reference_rejected_logps": -451.85345458984375,
      "epoch": 0.7142857142857143,
      "grad_norm": 3.3417383455535807,
      "learning_rate": 1e-06,
      "logits/chosen": 28.422582626342773,
      "logits/rejected": 22.969980239868164,
      "logps/chosen": -421.810302734375,
      "logps/rejected": -464.9712219238281,
      "loss": 0.216,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.2321772277355194,
      "rewards/margins": 0.36335471272468567,
      "rewards/rejected": -0.13117747008800507,
      "step": 65
    },
    {
      "debug/policy_chosen_logits": 24.602392196655273,
      "debug/policy_chosen_logps": -409.503662109375,
      "debug/policy_rejected_logits": 26.957948684692383,
      "debug/policy_rejected_logps": -518.37353515625,
      "debug/reference_chosen_logps": -443.19598388671875,
      "debug/reference_rejected_logps": -475.41815185546875,
      "epoch": 0.7252747252747253,
      "grad_norm": 7.382953809999285,
      "learning_rate": 1e-06,
      "logits/chosen": 24.602392196655273,
      "logits/rejected": 26.957948684692383,
      "logps/chosen": -409.503662109375,
      "logps/rejected": -518.37353515625,
      "loss": 0.1895,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.3369232416152954,
      "rewards/margins": 0.7664777040481567,
      "rewards/rejected": -0.42955446243286133,
      "step": 66
    },
    {
      "debug/policy_chosen_logits": 22.386083602905273,
      "debug/policy_chosen_logps": -382.5001220703125,
      "debug/policy_rejected_logits": 16.90252113342285,
      "debug/policy_rejected_logps": -475.36181640625,
      "debug/reference_chosen_logps": -412.29132080078125,
      "debug/reference_rejected_logps": -449.7041015625,
      "epoch": 0.7362637362637363,
      "grad_norm": 5.308279817037437,
      "learning_rate": 1e-06,
      "logits/chosen": 22.386083602905273,
      "logits/rejected": 16.90252113342285,
      "logps/chosen": -382.5001220703125,
      "logps/rejected": -475.36181640625,
      "loss": 0.2588,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.2979119122028351,
      "rewards/margins": 0.5544889569282532,
      "rewards/rejected": -0.2565770447254181,
      "step": 67
    },
    {
      "debug/policy_chosen_logits": 17.921504974365234,
      "debug/policy_chosen_logps": -390.6407470703125,
      "debug/policy_rejected_logits": 24.367341995239258,
      "debug/policy_rejected_logps": -484.7353515625,
      "debug/reference_chosen_logps": -416.8155822753906,
      "debug/reference_rejected_logps": -466.807861328125,
      "epoch": 0.7472527472527473,
      "grad_norm": 5.3343691933569595,
      "learning_rate": 1e-06,
      "logits/chosen": 17.921504974365234,
      "logits/rejected": 24.367341995239258,
      "logps/chosen": -390.6407470703125,
      "logps/rejected": -484.7353515625,
      "loss": 0.2477,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.2617482841014862,
      "rewards/margins": 0.441023588180542,
      "rewards/rejected": -0.17927534878253937,
      "step": 68
    },
    {
      "debug/policy_chosen_logits": 22.636571884155273,
      "debug/policy_chosen_logps": -391.5332336425781,
      "debug/policy_rejected_logits": 27.57520294189453,
      "debug/policy_rejected_logps": -491.03448486328125,
      "debug/reference_chosen_logps": -418.2889709472656,
      "debug/reference_rejected_logps": -467.211181640625,
      "epoch": 0.7582417582417582,
      "grad_norm": 4.491987560088097,
      "learning_rate": 1e-06,
      "logits/chosen": 22.636571884155273,
      "logits/rejected": 27.57520294189453,
      "logps/chosen": -391.5332336425781,
      "logps/rejected": -491.03448486328125,
      "loss": 0.2792,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.26755744218826294,
      "rewards/margins": 0.5057904124259949,
      "rewards/rejected": -0.23823297023773193,
      "step": 69
    },
    {
      "debug/policy_chosen_logits": 27.65935516357422,
      "debug/policy_chosen_logps": -403.2398986816406,
      "debug/policy_rejected_logits": 29.533405303955078,
      "debug/policy_rejected_logps": -492.2855224609375,
      "debug/reference_chosen_logps": -433.0833435058594,
      "debug/reference_rejected_logps": -444.8511962890625,
      "epoch": 0.7692307692307693,
      "grad_norm": 3.4461660659523656,
      "learning_rate": 1e-06,
      "logits/chosen": 27.65935516357422,
      "logits/rejected": 29.533405303955078,
      "logps/chosen": -403.2398986816406,
      "logps/rejected": -492.2855224609375,
      "loss": 0.2317,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.29843446612358093,
      "rewards/margins": 0.7727776169776917,
      "rewards/rejected": -0.47434312105178833,
      "step": 70
    },
    {
      "debug/policy_chosen_logits": 24.503950119018555,
      "debug/policy_chosen_logps": -446.45513916015625,
      "debug/policy_rejected_logits": 22.433277130126953,
      "debug/policy_rejected_logps": -470.9190979003906,
      "debug/reference_chosen_logps": -472.0450134277344,
      "debug/reference_rejected_logps": -448.1822814941406,
      "epoch": 0.7802197802197802,
      "grad_norm": 5.717761852601763,
      "learning_rate": 1e-06,
      "logits/chosen": 24.503950119018555,
      "logits/rejected": 22.433277130126953,
      "logps/chosen": -446.45513916015625,
      "logps/rejected": -470.9190979003906,
      "loss": 0.3119,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.2558988630771637,
      "rewards/margins": 0.4832671284675598,
      "rewards/rejected": -0.22736826539039612,
      "step": 71
    },
    {
      "debug/policy_chosen_logits": 19.840513229370117,
      "debug/policy_chosen_logps": -389.92559814453125,
      "debug/policy_rejected_logits": 24.959850311279297,
      "debug/policy_rejected_logps": -467.8755187988281,
      "debug/reference_chosen_logps": -414.42547607421875,
      "debug/reference_rejected_logps": -441.78948974609375,
      "epoch": 0.7912087912087912,
      "grad_norm": 5.324274056234465,
      "learning_rate": 1e-06,
      "logits/chosen": 19.840513229370117,
      "logits/rejected": 24.959850311279297,
      "logps/chosen": -389.92559814453125,
      "logps/rejected": -467.8755187988281,
      "loss": 0.225,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.24499861896038055,
      "rewards/margins": 0.5058590173721313,
      "rewards/rejected": -0.260860413312912,
      "step": 72
    },
    {
      "debug/policy_chosen_logits": 24.089357376098633,
      "debug/policy_chosen_logps": -417.7742614746094,
      "debug/policy_rejected_logits": 26.09589958190918,
      "debug/policy_rejected_logps": -444.3390808105469,
      "debug/reference_chosen_logps": -446.7651672363281,
      "debug/reference_rejected_logps": -417.492919921875,
      "epoch": 0.8021978021978022,
      "grad_norm": 3.7830573167149204,
      "learning_rate": 1e-06,
      "logits/chosen": 24.089357376098633,
      "logits/rejected": 26.09589958190918,
      "logps/chosen": -417.7742614746094,
      "logps/rejected": -444.3390808105469,
      "loss": 0.2977,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.2899090647697449,
      "rewards/margins": 0.5583702921867371,
      "rewards/rejected": -0.2684612572193146,
      "step": 73
    },
    {
      "debug/policy_chosen_logits": 21.799409866333008,
      "debug/policy_chosen_logps": -405.78070068359375,
      "debug/policy_rejected_logits": 26.489110946655273,
      "debug/policy_rejected_logps": -476.9659423828125,
      "debug/reference_chosen_logps": -435.2866516113281,
      "debug/reference_rejected_logps": -449.07806396484375,
      "epoch": 0.8131868131868132,
      "grad_norm": 3.635552390173908,
      "learning_rate": 1e-06,
      "logits/chosen": 21.799409866333008,
      "logits/rejected": 26.489110946655273,
      "logps/chosen": -405.78070068359375,
      "logps/rejected": -476.9659423828125,
      "loss": 0.2695,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.2950596511363983,
      "rewards/margins": 0.573938250541687,
      "rewards/rejected": -0.2788785696029663,
      "step": 74
    },
    {
      "debug/policy_chosen_logits": 21.563961029052734,
      "debug/policy_chosen_logps": -382.45452880859375,
      "debug/policy_rejected_logits": 27.974708557128906,
      "debug/policy_rejected_logps": -446.31573486328125,
      "debug/reference_chosen_logps": -403.13330078125,
      "debug/reference_rejected_logps": -445.6566467285156,
      "epoch": 0.8241758241758241,
      "grad_norm": 6.518551807981199,
      "learning_rate": 1e-06,
      "logits/chosen": 21.563961029052734,
      "logits/rejected": 27.974708557128906,
      "logps/chosen": -382.45452880859375,
      "logps/rejected": -446.31573486328125,
      "loss": 0.339,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.2067876011133194,
      "rewards/margins": 0.2133782058954239,
      "rewards/rejected": -0.006590619683265686,
      "step": 75
    },
    {
      "debug/policy_chosen_logits": 26.57479476928711,
      "debug/policy_chosen_logps": -421.49554443359375,
      "debug/policy_rejected_logits": 25.88642692565918,
      "debug/policy_rejected_logps": -463.5325622558594,
      "debug/reference_chosen_logps": -445.17974853515625,
      "debug/reference_rejected_logps": -448.434814453125,
      "epoch": 0.8351648351648352,
      "grad_norm": 4.642048379046934,
      "learning_rate": 1e-06,
      "logits/chosen": 26.57479476928711,
      "logits/rejected": 25.88642692565918,
      "logps/chosen": -421.49554443359375,
      "logps/rejected": -463.5325622558594,
      "loss": 0.3014,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.23684199154376984,
      "rewards/margins": 0.3878192901611328,
      "rewards/rejected": -0.15097728371620178,
      "step": 76
    },
    {
      "debug/policy_chosen_logits": 27.36321258544922,
      "debug/policy_chosen_logps": -417.56976318359375,
      "debug/policy_rejected_logits": 27.49226951599121,
      "debug/policy_rejected_logps": -499.5318603515625,
      "debug/reference_chosen_logps": -444.1575927734375,
      "debug/reference_rejected_logps": -475.24273681640625,
      "epoch": 0.8461538461538461,
      "grad_norm": 4.237463467519612,
      "learning_rate": 1e-06,
      "logits/chosen": 27.36321258544922,
      "logits/rejected": 27.49226951599121,
      "logps/chosen": -417.56976318359375,
      "logps/rejected": -499.5318603515625,
      "loss": 0.2199,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.2658780515193939,
      "rewards/margins": 0.5087695121765137,
      "rewards/rejected": -0.24289149045944214,
      "step": 77
    },
    {
      "debug/policy_chosen_logits": 24.683202743530273,
      "debug/policy_chosen_logps": -409.4302673339844,
      "debug/policy_rejected_logits": 23.952590942382812,
      "debug/policy_rejected_logps": -463.2736511230469,
      "debug/reference_chosen_logps": -432.84686279296875,
      "debug/reference_rejected_logps": -437.69964599609375,
      "epoch": 0.8571428571428571,
      "grad_norm": 4.166956738488574,
      "learning_rate": 1e-06,
      "logits/chosen": 24.683202743530273,
      "logits/rejected": 23.952590942382812,
      "logps/chosen": -409.4302673339844,
      "logps/rejected": -463.2736511230469,
      "loss": 0.2483,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.2341659516096115,
      "rewards/margins": 0.48990583419799805,
      "rewards/rejected": -0.25573989748954773,
      "step": 78
    },
    {
      "debug/policy_chosen_logits": 26.540374755859375,
      "debug/policy_chosen_logps": -422.86810302734375,
      "debug/policy_rejected_logits": 30.67388916015625,
      "debug/policy_rejected_logps": -492.8260192871094,
      "debug/reference_chosen_logps": -438.4207763671875,
      "debug/reference_rejected_logps": -458.2353515625,
      "epoch": 0.8681318681318682,
      "grad_norm": 5.889314624750013,
      "learning_rate": 1e-06,
      "logits/chosen": 26.540374755859375,
      "logits/rejected": 30.67388916015625,
      "logps/chosen": -422.86810302734375,
      "logps/rejected": -492.8260192871094,
      "loss": 0.264,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.15552687644958496,
      "rewards/margins": 0.5014333724975586,
      "rewards/rejected": -0.34590649604797363,
      "step": 79
    },
    {
      "debug/policy_chosen_logits": 24.57747459411621,
      "debug/policy_chosen_logps": -390.2710266113281,
      "debug/policy_rejected_logits": 27.390230178833008,
      "debug/policy_rejected_logps": -489.9441223144531,
      "debug/reference_chosen_logps": -416.788818359375,
      "debug/reference_rejected_logps": -440.2818298339844,
      "epoch": 0.8791208791208791,
      "grad_norm": 4.745834840744495,
      "learning_rate": 1e-06,
      "logits/chosen": 24.57747459411621,
      "logits/rejected": 27.390230178833008,
      "logps/chosen": -390.2710266113281,
      "logps/rejected": -489.9441223144531,
      "loss": 0.2894,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.2651780843734741,
      "rewards/margins": 0.76180100440979,
      "rewards/rejected": -0.49662283062934875,
      "step": 80
    },
    {
      "debug/policy_chosen_logits": 25.34842300415039,
      "debug/policy_chosen_logps": -390.0006103515625,
      "debug/policy_rejected_logits": 27.055194854736328,
      "debug/policy_rejected_logps": -460.668701171875,
      "debug/reference_chosen_logps": -408.5838623046875,
      "debug/reference_rejected_logps": -444.5193786621094,
      "epoch": 0.8901098901098901,
      "grad_norm": 4.590683530349925,
      "learning_rate": 1e-06,
      "logits/chosen": 25.34842300415039,
      "logits/rejected": 27.055194854736328,
      "logps/chosen": -390.0006103515625,
      "logps/rejected": -460.668701171875,
      "loss": 0.2902,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.1858326643705368,
      "rewards/margins": 0.3473258316516876,
      "rewards/rejected": -0.16149315237998962,
      "step": 81
    },
    {
      "debug/policy_chosen_logits": 27.74835968017578,
      "debug/policy_chosen_logps": -413.76629638671875,
      "debug/policy_rejected_logits": 27.351459503173828,
      "debug/policy_rejected_logps": -438.90216064453125,
      "debug/reference_chosen_logps": -443.37615966796875,
      "debug/reference_rejected_logps": -417.8377380371094,
      "epoch": 0.9010989010989011,
      "grad_norm": 4.49887483206356,
      "learning_rate": 1e-06,
      "logits/chosen": 27.74835968017578,
      "logits/rejected": 27.351459503173828,
      "logps/chosen": -413.76629638671875,
      "logps/rejected": -438.90216064453125,
      "loss": 0.252,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.29609841108322144,
      "rewards/margins": 0.5067427754402161,
      "rewards/rejected": -0.21064436435699463,
      "step": 82
    },
    {
      "debug/policy_chosen_logits": 26.60724639892578,
      "debug/policy_chosen_logps": -380.8109130859375,
      "debug/policy_rejected_logits": 30.683988571166992,
      "debug/policy_rejected_logps": -487.4014892578125,
      "debug/reference_chosen_logps": -419.3782958984375,
      "debug/reference_rejected_logps": -462.61505126953125,
      "epoch": 0.9120879120879121,
      "grad_norm": 4.452776489719901,
      "learning_rate": 1e-06,
      "logits/chosen": 26.60724639892578,
      "logits/rejected": 30.683988571166992,
      "logps/chosen": -380.8109130859375,
      "logps/rejected": -487.4014892578125,
      "loss": 0.281,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.38567405939102173,
      "rewards/margins": 0.6335387825965881,
      "rewards/rejected": -0.2478647232055664,
      "step": 83
    },
    {
      "debug/policy_chosen_logits": 23.03528594970703,
      "debug/policy_chosen_logps": -416.22314453125,
      "debug/policy_rejected_logits": 23.581472396850586,
      "debug/policy_rejected_logps": -434.1588439941406,
      "debug/reference_chosen_logps": -440.158935546875,
      "debug/reference_rejected_logps": -421.5128173828125,
      "epoch": 0.9230769230769231,
      "grad_norm": 4.971404649364657,
      "learning_rate": 1e-06,
      "logits/chosen": 23.03528594970703,
      "logits/rejected": 23.581472396850586,
      "logps/chosen": -416.22314453125,
      "logps/rejected": -434.1588439941406,
      "loss": 0.2359,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.23935794830322266,
      "rewards/margins": 0.36581796407699585,
      "rewards/rejected": -0.1264600157737732,
      "step": 84
    },
    {
      "debug/policy_chosen_logits": 26.0145320892334,
      "debug/policy_chosen_logps": -407.3331298828125,
      "debug/policy_rejected_logits": 22.86039924621582,
      "debug/policy_rejected_logps": -455.66888427734375,
      "debug/reference_chosen_logps": -429.9805908203125,
      "debug/reference_rejected_logps": -431.9481201171875,
      "epoch": 0.9340659340659341,
      "grad_norm": 4.560309498305691,
      "learning_rate": 1e-06,
      "logits/chosen": 26.0145320892334,
      "logits/rejected": 22.86039924621582,
      "logps/chosen": -407.3331298828125,
      "logps/rejected": -455.66888427734375,
      "loss": 0.2926,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.22647437453269958,
      "rewards/margins": 0.46368181705474854,
      "rewards/rejected": -0.23720744252204895,
      "step": 85
    },
    {
      "debug/policy_chosen_logits": 21.526865005493164,
      "debug/policy_chosen_logps": -391.841552734375,
      "debug/policy_rejected_logits": 25.782846450805664,
      "debug/policy_rejected_logps": -485.3880310058594,
      "debug/reference_chosen_logps": -410.5163269042969,
      "debug/reference_rejected_logps": -450.18896484375,
      "epoch": 0.945054945054945,
      "grad_norm": 3.115000092835439,
      "learning_rate": 1e-06,
      "logits/chosen": 21.526865005493164,
      "logits/rejected": 25.782846450805664,
      "logps/chosen": -391.841552734375,
      "logps/rejected": -485.3880310058594,
      "loss": 0.227,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.1867476999759674,
      "rewards/margins": 0.5387383699417114,
      "rewards/rejected": -0.35199064016342163,
      "step": 86
    },
    {
      "debug/policy_chosen_logits": 24.497520446777344,
      "debug/policy_chosen_logps": -430.42303466796875,
      "debug/policy_rejected_logits": 26.349315643310547,
      "debug/policy_rejected_logps": -465.03375244140625,
      "debug/reference_chosen_logps": -443.20037841796875,
      "debug/reference_rejected_logps": -455.8772888183594,
      "epoch": 0.9560439560439561,
      "grad_norm": 4.513341277064094,
      "learning_rate": 1e-06,
      "logits/chosen": 24.497520446777344,
      "logits/rejected": 26.349315643310547,
      "logps/chosen": -430.42303466796875,
      "logps/rejected": -465.03375244140625,
      "loss": 0.2854,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.12777358293533325,
      "rewards/margins": 0.2193382978439331,
      "rewards/rejected": -0.09156470745801926,
      "step": 87
    },
    {
      "debug/policy_chosen_logits": 18.35858917236328,
      "debug/policy_chosen_logps": -389.84100341796875,
      "debug/policy_rejected_logits": 21.731107711791992,
      "debug/policy_rejected_logps": -423.7160339355469,
      "debug/reference_chosen_logps": -406.00408935546875,
      "debug/reference_rejected_logps": -423.71820068359375,
      "epoch": 0.967032967032967,
      "grad_norm": 4.8398310648982354,
      "learning_rate": 1e-06,
      "logits/chosen": 18.35858917236328,
      "logits/rejected": 21.731107711791992,
      "logps/chosen": -389.84100341796875,
      "logps/rejected": -423.7160339355469,
      "loss": 0.2595,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.16163063049316406,
      "rewards/margins": 0.16160908341407776,
      "rewards/rejected": 2.155173569917679e-05,
      "step": 88
    },
    {
      "debug/policy_chosen_logits": 23.2659969329834,
      "debug/policy_chosen_logps": -423.4161376953125,
      "debug/policy_rejected_logits": 28.146160125732422,
      "debug/policy_rejected_logps": -473.67059326171875,
      "debug/reference_chosen_logps": -441.9681091308594,
      "debug/reference_rejected_logps": -431.1749267578125,
      "epoch": 0.978021978021978,
      "grad_norm": 6.467539115961516,
      "learning_rate": 1e-06,
      "logits/chosen": 23.2659969329834,
      "logits/rejected": 28.146160125732422,
      "logps/chosen": -423.4161376953125,
      "logps/rejected": -473.67059326171875,
      "loss": 0.2244,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.1855195164680481,
      "rewards/margins": 0.610476553440094,
      "rewards/rejected": -0.4249570369720459,
      "step": 89
    },
    {
      "debug/policy_chosen_logits": 27.019420623779297,
      "debug/policy_chosen_logps": -413.4447021484375,
      "debug/policy_rejected_logits": 31.003211975097656,
      "debug/policy_rejected_logps": -500.8101501464844,
      "debug/reference_chosen_logps": -434.320068359375,
      "debug/reference_rejected_logps": -467.6002502441406,
      "epoch": 0.989010989010989,
      "grad_norm": 4.1029333690791745,
      "learning_rate": 1e-06,
      "logits/chosen": 27.019420623779297,
      "logits/rejected": 31.003211975097656,
      "logps/chosen": -413.4447021484375,
      "logps/rejected": -500.8101501464844,
      "loss": 0.2992,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.20875364542007446,
      "rewards/margins": 0.5408526062965393,
      "rewards/rejected": -0.33209896087646484,
      "step": 90
    },
    {
      "debug/policy_chosen_logits": 29.092975616455078,
      "debug/policy_chosen_logps": -404.4205017089844,
      "debug/policy_rejected_logits": 23.611309051513672,
      "debug/policy_rejected_logps": -429.15985107421875,
      "debug/reference_chosen_logps": -424.5293884277344,
      "debug/reference_rejected_logps": -418.74169921875,
      "epoch": 1.0,
      "grad_norm": 5.419471225413546,
      "learning_rate": 1e-06,
      "logits/chosen": 29.092975616455078,
      "logits/rejected": 23.611309051513672,
      "logps/chosen": -404.4205017089844,
      "logps/rejected": -429.15985107421875,
      "loss": 0.2937,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.20108896493911743,
      "rewards/margins": 0.30527064204216003,
      "rewards/rejected": -0.10418166220188141,
      "step": 91
    },
    {
      "epoch": 1.0,
      "step": 91,
      "total_flos": 0.0,
      "train_loss": 0.3198396195094664,
      "train_runtime": 711.7161,
      "train_samples_per_second": 8.179,
      "train_steps_per_second": 0.128
    }
  ],
  "logging_steps": 1,
  "max_steps": 91,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}