File size: 40,694 Bytes
52c5948
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 100,
  "global_step": 47,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "debug/policy_chosen_logits": 31.700279235839844,
      "debug/policy_chosen_logps": -434.26495361328125,
      "debug/policy_rejected_logits": 33.99253845214844,
      "debug/policy_rejected_logps": -441.9063720703125,
      "debug/reference_chosen_logps": -434.26495361328125,
      "debug/reference_rejected_logps": -441.9063720703125,
      "epoch": 0.02127659574468085,
      "grad_norm": 5.407328059506411,
      "learning_rate": 1e-06,
      "logits/chosen": 31.700279235839844,
      "logits/rejected": 33.99253845214844,
      "logps/chosen": -434.26495361328125,
      "logps/rejected": -441.9063720703125,
      "loss": 0.5,
      "rewards/accuracies": 0.0,
      "rewards/chosen": 0.0,
      "rewards/margins": 0.0,
      "rewards/rejected": 0.0,
      "step": 1
    },
    {
      "debug/policy_chosen_logits": 27.489120483398438,
      "debug/policy_chosen_logps": -410.28472900390625,
      "debug/policy_rejected_logits": 31.382970809936523,
      "debug/policy_rejected_logps": -435.17218017578125,
      "debug/reference_chosen_logps": -410.96771240234375,
      "debug/reference_rejected_logps": -436.0491638183594,
      "epoch": 0.0425531914893617,
      "grad_norm": 4.951759612240564,
      "learning_rate": 1e-06,
      "logits/chosen": 27.489120483398438,
      "logits/rejected": 31.382970809936523,
      "logps/chosen": -410.28472900390625,
      "logps/rejected": -435.17218017578125,
      "loss": 0.5009,
      "rewards/accuracies": 0.625,
      "rewards/chosen": 0.006829871796071529,
      "rewards/margins": -0.001939887646585703,
      "rewards/rejected": 0.00876975990831852,
      "step": 2
    },
    {
      "debug/policy_chosen_logits": 31.120014190673828,
      "debug/policy_chosen_logps": -401.9127197265625,
      "debug/policy_rejected_logits": 33.329689025878906,
      "debug/policy_rejected_logps": -424.90576171875,
      "debug/reference_chosen_logps": -402.24658203125,
      "debug/reference_rejected_logps": -424.7574462890625,
      "epoch": 0.06382978723404255,
      "grad_norm": 5.411851250945231,
      "learning_rate": 1e-06,
      "logits/chosen": 31.120014190673828,
      "logits/rejected": 33.329689025878906,
      "logps/chosen": -401.9127197265625,
      "logps/rejected": -424.90576171875,
      "loss": 0.5007,
      "rewards/accuracies": 0.5,
      "rewards/chosen": 0.003338394220918417,
      "rewards/margins": 0.004821510519832373,
      "rewards/rejected": -0.001483116764575243,
      "step": 3
    },
    {
      "debug/policy_chosen_logits": 30.066335678100586,
      "debug/policy_chosen_logps": -403.6931457519531,
      "debug/policy_rejected_logits": 33.15522384643555,
      "debug/policy_rejected_logps": -436.77801513671875,
      "debug/reference_chosen_logps": -403.52996826171875,
      "debug/reference_rejected_logps": -436.29296875,
      "epoch": 0.0851063829787234,
      "grad_norm": 6.440647306952527,
      "learning_rate": 1e-06,
      "logits/chosen": 30.066335678100586,
      "logits/rejected": 33.15522384643555,
      "logps/chosen": -403.6931457519531,
      "logps/rejected": -436.77801513671875,
      "loss": 0.4993,
      "rewards/accuracies": 0.75,
      "rewards/chosen": -0.0016318517737090588,
      "rewards/margins": 0.003218421945348382,
      "rewards/rejected": -0.00485027302056551,
      "step": 4
    },
    {
      "debug/policy_chosen_logits": 28.028703689575195,
      "debug/policy_chosen_logps": -391.64715576171875,
      "debug/policy_rejected_logits": 30.672496795654297,
      "debug/policy_rejected_logps": -420.91143798828125,
      "debug/reference_chosen_logps": -392.4825134277344,
      "debug/reference_rejected_logps": -422.0054931640625,
      "epoch": 0.10638297872340426,
      "grad_norm": 5.433518269383661,
      "learning_rate": 1e-06,
      "logits/chosen": 28.028703689575195,
      "logits/rejected": 30.672496795654297,
      "logps/chosen": -391.64715576171875,
      "logps/rejected": -420.91143798828125,
      "loss": 0.4987,
      "rewards/accuracies": 0.25,
      "rewards/chosen": 0.008353347890079021,
      "rewards/margins": -0.0025872797705233097,
      "rewards/rejected": 0.010940628126263618,
      "step": 5
    },
    {
      "debug/policy_chosen_logits": 26.0015811920166,
      "debug/policy_chosen_logps": -403.2935791015625,
      "debug/policy_rejected_logits": 25.2414608001709,
      "debug/policy_rejected_logps": -407.3106994628906,
      "debug/reference_chosen_logps": -402.87139892578125,
      "debug/reference_rejected_logps": -406.1798095703125,
      "epoch": 0.1276595744680851,
      "grad_norm": 5.048275333461177,
      "learning_rate": 1e-06,
      "logits/chosen": 26.0015811920166,
      "logits/rejected": 25.2414608001709,
      "logps/chosen": -403.2935791015625,
      "logps/rejected": -407.3106994628906,
      "loss": 0.496,
      "rewards/accuracies": 0.5,
      "rewards/chosen": -0.0042218780145049095,
      "rewards/margins": 0.007086906582117081,
      "rewards/rejected": -0.01130878459662199,
      "step": 6
    },
    {
      "debug/policy_chosen_logits": 27.019393920898438,
      "debug/policy_chosen_logps": -424.65643310546875,
      "debug/policy_rejected_logits": 29.134994506835938,
      "debug/policy_rejected_logps": -420.3319396972656,
      "debug/reference_chosen_logps": -423.5784912109375,
      "debug/reference_rejected_logps": -420.0720520019531,
      "epoch": 0.14893617021276595,
      "grad_norm": 6.092801689841109,
      "learning_rate": 1e-06,
      "logits/chosen": 27.019393920898438,
      "logits/rejected": 29.134994506835938,
      "logps/chosen": -424.65643310546875,
      "logps/rejected": -420.3319396972656,
      "loss": 0.499,
      "rewards/accuracies": 0.125,
      "rewards/chosen": -0.010779608972370625,
      "rewards/margins": -0.008180923759937286,
      "rewards/rejected": -0.0025986863765865564,
      "step": 7
    },
    {
      "debug/policy_chosen_logits": 25.41461944580078,
      "debug/policy_chosen_logps": -420.9305419921875,
      "debug/policy_rejected_logits": 25.522966384887695,
      "debug/policy_rejected_logps": -427.20623779296875,
      "debug/reference_chosen_logps": -420.0583190917969,
      "debug/reference_rejected_logps": -426.08453369140625,
      "epoch": 0.1702127659574468,
      "grad_norm": 5.248588154856571,
      "learning_rate": 1e-06,
      "logits/chosen": 25.41461944580078,
      "logits/rejected": 25.522966384887695,
      "logps/chosen": -420.9305419921875,
      "logps/rejected": -427.20623779296875,
      "loss": 0.4975,
      "rewards/accuracies": 0.5,
      "rewards/chosen": -0.00872222799807787,
      "rewards/margins": 0.0024948506616055965,
      "rewards/rejected": -0.011217079125344753,
      "step": 8
    },
    {
      "debug/policy_chosen_logits": 27.28373146057129,
      "debug/policy_chosen_logps": -413.857177734375,
      "debug/policy_rejected_logits": 29.01516342163086,
      "debug/policy_rejected_logps": -429.70623779296875,
      "debug/reference_chosen_logps": -413.364501953125,
      "debug/reference_rejected_logps": -429.416259765625,
      "epoch": 0.19148936170212766,
      "grad_norm": 5.836663309727503,
      "learning_rate": 1e-06,
      "logits/chosen": 27.28373146057129,
      "logits/rejected": 29.01516342163086,
      "logps/chosen": -413.857177734375,
      "logps/rejected": -429.70623779296875,
      "loss": 0.4954,
      "rewards/accuracies": 0.375,
      "rewards/chosen": -0.004926986526697874,
      "rewards/margins": -0.002027016133069992,
      "rewards/rejected": -0.002899970393627882,
      "step": 9
    },
    {
      "debug/policy_chosen_logits": 29.55730628967285,
      "debug/policy_chosen_logps": -418.648193359375,
      "debug/policy_rejected_logits": 30.004676818847656,
      "debug/policy_rejected_logps": -430.01788330078125,
      "debug/reference_chosen_logps": -418.4599609375,
      "debug/reference_rejected_logps": -429.8154296875,
      "epoch": 0.2127659574468085,
      "grad_norm": 4.978725766808406,
      "learning_rate": 1e-06,
      "logits/chosen": 29.55730628967285,
      "logits/rejected": 30.004676818847656,
      "logps/chosen": -418.648193359375,
      "logps/rejected": -430.01788330078125,
      "loss": 0.4991,
      "rewards/accuracies": 0.5,
      "rewards/chosen": -0.0018823242280632257,
      "rewards/margins": 0.00014217384159564972,
      "rewards/rejected": -0.002024497603997588,
      "step": 10
    },
    {
      "debug/policy_chosen_logits": 33.0296516418457,
      "debug/policy_chosen_logps": -434.47308349609375,
      "debug/policy_rejected_logits": 31.160263061523438,
      "debug/policy_rejected_logps": -406.6353759765625,
      "debug/reference_chosen_logps": -433.95892333984375,
      "debug/reference_rejected_logps": -405.2964782714844,
      "epoch": 0.23404255319148937,
      "grad_norm": 5.822177618834045,
      "learning_rate": 1e-06,
      "logits/chosen": 33.0296516418457,
      "logits/rejected": 31.160263061523438,
      "logps/chosen": -434.47308349609375,
      "logps/rejected": -406.6353759765625,
      "loss": 0.493,
      "rewards/accuracies": 0.5,
      "rewards/chosen": -0.005142059177160263,
      "rewards/margins": 0.008247108198702335,
      "rewards/rejected": -0.013389168307185173,
      "step": 11
    },
    {
      "debug/policy_chosen_logits": 30.720827102661133,
      "debug/policy_chosen_logps": -455.06597900390625,
      "debug/policy_rejected_logits": 32.45933151245117,
      "debug/policy_rejected_logps": -462.2677307128906,
      "debug/reference_chosen_logps": -454.6126403808594,
      "debug/reference_rejected_logps": -459.7181396484375,
      "epoch": 0.2553191489361702,
      "grad_norm": 5.239137130887116,
      "learning_rate": 1e-06,
      "logits/chosen": 30.720827102661133,
      "logits/rejected": 32.45933151245117,
      "logps/chosen": -455.06597900390625,
      "logps/rejected": -462.2677307128906,
      "loss": 0.4991,
      "rewards/accuracies": 0.75,
      "rewards/chosen": -0.00453338585793972,
      "rewards/margins": 0.020962638780474663,
      "rewards/rejected": -0.025496024638414383,
      "step": 12
    },
    {
      "debug/policy_chosen_logits": 30.186174392700195,
      "debug/policy_chosen_logps": -412.29742431640625,
      "debug/policy_rejected_logits": 28.243711471557617,
      "debug/policy_rejected_logps": -426.9504089355469,
      "debug/reference_chosen_logps": -411.92120361328125,
      "debug/reference_rejected_logps": -425.7698974609375,
      "epoch": 0.2765957446808511,
      "grad_norm": 5.6983956081800855,
      "learning_rate": 1e-06,
      "logits/chosen": 30.186174392700195,
      "logits/rejected": 28.243711471557617,
      "logps/chosen": -412.29742431640625,
      "logps/rejected": -426.9504089355469,
      "loss": 0.4955,
      "rewards/accuracies": 0.75,
      "rewards/chosen": -0.003762359730899334,
      "rewards/margins": 0.008042870089411736,
      "rewards/rejected": -0.011805228888988495,
      "step": 13
    },
    {
      "debug/policy_chosen_logits": 29.8179931640625,
      "debug/policy_chosen_logps": -402.04205322265625,
      "debug/policy_rejected_logits": 27.887521743774414,
      "debug/policy_rejected_logps": -406.5090637207031,
      "debug/reference_chosen_logps": -402.81463623046875,
      "debug/reference_rejected_logps": -406.35760498046875,
      "epoch": 0.2978723404255319,
      "grad_norm": 5.185829515819964,
      "learning_rate": 1e-06,
      "logits/chosen": 29.8179931640625,
      "logits/rejected": 27.887521743774414,
      "logps/chosen": -402.04205322265625,
      "logps/rejected": -406.5090637207031,
      "loss": 0.4892,
      "rewards/accuracies": 0.625,
      "rewards/chosen": 0.007725906558334827,
      "rewards/margins": 0.009240342304110527,
      "rewards/rejected": -0.0015144352801144123,
      "step": 14
    },
    {
      "debug/policy_chosen_logits": 30.058448791503906,
      "debug/policy_chosen_logps": -412.29827880859375,
      "debug/policy_rejected_logits": 29.466854095458984,
      "debug/policy_rejected_logps": -412.73504638671875,
      "debug/reference_chosen_logps": -411.7734680175781,
      "debug/reference_rejected_logps": -413.09912109375,
      "epoch": 0.3191489361702128,
      "grad_norm": 5.147098230721813,
      "learning_rate": 1e-06,
      "logits/chosen": 30.058448791503906,
      "logits/rejected": 29.466854095458984,
      "logps/chosen": -412.29827880859375,
      "logps/rejected": -412.73504638671875,
      "loss": 0.4893,
      "rewards/accuracies": 0.25,
      "rewards/chosen": -0.005248222034424543,
      "rewards/margins": -0.008889121934771538,
      "rewards/rejected": 0.0036408999003469944,
      "step": 15
    },
    {
      "debug/policy_chosen_logits": 26.801280975341797,
      "debug/policy_chosen_logps": -453.10833740234375,
      "debug/policy_rejected_logits": 28.296146392822266,
      "debug/policy_rejected_logps": -433.0950927734375,
      "debug/reference_chosen_logps": -453.86102294921875,
      "debug/reference_rejected_logps": -432.42510986328125,
      "epoch": 0.3404255319148936,
      "grad_norm": 5.3650227932794765,
      "learning_rate": 1e-06,
      "logits/chosen": 26.801280975341797,
      "logits/rejected": 28.296146392822266,
      "logps/chosen": -453.10833740234375,
      "logps/rejected": -433.0950927734375,
      "loss": 0.4907,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.007526512257754803,
      "rewards/margins": 0.014225959777832031,
      "rewards/rejected": -0.0066994475200772285,
      "step": 16
    },
    {
      "debug/policy_chosen_logits": 25.992467880249023,
      "debug/policy_chosen_logps": -436.1377258300781,
      "debug/policy_rejected_logits": 27.410860061645508,
      "debug/policy_rejected_logps": -426.43035888671875,
      "debug/reference_chosen_logps": -434.6832275390625,
      "debug/reference_rejected_logps": -424.5072021484375,
      "epoch": 0.3617021276595745,
      "grad_norm": 5.048216336945854,
      "learning_rate": 1e-06,
      "logits/chosen": 25.992467880249023,
      "logits/rejected": 27.410860061645508,
      "logps/chosen": -436.1377258300781,
      "logps/rejected": -426.43035888671875,
      "loss": 0.494,
      "rewards/accuracies": 0.625,
      "rewards/chosen": -0.014545059762895107,
      "rewards/margins": 0.004686659201979637,
      "rewards/rejected": -0.01923171989619732,
      "step": 17
    },
    {
      "debug/policy_chosen_logits": 27.924072265625,
      "debug/policy_chosen_logps": -456.6978759765625,
      "debug/policy_rejected_logits": 27.263843536376953,
      "debug/policy_rejected_logps": -411.67791748046875,
      "debug/reference_chosen_logps": -455.6437683105469,
      "debug/reference_rejected_logps": -408.3628234863281,
      "epoch": 0.3829787234042553,
      "grad_norm": 4.959644897985259,
      "learning_rate": 1e-06,
      "logits/chosen": 27.924072265625,
      "logits/rejected": 27.263843536376953,
      "logps/chosen": -456.6978759765625,
      "logps/rejected": -411.67791748046875,
      "loss": 0.4872,
      "rewards/accuracies": 0.75,
      "rewards/chosen": -0.01054123044013977,
      "rewards/margins": 0.022609787061810493,
      "rewards/rejected": -0.033151015639305115,
      "step": 18
    },
    {
      "debug/policy_chosen_logits": 30.296974182128906,
      "debug/policy_chosen_logps": -407.5791320800781,
      "debug/policy_rejected_logits": 29.760583877563477,
      "debug/policy_rejected_logps": -417.291748046875,
      "debug/reference_chosen_logps": -410.10662841796875,
      "debug/reference_rejected_logps": -418.6151123046875,
      "epoch": 0.40425531914893614,
      "grad_norm": 4.809418559441442,
      "learning_rate": 1e-06,
      "logits/chosen": 30.296974182128906,
      "logits/rejected": 29.760583877563477,
      "logps/chosen": -407.5791320800781,
      "logps/rejected": -417.291748046875,
      "loss": 0.4937,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.02527473494410515,
      "rewards/margins": 0.012040939182043076,
      "rewards/rejected": 0.013233794830739498,
      "step": 19
    },
    {
      "debug/policy_chosen_logits": 30.575592041015625,
      "debug/policy_chosen_logps": -413.82574462890625,
      "debug/policy_rejected_logits": 32.98490905761719,
      "debug/policy_rejected_logps": -443.43548583984375,
      "debug/reference_chosen_logps": -414.27642822265625,
      "debug/reference_rejected_logps": -441.5928649902344,
      "epoch": 0.425531914893617,
      "grad_norm": 5.055368747694493,
      "learning_rate": 1e-06,
      "logits/chosen": 30.575592041015625,
      "logits/rejected": 32.98490905761719,
      "logps/chosen": -413.82574462890625,
      "logps/rejected": -443.43548583984375,
      "loss": 0.4771,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.0045069498009979725,
      "rewards/margins": 0.02293361723423004,
      "rewards/rejected": -0.018426666036248207,
      "step": 20
    },
    {
      "debug/policy_chosen_logits": 28.962617874145508,
      "debug/policy_chosen_logps": -416.78582763671875,
      "debug/policy_rejected_logits": 31.380332946777344,
      "debug/policy_rejected_logps": -443.6494445800781,
      "debug/reference_chosen_logps": -419.51043701171875,
      "debug/reference_rejected_logps": -442.1171875,
      "epoch": 0.44680851063829785,
      "grad_norm": 5.2399641694392685,
      "learning_rate": 1e-06,
      "logits/chosen": 28.962617874145508,
      "logits/rejected": 31.380332946777344,
      "logps/chosen": -416.78582763671875,
      "logps/rejected": -443.6494445800781,
      "loss": 0.4861,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.02724616974592209,
      "rewards/margins": 0.04256858676671982,
      "rewards/rejected": -0.015322417952120304,
      "step": 21
    },
    {
      "debug/policy_chosen_logits": 33.89327621459961,
      "debug/policy_chosen_logps": -439.533203125,
      "debug/policy_rejected_logits": 32.8599853515625,
      "debug/policy_rejected_logps": -468.5189208984375,
      "debug/reference_chosen_logps": -437.9319763183594,
      "debug/reference_rejected_logps": -460.019287109375,
      "epoch": 0.46808510638297873,
      "grad_norm": 5.600180944400873,
      "learning_rate": 1e-06,
      "logits/chosen": 33.89327621459961,
      "logits/rejected": 32.8599853515625,
      "logps/chosen": -439.533203125,
      "logps/rejected": -468.5189208984375,
      "loss": 0.4779,
      "rewards/accuracies": 0.75,
      "rewards/chosen": -0.016012268140912056,
      "rewards/margins": 0.06898414343595505,
      "rewards/rejected": -0.08499641716480255,
      "step": 22
    },
    {
      "debug/policy_chosen_logits": 30.022546768188477,
      "debug/policy_chosen_logps": -448.22540283203125,
      "debug/policy_rejected_logits": 30.50183868408203,
      "debug/policy_rejected_logps": -417.8683776855469,
      "debug/reference_chosen_logps": -448.73858642578125,
      "debug/reference_rejected_logps": -417.9635925292969,
      "epoch": 0.48936170212765956,
      "grad_norm": 5.053771531276715,
      "learning_rate": 1e-06,
      "logits/chosen": 30.022546768188477,
      "logits/rejected": 30.50183868408203,
      "logps/chosen": -448.22540283203125,
      "logps/rejected": -417.8683776855469,
      "loss": 0.4906,
      "rewards/accuracies": 0.5,
      "rewards/chosen": 0.0051317219622433186,
      "rewards/margins": 0.004179535433650017,
      "rewards/rejected": 0.0009521869942545891,
      "step": 23
    },
    {
      "debug/policy_chosen_logits": 31.068572998046875,
      "debug/policy_chosen_logps": -408.3885192871094,
      "debug/policy_rejected_logits": 30.79738426208496,
      "debug/policy_rejected_logps": -432.73651123046875,
      "debug/reference_chosen_logps": -406.73419189453125,
      "debug/reference_rejected_logps": -432.0497131347656,
      "epoch": 0.5106382978723404,
      "grad_norm": 5.120783229464688,
      "learning_rate": 1e-06,
      "logits/chosen": 31.068572998046875,
      "logits/rejected": 30.79738426208496,
      "logps/chosen": -408.3885192871094,
      "logps/rejected": -432.73651123046875,
      "loss": 0.478,
      "rewards/accuracies": 0.375,
      "rewards/chosen": -0.016543272882699966,
      "rewards/margins": -0.009675255045294762,
      "rewards/rejected": -0.006868018768727779,
      "step": 24
    },
    {
      "debug/policy_chosen_logits": 28.878725051879883,
      "debug/policy_chosen_logps": -434.04144287109375,
      "debug/policy_rejected_logits": 30.279621124267578,
      "debug/policy_rejected_logps": -457.3016357421875,
      "debug/reference_chosen_logps": -433.21746826171875,
      "debug/reference_rejected_logps": -453.06280517578125,
      "epoch": 0.5319148936170213,
      "grad_norm": 5.297697739276052,
      "learning_rate": 1e-06,
      "logits/chosen": 28.878725051879883,
      "logits/rejected": 30.279621124267578,
      "logps/chosen": -434.04144287109375,
      "logps/rejected": -457.3016357421875,
      "loss": 0.485,
      "rewards/accuracies": 0.875,
      "rewards/chosen": -0.00823978427797556,
      "rewards/margins": 0.03414863348007202,
      "rewards/rejected": -0.042388420552015305,
      "step": 25
    },
    {
      "debug/policy_chosen_logits": 30.609947204589844,
      "debug/policy_chosen_logps": -402.73504638671875,
      "debug/policy_rejected_logits": 29.12665367126465,
      "debug/policy_rejected_logps": -411.1260986328125,
      "debug/reference_chosen_logps": -404.96392822265625,
      "debug/reference_rejected_logps": -411.49969482421875,
      "epoch": 0.5531914893617021,
      "grad_norm": 5.066192133102437,
      "learning_rate": 1e-06,
      "logits/chosen": 30.609947204589844,
      "logits/rejected": 29.12665367126465,
      "logps/chosen": -402.73504638671875,
      "logps/rejected": -411.1260986328125,
      "loss": 0.4781,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.022288817912340164,
      "rewards/margins": 0.01855243556201458,
      "rewards/rejected": 0.003736380487680435,
      "step": 26
    },
    {
      "debug/policy_chosen_logits": 26.225852966308594,
      "debug/policy_chosen_logps": -434.0633544921875,
      "debug/policy_rejected_logits": 27.547882080078125,
      "debug/policy_rejected_logps": -460.3682861328125,
      "debug/reference_chosen_logps": -434.0380859375,
      "debug/reference_rejected_logps": -457.2252197265625,
      "epoch": 0.574468085106383,
      "grad_norm": 5.199359660864606,
      "learning_rate": 1e-06,
      "logits/chosen": 26.225852966308594,
      "logits/rejected": 27.547882080078125,
      "logps/chosen": -434.0633544921875,
      "logps/rejected": -460.3682861328125,
      "loss": 0.4775,
      "rewards/accuracies": 0.625,
      "rewards/chosen": -0.0002528773620724678,
      "rewards/margins": 0.031177710741758347,
      "rewards/rejected": -0.03143058717250824,
      "step": 27
    },
    {
      "debug/policy_chosen_logits": 34.810482025146484,
      "debug/policy_chosen_logps": -432.9176025390625,
      "debug/policy_rejected_logits": 32.29673385620117,
      "debug/policy_rejected_logps": -435.6657409667969,
      "debug/reference_chosen_logps": -433.37603759765625,
      "debug/reference_rejected_logps": -432.38958740234375,
      "epoch": 0.5957446808510638,
      "grad_norm": 5.456307945174751,
      "learning_rate": 1e-06,
      "logits/chosen": 34.810482025146484,
      "logits/rejected": 32.29673385620117,
      "logps/chosen": -432.9176025390625,
      "logps/rejected": -435.6657409667969,
      "loss": 0.4635,
      "rewards/accuracies": 0.625,
      "rewards/chosen": 0.004584426060318947,
      "rewards/margins": 0.03734596073627472,
      "rewards/rejected": -0.03276153653860092,
      "step": 28
    },
    {
      "debug/policy_chosen_logits": 28.973360061645508,
      "debug/policy_chosen_logps": -432.0859375,
      "debug/policy_rejected_logits": 27.616941452026367,
      "debug/policy_rejected_logps": -419.5810546875,
      "debug/reference_chosen_logps": -432.6524658203125,
      "debug/reference_rejected_logps": -413.3448181152344,
      "epoch": 0.6170212765957447,
      "grad_norm": 4.889651674840413,
      "learning_rate": 1e-06,
      "logits/chosen": 28.973360061645508,
      "logits/rejected": 27.616941452026367,
      "logps/chosen": -432.0859375,
      "logps/rejected": -419.5810546875,
      "loss": 0.4789,
      "rewards/accuracies": 0.625,
      "rewards/chosen": 0.005664861761033535,
      "rewards/margins": 0.06802742183208466,
      "rewards/rejected": -0.062362559139728546,
      "step": 29
    },
    {
      "debug/policy_chosen_logits": 28.820457458496094,
      "debug/policy_chosen_logps": -419.7233581542969,
      "debug/policy_rejected_logits": 30.256000518798828,
      "debug/policy_rejected_logps": -422.4107971191406,
      "debug/reference_chosen_logps": -423.3680725097656,
      "debug/reference_rejected_logps": -421.3091125488281,
      "epoch": 0.6382978723404256,
      "grad_norm": 5.137030977785722,
      "learning_rate": 1e-06,
      "logits/chosen": 28.820457458496094,
      "logits/rejected": 30.256000518798828,
      "logps/chosen": -419.7233581542969,
      "logps/rejected": -422.4107971191406,
      "loss": 0.4823,
      "rewards/accuracies": 0.625,
      "rewards/chosen": 0.03644702956080437,
      "rewards/margins": 0.047463756054639816,
      "rewards/rejected": -0.011016730219125748,
      "step": 30
    },
    {
      "debug/policy_chosen_logits": 29.283926010131836,
      "debug/policy_chosen_logps": -392.496826171875,
      "debug/policy_rejected_logits": 31.77328109741211,
      "debug/policy_rejected_logps": -434.51806640625,
      "debug/reference_chosen_logps": -395.81146240234375,
      "debug/reference_rejected_logps": -434.8221435546875,
      "epoch": 0.6595744680851063,
      "grad_norm": 4.951189622094444,
      "learning_rate": 1e-06,
      "logits/chosen": 29.283926010131836,
      "logits/rejected": 31.77328109741211,
      "logps/chosen": -392.496826171875,
      "logps/rejected": -434.51806640625,
      "loss": 0.4638,
      "rewards/accuracies": 0.5,
      "rewards/chosen": 0.033146705478429794,
      "rewards/margins": 0.03010578267276287,
      "rewards/rejected": 0.003040926530957222,
      "step": 31
    },
    {
      "debug/policy_chosen_logits": 29.353422164916992,
      "debug/policy_chosen_logps": -414.32415771484375,
      "debug/policy_rejected_logits": 30.822248458862305,
      "debug/policy_rejected_logps": -430.6376037597656,
      "debug/reference_chosen_logps": -415.54888916015625,
      "debug/reference_rejected_logps": -431.1400146484375,
      "epoch": 0.6808510638297872,
      "grad_norm": 4.999575032095535,
      "learning_rate": 1e-06,
      "logits/chosen": 29.353422164916992,
      "logits/rejected": 30.822248458862305,
      "logps/chosen": -414.32415771484375,
      "logps/rejected": -430.6376037597656,
      "loss": 0.4834,
      "rewards/accuracies": 0.625,
      "rewards/chosen": 0.012247240170836449,
      "rewards/margins": 0.007223015185445547,
      "rewards/rejected": 0.00502422172576189,
      "step": 32
    },
    {
      "debug/policy_chosen_logits": 27.81666374206543,
      "debug/policy_chosen_logps": -437.5671081542969,
      "debug/policy_rejected_logits": 29.937236785888672,
      "debug/policy_rejected_logps": -429.474853515625,
      "debug/reference_chosen_logps": -440.85504150390625,
      "debug/reference_rejected_logps": -431.23309326171875,
      "epoch": 0.7021276595744681,
      "grad_norm": 5.116678809536897,
      "learning_rate": 1e-06,
      "logits/chosen": 27.81666374206543,
      "logits/rejected": 29.937236785888672,
      "logps/chosen": -437.5671081542969,
      "logps/rejected": -429.474853515625,
      "loss": 0.4783,
      "rewards/accuracies": 0.5,
      "rewards/chosen": 0.03287952393293381,
      "rewards/margins": 0.01529712788760662,
      "rewards/rejected": 0.017582397907972336,
      "step": 33
    },
    {
      "debug/policy_chosen_logits": 32.981014251708984,
      "debug/policy_chosen_logps": -450.77899169921875,
      "debug/policy_rejected_logits": 29.245454788208008,
      "debug/policy_rejected_logps": -421.23468017578125,
      "debug/reference_chosen_logps": -454.22735595703125,
      "debug/reference_rejected_logps": -425.0584716796875,
      "epoch": 0.723404255319149,
      "grad_norm": 5.485713658124459,
      "learning_rate": 1e-06,
      "logits/chosen": 32.981014251708984,
      "logits/rejected": 29.245454788208008,
      "logps/chosen": -450.77899169921875,
      "logps/rejected": -421.23468017578125,
      "loss": 0.4736,
      "rewards/accuracies": 0.625,
      "rewards/chosen": 0.03448398783802986,
      "rewards/margins": -0.003754120320081711,
      "rewards/rejected": 0.038238104432821274,
      "step": 34
    },
    {
      "debug/policy_chosen_logits": 30.9548397064209,
      "debug/policy_chosen_logps": -427.41632080078125,
      "debug/policy_rejected_logits": 29.430871963500977,
      "debug/policy_rejected_logps": -436.83050537109375,
      "debug/reference_chosen_logps": -430.6944580078125,
      "debug/reference_rejected_logps": -433.17041015625,
      "epoch": 0.7446808510638298,
      "grad_norm": 5.73163901812078,
      "learning_rate": 1e-06,
      "logits/chosen": 30.9548397064209,
      "logits/rejected": 29.430871963500977,
      "logps/chosen": -427.41632080078125,
      "logps/rejected": -436.83050537109375,
      "loss": 0.4459,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.03278125822544098,
      "rewards/margins": 0.06938225030899048,
      "rewards/rejected": -0.0366009883582592,
      "step": 35
    },
    {
      "debug/policy_chosen_logits": 28.33571434020996,
      "debug/policy_chosen_logps": -400.690673828125,
      "debug/policy_rejected_logits": 24.813756942749023,
      "debug/policy_rejected_logps": -445.54791259765625,
      "debug/reference_chosen_logps": -404.2062072753906,
      "debug/reference_rejected_logps": -433.4603576660156,
      "epoch": 0.7659574468085106,
      "grad_norm": 5.207209058021427,
      "learning_rate": 1e-06,
      "logits/chosen": 28.33571434020996,
      "logits/rejected": 24.813756942749023,
      "logps/chosen": -400.690673828125,
      "logps/rejected": -445.54791259765625,
      "loss": 0.4546,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.03515518084168434,
      "rewards/margins": 0.15603074431419373,
      "rewards/rejected": -0.12087554484605789,
      "step": 36
    },
    {
      "debug/policy_chosen_logits": 29.32408332824707,
      "debug/policy_chosen_logps": -427.9587707519531,
      "debug/policy_rejected_logits": 27.91067123413086,
      "debug/policy_rejected_logps": -405.4839172363281,
      "debug/reference_chosen_logps": -427.9952392578125,
      "debug/reference_rejected_logps": -407.0904846191406,
      "epoch": 0.7872340425531915,
      "grad_norm": 5.293688440117633,
      "learning_rate": 1e-06,
      "logits/chosen": 29.32408332824707,
      "logits/rejected": 27.91067123413086,
      "logps/chosen": -427.9587707519531,
      "logps/rejected": -405.4839172363281,
      "loss": 0.5106,
      "rewards/accuracies": 0.625,
      "rewards/chosen": 0.00036445818841457367,
      "rewards/margins": -0.01570144295692444,
      "rewards/rejected": 0.01606590300798416,
      "step": 37
    },
    {
      "debug/policy_chosen_logits": 28.01889419555664,
      "debug/policy_chosen_logps": -404.5608825683594,
      "debug/policy_rejected_logits": 28.02815055847168,
      "debug/policy_rejected_logps": -413.2740173339844,
      "debug/reference_chosen_logps": -405.4373779296875,
      "debug/reference_rejected_logps": -406.2366027832031,
      "epoch": 0.8085106382978723,
      "grad_norm": 4.841695423817661,
      "learning_rate": 1e-06,
      "logits/chosen": 28.01889419555664,
      "logits/rejected": 28.02815055847168,
      "logps/chosen": -404.5608825683594,
      "logps/rejected": -413.2740173339844,
      "loss": 0.4719,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.00876510702073574,
      "rewards/margins": 0.07913925498723984,
      "rewards/rejected": -0.07037414610385895,
      "step": 38
    },
    {
      "debug/policy_chosen_logits": 26.369901657104492,
      "debug/policy_chosen_logps": -418.9932556152344,
      "debug/policy_rejected_logits": 24.119754791259766,
      "debug/policy_rejected_logps": -405.0431213378906,
      "debug/reference_chosen_logps": -421.83203125,
      "debug/reference_rejected_logps": -404.04913330078125,
      "epoch": 0.8297872340425532,
      "grad_norm": 5.464322991896784,
      "learning_rate": 1e-06,
      "logits/chosen": 26.369901657104492,
      "logits/rejected": 24.119754791259766,
      "logps/chosen": -418.9932556152344,
      "logps/rejected": -405.0431213378906,
      "loss": 0.4666,
      "rewards/accuracies": 0.625,
      "rewards/chosen": 0.028387565165758133,
      "rewards/margins": 0.03832760080695152,
      "rewards/rejected": -0.009940031915903091,
      "step": 39
    },
    {
      "debug/policy_chosen_logits": 31.222116470336914,
      "debug/policy_chosen_logps": -408.92315673828125,
      "debug/policy_rejected_logits": 30.565526962280273,
      "debug/policy_rejected_logps": -438.00384521484375,
      "debug/reference_chosen_logps": -410.3876037597656,
      "debug/reference_rejected_logps": -428.6034851074219,
      "epoch": 0.851063829787234,
      "grad_norm": 5.087286245449022,
      "learning_rate": 1e-06,
      "logits/chosen": 31.222116470336914,
      "logits/rejected": 30.565526962280273,
      "logps/chosen": -408.92315673828125,
      "logps/rejected": -438.00384521484375,
      "loss": 0.4691,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.014644507318735123,
      "rewards/margins": 0.10864795744419098,
      "rewards/rejected": -0.09400344640016556,
      "step": 40
    },
    {
      "debug/policy_chosen_logits": 29.859107971191406,
      "debug/policy_chosen_logps": -403.02581787109375,
      "debug/policy_rejected_logits": 23.891035079956055,
      "debug/policy_rejected_logps": -395.781005859375,
      "debug/reference_chosen_logps": -409.97967529296875,
      "debug/reference_rejected_logps": -398.636474609375,
      "epoch": 0.8723404255319149,
      "grad_norm": 4.953135237737809,
      "learning_rate": 1e-06,
      "logits/chosen": 29.859107971191406,
      "logits/rejected": 23.891035079956055,
      "logps/chosen": -403.02581787109375,
      "logps/rejected": -395.781005859375,
      "loss": 0.458,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.06953833997249603,
      "rewards/margins": 0.04098331928253174,
      "rewards/rejected": 0.02855503186583519,
      "step": 41
    },
    {
      "debug/policy_chosen_logits": 31.02197265625,
      "debug/policy_chosen_logps": -424.50299072265625,
      "debug/policy_rejected_logits": 29.632427215576172,
      "debug/policy_rejected_logps": -422.94561767578125,
      "debug/reference_chosen_logps": -429.08648681640625,
      "debug/reference_rejected_logps": -424.3019104003906,
      "epoch": 0.8936170212765957,
      "grad_norm": 5.514556663745466,
      "learning_rate": 1e-06,
      "logits/chosen": 31.02197265625,
      "logits/rejected": 29.632427215576172,
      "logps/chosen": -424.50299072265625,
      "logps/rejected": -422.94561767578125,
      "loss": 0.4735,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.04583461582660675,
      "rewards/margins": 0.032271310687065125,
      "rewards/rejected": 0.01356330793350935,
      "step": 42
    },
    {
      "debug/policy_chosen_logits": 32.29981231689453,
      "debug/policy_chosen_logps": -437.9972229003906,
      "debug/policy_rejected_logits": 30.15468978881836,
      "debug/policy_rejected_logps": -440.30535888671875,
      "debug/reference_chosen_logps": -436.08892822265625,
      "debug/reference_rejected_logps": -434.7149353027344,
      "epoch": 0.9148936170212766,
      "grad_norm": 4.808288177536615,
      "learning_rate": 1e-06,
      "logits/chosen": 32.29981231689453,
      "logits/rejected": 30.15468978881836,
      "logps/chosen": -437.9972229003906,
      "logps/rejected": -440.30535888671875,
      "loss": 0.4598,
      "rewards/accuracies": 0.375,
      "rewards/chosen": -0.01908310130238533,
      "rewards/margins": 0.03682101517915726,
      "rewards/rejected": -0.055904120206832886,
      "step": 43
    },
    {
      "debug/policy_chosen_logits": 26.440486907958984,
      "debug/policy_chosen_logps": -391.61572265625,
      "debug/policy_rejected_logits": 29.678592681884766,
      "debug/policy_rejected_logps": -430.9735412597656,
      "debug/reference_chosen_logps": -396.64862060546875,
      "debug/reference_rejected_logps": -434.947998046875,
      "epoch": 0.9361702127659575,
      "grad_norm": 5.150941817603919,
      "learning_rate": 1e-06,
      "logits/chosen": 26.440486907958984,
      "logits/rejected": 29.678592681884766,
      "logps/chosen": -391.61572265625,
      "logps/rejected": -430.9735412597656,
      "loss": 0.4568,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.050329361110925674,
      "rewards/margins": 0.010584792122244835,
      "rewards/rejected": 0.03974456712603569,
      "step": 44
    },
    {
      "debug/policy_chosen_logits": 29.451526641845703,
      "debug/policy_chosen_logps": -425.8896484375,
      "debug/policy_rejected_logits": 32.46401596069336,
      "debug/policy_rejected_logps": -428.1052551269531,
      "debug/reference_chosen_logps": -424.65936279296875,
      "debug/reference_rejected_logps": -427.16961669921875,
      "epoch": 0.9574468085106383,
      "grad_norm": 5.012447008649623,
      "learning_rate": 1e-06,
      "logits/chosen": 29.451526641845703,
      "logits/rejected": 32.46401596069336,
      "logps/chosen": -425.8896484375,
      "logps/rejected": -428.1052551269531,
      "loss": 0.4777,
      "rewards/accuracies": 0.5,
      "rewards/chosen": -0.012303046882152557,
      "rewards/margins": -0.0029468159191310406,
      "rewards/rejected": -0.009356231428682804,
      "step": 45
    },
    {
      "debug/policy_chosen_logits": 31.168346405029297,
      "debug/policy_chosen_logps": -426.1267395019531,
      "debug/policy_rejected_logits": 29.51166534423828,
      "debug/policy_rejected_logps": -444.91766357421875,
      "debug/reference_chosen_logps": -429.6617736816406,
      "debug/reference_rejected_logps": -439.4256591796875,
      "epoch": 0.9787234042553191,
      "grad_norm": 5.030625016447312,
      "learning_rate": 1e-06,
      "logits/chosen": 31.168346405029297,
      "logits/rejected": 29.51166534423828,
      "logps/chosen": -426.1267395019531,
      "logps/rejected": -444.91766357421875,
      "loss": 0.4647,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.03535018861293793,
      "rewards/margins": 0.09027023613452911,
      "rewards/rejected": -0.05492004379630089,
      "step": 46
    },
    {
      "debug/policy_chosen_logits": 28.43193244934082,
      "debug/policy_chosen_logps": -419.4750671386719,
      "debug/policy_rejected_logits": 27.273754119873047,
      "debug/policy_rejected_logps": -438.4751892089844,
      "debug/reference_chosen_logps": -421.2059326171875,
      "debug/reference_rejected_logps": -437.2878723144531,
      "epoch": 1.0,
      "grad_norm": 5.486914446149956,
      "learning_rate": 1e-06,
      "logits/chosen": 28.43193244934082,
      "logits/rejected": 27.273754119873047,
      "logps/chosen": -419.4750671386719,
      "logps/rejected": -438.4751892089844,
      "loss": 0.4596,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.01730876788496971,
      "rewards/margins": 0.02918224036693573,
      "rewards/rejected": -0.011873474344611168,
      "step": 47
    },
    {
      "epoch": 1.0,
      "step": 47,
      "total_flos": 0.0,
      "train_loss": 0.48215872493196044,
      "train_runtime": 474.4995,
      "train_samples_per_second": 6.327,
      "train_steps_per_second": 0.099
    }
  ],
  "logging_steps": 1,
  "max_steps": 47,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}