yiran-wang3 committed on
Commit
52c5948
1 Parent(s): 3532c7f

End of training

Browse files
README.md ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ license: other
4
+ base_model: yiran-wang3/ds_coder_adamw_iter1
5
+ tags:
6
+ - alignment-handbook
7
+ - generated_from_trainer
8
+ - trl
9
+ - dpo
10
+ datasets:
11
+ - self-generate/ds_coder_sppo_hard_new_cn_mining_oj_iter1-binarized
12
+ model-index:
13
+ - name: ds_coder_adamw_iter2
14
+ results: []
15
+ ---
16
+
17
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
18
+ should probably proofread and complete it, then remove this comment. -->
19
+
20
+ # ds_coder_adamw_iter2
21
+
22
+ This model is a fine-tuned version of [yiran-wang3/ds_coder_adamw_iter1](https://huggingface.co/yiran-wang3/ds_coder_adamw_iter1) on the self-generate/ds_coder_sppo_hard_new_cn_mining_oj_iter1-binarized dataset.
23
+
24
+ ## Model description
25
+
26
+ More information needed
27
+
28
+ ## Intended uses & limitations
29
+
30
+ More information needed
31
+
32
+ ## Training and evaluation data
33
+
34
+ More information needed
35
+
36
+ ## Training procedure
37
+
38
+ ### Training hyperparameters
39
+
40
+ The following hyperparameters were used during training:
41
+ - learning_rate: 1e-06
42
+ - train_batch_size: 8
43
+ - eval_batch_size: 4
44
+ - seed: 42
45
+ - distributed_type: multi-GPU
46
+ - num_devices: 8
47
+ - total_train_batch_size: 64
48
+ - total_eval_batch_size: 32
49
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
50
+ - lr_scheduler_type: constant
51
+ - lr_scheduler_warmup_ratio: 0.1
52
+ - lr_scheduler_warmup_steps: 100
53
+ - num_epochs: 1.0
54
+
55
+ ### Training results
56
+
57
+
58
+
59
+ ### Framework versions
60
+
61
+ - Transformers 4.45.0
62
+ - Pytorch 2.4.0+cu121
63
+ - Datasets 2.14.6
64
+ - Tokenizers 0.20.3
all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "total_flos": 0.0,
4
+ "train_loss": 0.48215872493196044,
5
+ "train_runtime": 474.4995,
6
+ "train_samples": 3002,
7
+ "train_samples_per_second": 6.327,
8
+ "train_steps_per_second": 0.099
9
+ }
config.json CHANGED
@@ -25,6 +25,6 @@
25
  "tie_word_embeddings": false,
26
  "torch_dtype": "bfloat16",
27
  "transformers_version": "4.45.0",
28
- "use_cache": false,
29
  "vocab_size": 102400
30
  }
 
25
  "tie_word_embeddings": false,
26
  "torch_dtype": "bfloat16",
27
  "transformers_version": "4.45.0",
28
+ "use_cache": true,
29
  "vocab_size": 102400
30
  }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 100000,
4
+ "eos_token_id": 100015,
5
+ "transformers_version": "4.45.0"
6
+ }
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "total_flos": 0.0,
4
+ "train_loss": 0.48215872493196044,
5
+ "train_runtime": 474.4995,
6
+ "train_samples": 3002,
7
+ "train_samples_per_second": 6.327,
8
+ "train_steps_per_second": 0.099
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,1029 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
+ "eval_steps": 100,
6
+ "global_step": 47,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "debug/policy_chosen_logits": 31.700279235839844,
13
+ "debug/policy_chosen_logps": -434.26495361328125,
14
+ "debug/policy_rejected_logits": 33.99253845214844,
15
+ "debug/policy_rejected_logps": -441.9063720703125,
16
+ "debug/reference_chosen_logps": -434.26495361328125,
17
+ "debug/reference_rejected_logps": -441.9063720703125,
18
+ "epoch": 0.02127659574468085,
19
+ "grad_norm": 5.407328059506411,
20
+ "learning_rate": 1e-06,
21
+ "logits/chosen": 31.700279235839844,
22
+ "logits/rejected": 33.99253845214844,
23
+ "logps/chosen": -434.26495361328125,
24
+ "logps/rejected": -441.9063720703125,
25
+ "loss": 0.5,
26
+ "rewards/accuracies": 0.0,
27
+ "rewards/chosen": 0.0,
28
+ "rewards/margins": 0.0,
29
+ "rewards/rejected": 0.0,
30
+ "step": 1
31
+ },
32
+ {
33
+ "debug/policy_chosen_logits": 27.489120483398438,
34
+ "debug/policy_chosen_logps": -410.28472900390625,
35
+ "debug/policy_rejected_logits": 31.382970809936523,
36
+ "debug/policy_rejected_logps": -435.17218017578125,
37
+ "debug/reference_chosen_logps": -410.96771240234375,
38
+ "debug/reference_rejected_logps": -436.0491638183594,
39
+ "epoch": 0.0425531914893617,
40
+ "grad_norm": 4.951759612240564,
41
+ "learning_rate": 1e-06,
42
+ "logits/chosen": 27.489120483398438,
43
+ "logits/rejected": 31.382970809936523,
44
+ "logps/chosen": -410.28472900390625,
45
+ "logps/rejected": -435.17218017578125,
46
+ "loss": 0.5009,
47
+ "rewards/accuracies": 0.625,
48
+ "rewards/chosen": 0.006829871796071529,
49
+ "rewards/margins": -0.001939887646585703,
50
+ "rewards/rejected": 0.00876975990831852,
51
+ "step": 2
52
+ },
53
+ {
54
+ "debug/policy_chosen_logits": 31.120014190673828,
55
+ "debug/policy_chosen_logps": -401.9127197265625,
56
+ "debug/policy_rejected_logits": 33.329689025878906,
57
+ "debug/policy_rejected_logps": -424.90576171875,
58
+ "debug/reference_chosen_logps": -402.24658203125,
59
+ "debug/reference_rejected_logps": -424.7574462890625,
60
+ "epoch": 0.06382978723404255,
61
+ "grad_norm": 5.411851250945231,
62
+ "learning_rate": 1e-06,
63
+ "logits/chosen": 31.120014190673828,
64
+ "logits/rejected": 33.329689025878906,
65
+ "logps/chosen": -401.9127197265625,
66
+ "logps/rejected": -424.90576171875,
67
+ "loss": 0.5007,
68
+ "rewards/accuracies": 0.5,
69
+ "rewards/chosen": 0.003338394220918417,
70
+ "rewards/margins": 0.004821510519832373,
71
+ "rewards/rejected": -0.001483116764575243,
72
+ "step": 3
73
+ },
74
+ {
75
+ "debug/policy_chosen_logits": 30.066335678100586,
76
+ "debug/policy_chosen_logps": -403.6931457519531,
77
+ "debug/policy_rejected_logits": 33.15522384643555,
78
+ "debug/policy_rejected_logps": -436.77801513671875,
79
+ "debug/reference_chosen_logps": -403.52996826171875,
80
+ "debug/reference_rejected_logps": -436.29296875,
81
+ "epoch": 0.0851063829787234,
82
+ "grad_norm": 6.440647306952527,
83
+ "learning_rate": 1e-06,
84
+ "logits/chosen": 30.066335678100586,
85
+ "logits/rejected": 33.15522384643555,
86
+ "logps/chosen": -403.6931457519531,
87
+ "logps/rejected": -436.77801513671875,
88
+ "loss": 0.4993,
89
+ "rewards/accuracies": 0.75,
90
+ "rewards/chosen": -0.0016318517737090588,
91
+ "rewards/margins": 0.003218421945348382,
92
+ "rewards/rejected": -0.00485027302056551,
93
+ "step": 4
94
+ },
95
+ {
96
+ "debug/policy_chosen_logits": 28.028703689575195,
97
+ "debug/policy_chosen_logps": -391.64715576171875,
98
+ "debug/policy_rejected_logits": 30.672496795654297,
99
+ "debug/policy_rejected_logps": -420.91143798828125,
100
+ "debug/reference_chosen_logps": -392.4825134277344,
101
+ "debug/reference_rejected_logps": -422.0054931640625,
102
+ "epoch": 0.10638297872340426,
103
+ "grad_norm": 5.433518269383661,
104
+ "learning_rate": 1e-06,
105
+ "logits/chosen": 28.028703689575195,
106
+ "logits/rejected": 30.672496795654297,
107
+ "logps/chosen": -391.64715576171875,
108
+ "logps/rejected": -420.91143798828125,
109
+ "loss": 0.4987,
110
+ "rewards/accuracies": 0.25,
111
+ "rewards/chosen": 0.008353347890079021,
112
+ "rewards/margins": -0.0025872797705233097,
113
+ "rewards/rejected": 0.010940628126263618,
114
+ "step": 5
115
+ },
116
+ {
117
+ "debug/policy_chosen_logits": 26.0015811920166,
118
+ "debug/policy_chosen_logps": -403.2935791015625,
119
+ "debug/policy_rejected_logits": 25.2414608001709,
120
+ "debug/policy_rejected_logps": -407.3106994628906,
121
+ "debug/reference_chosen_logps": -402.87139892578125,
122
+ "debug/reference_rejected_logps": -406.1798095703125,
123
+ "epoch": 0.1276595744680851,
124
+ "grad_norm": 5.048275333461177,
125
+ "learning_rate": 1e-06,
126
+ "logits/chosen": 26.0015811920166,
127
+ "logits/rejected": 25.2414608001709,
128
+ "logps/chosen": -403.2935791015625,
129
+ "logps/rejected": -407.3106994628906,
130
+ "loss": 0.496,
131
+ "rewards/accuracies": 0.5,
132
+ "rewards/chosen": -0.0042218780145049095,
133
+ "rewards/margins": 0.007086906582117081,
134
+ "rewards/rejected": -0.01130878459662199,
135
+ "step": 6
136
+ },
137
+ {
138
+ "debug/policy_chosen_logits": 27.019393920898438,
139
+ "debug/policy_chosen_logps": -424.65643310546875,
140
+ "debug/policy_rejected_logits": 29.134994506835938,
141
+ "debug/policy_rejected_logps": -420.3319396972656,
142
+ "debug/reference_chosen_logps": -423.5784912109375,
143
+ "debug/reference_rejected_logps": -420.0720520019531,
144
+ "epoch": 0.14893617021276595,
145
+ "grad_norm": 6.092801689841109,
146
+ "learning_rate": 1e-06,
147
+ "logits/chosen": 27.019393920898438,
148
+ "logits/rejected": 29.134994506835938,
149
+ "logps/chosen": -424.65643310546875,
150
+ "logps/rejected": -420.3319396972656,
151
+ "loss": 0.499,
152
+ "rewards/accuracies": 0.125,
153
+ "rewards/chosen": -0.010779608972370625,
154
+ "rewards/margins": -0.008180923759937286,
155
+ "rewards/rejected": -0.0025986863765865564,
156
+ "step": 7
157
+ },
158
+ {
159
+ "debug/policy_chosen_logits": 25.41461944580078,
160
+ "debug/policy_chosen_logps": -420.9305419921875,
161
+ "debug/policy_rejected_logits": 25.522966384887695,
162
+ "debug/policy_rejected_logps": -427.20623779296875,
163
+ "debug/reference_chosen_logps": -420.0583190917969,
164
+ "debug/reference_rejected_logps": -426.08453369140625,
165
+ "epoch": 0.1702127659574468,
166
+ "grad_norm": 5.248588154856571,
167
+ "learning_rate": 1e-06,
168
+ "logits/chosen": 25.41461944580078,
169
+ "logits/rejected": 25.522966384887695,
170
+ "logps/chosen": -420.9305419921875,
171
+ "logps/rejected": -427.20623779296875,
172
+ "loss": 0.4975,
173
+ "rewards/accuracies": 0.5,
174
+ "rewards/chosen": -0.00872222799807787,
175
+ "rewards/margins": 0.0024948506616055965,
176
+ "rewards/rejected": -0.011217079125344753,
177
+ "step": 8
178
+ },
179
+ {
180
+ "debug/policy_chosen_logits": 27.28373146057129,
181
+ "debug/policy_chosen_logps": -413.857177734375,
182
+ "debug/policy_rejected_logits": 29.01516342163086,
183
+ "debug/policy_rejected_logps": -429.70623779296875,
184
+ "debug/reference_chosen_logps": -413.364501953125,
185
+ "debug/reference_rejected_logps": -429.416259765625,
186
+ "epoch": 0.19148936170212766,
187
+ "grad_norm": 5.836663309727503,
188
+ "learning_rate": 1e-06,
189
+ "logits/chosen": 27.28373146057129,
190
+ "logits/rejected": 29.01516342163086,
191
+ "logps/chosen": -413.857177734375,
192
+ "logps/rejected": -429.70623779296875,
193
+ "loss": 0.4954,
194
+ "rewards/accuracies": 0.375,
195
+ "rewards/chosen": -0.004926986526697874,
196
+ "rewards/margins": -0.002027016133069992,
197
+ "rewards/rejected": -0.002899970393627882,
198
+ "step": 9
199
+ },
200
+ {
201
+ "debug/policy_chosen_logits": 29.55730628967285,
202
+ "debug/policy_chosen_logps": -418.648193359375,
203
+ "debug/policy_rejected_logits": 30.004676818847656,
204
+ "debug/policy_rejected_logps": -430.01788330078125,
205
+ "debug/reference_chosen_logps": -418.4599609375,
206
+ "debug/reference_rejected_logps": -429.8154296875,
207
+ "epoch": 0.2127659574468085,
208
+ "grad_norm": 4.978725766808406,
209
+ "learning_rate": 1e-06,
210
+ "logits/chosen": 29.55730628967285,
211
+ "logits/rejected": 30.004676818847656,
212
+ "logps/chosen": -418.648193359375,
213
+ "logps/rejected": -430.01788330078125,
214
+ "loss": 0.4991,
215
+ "rewards/accuracies": 0.5,
216
+ "rewards/chosen": -0.0018823242280632257,
217
+ "rewards/margins": 0.00014217384159564972,
218
+ "rewards/rejected": -0.002024497603997588,
219
+ "step": 10
220
+ },
221
+ {
222
+ "debug/policy_chosen_logits": 33.0296516418457,
223
+ "debug/policy_chosen_logps": -434.47308349609375,
224
+ "debug/policy_rejected_logits": 31.160263061523438,
225
+ "debug/policy_rejected_logps": -406.6353759765625,
226
+ "debug/reference_chosen_logps": -433.95892333984375,
227
+ "debug/reference_rejected_logps": -405.2964782714844,
228
+ "epoch": 0.23404255319148937,
229
+ "grad_norm": 5.822177618834045,
230
+ "learning_rate": 1e-06,
231
+ "logits/chosen": 33.0296516418457,
232
+ "logits/rejected": 31.160263061523438,
233
+ "logps/chosen": -434.47308349609375,
234
+ "logps/rejected": -406.6353759765625,
235
+ "loss": 0.493,
236
+ "rewards/accuracies": 0.5,
237
+ "rewards/chosen": -0.005142059177160263,
238
+ "rewards/margins": 0.008247108198702335,
239
+ "rewards/rejected": -0.013389168307185173,
240
+ "step": 11
241
+ },
242
+ {
243
+ "debug/policy_chosen_logits": 30.720827102661133,
244
+ "debug/policy_chosen_logps": -455.06597900390625,
245
+ "debug/policy_rejected_logits": 32.45933151245117,
246
+ "debug/policy_rejected_logps": -462.2677307128906,
247
+ "debug/reference_chosen_logps": -454.6126403808594,
248
+ "debug/reference_rejected_logps": -459.7181396484375,
249
+ "epoch": 0.2553191489361702,
250
+ "grad_norm": 5.239137130887116,
251
+ "learning_rate": 1e-06,
252
+ "logits/chosen": 30.720827102661133,
253
+ "logits/rejected": 32.45933151245117,
254
+ "logps/chosen": -455.06597900390625,
255
+ "logps/rejected": -462.2677307128906,
256
+ "loss": 0.4991,
257
+ "rewards/accuracies": 0.75,
258
+ "rewards/chosen": -0.00453338585793972,
259
+ "rewards/margins": 0.020962638780474663,
260
+ "rewards/rejected": -0.025496024638414383,
261
+ "step": 12
262
+ },
263
+ {
264
+ "debug/policy_chosen_logits": 30.186174392700195,
265
+ "debug/policy_chosen_logps": -412.29742431640625,
266
+ "debug/policy_rejected_logits": 28.243711471557617,
267
+ "debug/policy_rejected_logps": -426.9504089355469,
268
+ "debug/reference_chosen_logps": -411.92120361328125,
269
+ "debug/reference_rejected_logps": -425.7698974609375,
270
+ "epoch": 0.2765957446808511,
271
+ "grad_norm": 5.6983956081800855,
272
+ "learning_rate": 1e-06,
273
+ "logits/chosen": 30.186174392700195,
274
+ "logits/rejected": 28.243711471557617,
275
+ "logps/chosen": -412.29742431640625,
276
+ "logps/rejected": -426.9504089355469,
277
+ "loss": 0.4955,
278
+ "rewards/accuracies": 0.75,
279
+ "rewards/chosen": -0.003762359730899334,
280
+ "rewards/margins": 0.008042870089411736,
281
+ "rewards/rejected": -0.011805228888988495,
282
+ "step": 13
283
+ },
284
+ {
285
+ "debug/policy_chosen_logits": 29.8179931640625,
286
+ "debug/policy_chosen_logps": -402.04205322265625,
287
+ "debug/policy_rejected_logits": 27.887521743774414,
288
+ "debug/policy_rejected_logps": -406.5090637207031,
289
+ "debug/reference_chosen_logps": -402.81463623046875,
290
+ "debug/reference_rejected_logps": -406.35760498046875,
291
+ "epoch": 0.2978723404255319,
292
+ "grad_norm": 5.185829515819964,
293
+ "learning_rate": 1e-06,
294
+ "logits/chosen": 29.8179931640625,
295
+ "logits/rejected": 27.887521743774414,
296
+ "logps/chosen": -402.04205322265625,
297
+ "logps/rejected": -406.5090637207031,
298
+ "loss": 0.4892,
299
+ "rewards/accuracies": 0.625,
300
+ "rewards/chosen": 0.007725906558334827,
301
+ "rewards/margins": 0.009240342304110527,
302
+ "rewards/rejected": -0.0015144352801144123,
303
+ "step": 14
304
+ },
305
+ {
306
+ "debug/policy_chosen_logits": 30.058448791503906,
307
+ "debug/policy_chosen_logps": -412.29827880859375,
308
+ "debug/policy_rejected_logits": 29.466854095458984,
309
+ "debug/policy_rejected_logps": -412.73504638671875,
310
+ "debug/reference_chosen_logps": -411.7734680175781,
311
+ "debug/reference_rejected_logps": -413.09912109375,
312
+ "epoch": 0.3191489361702128,
313
+ "grad_norm": 5.147098230721813,
314
+ "learning_rate": 1e-06,
315
+ "logits/chosen": 30.058448791503906,
316
+ "logits/rejected": 29.466854095458984,
317
+ "logps/chosen": -412.29827880859375,
318
+ "logps/rejected": -412.73504638671875,
319
+ "loss": 0.4893,
320
+ "rewards/accuracies": 0.25,
321
+ "rewards/chosen": -0.005248222034424543,
322
+ "rewards/margins": -0.008889121934771538,
323
+ "rewards/rejected": 0.0036408999003469944,
324
+ "step": 15
325
+ },
326
+ {
327
+ "debug/policy_chosen_logits": 26.801280975341797,
328
+ "debug/policy_chosen_logps": -453.10833740234375,
329
+ "debug/policy_rejected_logits": 28.296146392822266,
330
+ "debug/policy_rejected_logps": -433.0950927734375,
331
+ "debug/reference_chosen_logps": -453.86102294921875,
332
+ "debug/reference_rejected_logps": -432.42510986328125,
333
+ "epoch": 0.3404255319148936,
334
+ "grad_norm": 5.3650227932794765,
335
+ "learning_rate": 1e-06,
336
+ "logits/chosen": 26.801280975341797,
337
+ "logits/rejected": 28.296146392822266,
338
+ "logps/chosen": -453.10833740234375,
339
+ "logps/rejected": -433.0950927734375,
340
+ "loss": 0.4907,
341
+ "rewards/accuracies": 0.75,
342
+ "rewards/chosen": 0.007526512257754803,
343
+ "rewards/margins": 0.014225959777832031,
344
+ "rewards/rejected": -0.0066994475200772285,
345
+ "step": 16
346
+ },
347
+ {
348
+ "debug/policy_chosen_logits": 25.992467880249023,
349
+ "debug/policy_chosen_logps": -436.1377258300781,
350
+ "debug/policy_rejected_logits": 27.410860061645508,
351
+ "debug/policy_rejected_logps": -426.43035888671875,
352
+ "debug/reference_chosen_logps": -434.6832275390625,
353
+ "debug/reference_rejected_logps": -424.5072021484375,
354
+ "epoch": 0.3617021276595745,
355
+ "grad_norm": 5.048216336945854,
356
+ "learning_rate": 1e-06,
357
+ "logits/chosen": 25.992467880249023,
358
+ "logits/rejected": 27.410860061645508,
359
+ "logps/chosen": -436.1377258300781,
360
+ "logps/rejected": -426.43035888671875,
361
+ "loss": 0.494,
362
+ "rewards/accuracies": 0.625,
363
+ "rewards/chosen": -0.014545059762895107,
364
+ "rewards/margins": 0.004686659201979637,
365
+ "rewards/rejected": -0.01923171989619732,
366
+ "step": 17
367
+ },
368
+ {
369
+ "debug/policy_chosen_logits": 27.924072265625,
370
+ "debug/policy_chosen_logps": -456.6978759765625,
371
+ "debug/policy_rejected_logits": 27.263843536376953,
372
+ "debug/policy_rejected_logps": -411.67791748046875,
373
+ "debug/reference_chosen_logps": -455.6437683105469,
374
+ "debug/reference_rejected_logps": -408.3628234863281,
375
+ "epoch": 0.3829787234042553,
376
+ "grad_norm": 4.959644897985259,
377
+ "learning_rate": 1e-06,
378
+ "logits/chosen": 27.924072265625,
379
+ "logits/rejected": 27.263843536376953,
380
+ "logps/chosen": -456.6978759765625,
381
+ "logps/rejected": -411.67791748046875,
382
+ "loss": 0.4872,
383
+ "rewards/accuracies": 0.75,
384
+ "rewards/chosen": -0.01054123044013977,
385
+ "rewards/margins": 0.022609787061810493,
386
+ "rewards/rejected": -0.033151015639305115,
387
+ "step": 18
388
+ },
389
+ {
390
+ "debug/policy_chosen_logits": 30.296974182128906,
391
+ "debug/policy_chosen_logps": -407.5791320800781,
392
+ "debug/policy_rejected_logits": 29.760583877563477,
393
+ "debug/policy_rejected_logps": -417.291748046875,
394
+ "debug/reference_chosen_logps": -410.10662841796875,
395
+ "debug/reference_rejected_logps": -418.6151123046875,
396
+ "epoch": 0.40425531914893614,
397
+ "grad_norm": 4.809418559441442,
398
+ "learning_rate": 1e-06,
399
+ "logits/chosen": 30.296974182128906,
400
+ "logits/rejected": 29.760583877563477,
401
+ "logps/chosen": -407.5791320800781,
402
+ "logps/rejected": -417.291748046875,
403
+ "loss": 0.4937,
404
+ "rewards/accuracies": 0.875,
405
+ "rewards/chosen": 0.02527473494410515,
406
+ "rewards/margins": 0.012040939182043076,
407
+ "rewards/rejected": 0.013233794830739498,
408
+ "step": 19
409
+ },
410
+ {
411
+ "debug/policy_chosen_logits": 30.575592041015625,
412
+ "debug/policy_chosen_logps": -413.82574462890625,
413
+ "debug/policy_rejected_logits": 32.98490905761719,
414
+ "debug/policy_rejected_logps": -443.43548583984375,
415
+ "debug/reference_chosen_logps": -414.27642822265625,
416
+ "debug/reference_rejected_logps": -441.5928649902344,
417
+ "epoch": 0.425531914893617,
418
+ "grad_norm": 5.055368747694493,
419
+ "learning_rate": 1e-06,
420
+ "logits/chosen": 30.575592041015625,
421
+ "logits/rejected": 32.98490905761719,
422
+ "logps/chosen": -413.82574462890625,
423
+ "logps/rejected": -443.43548583984375,
424
+ "loss": 0.4771,
425
+ "rewards/accuracies": 0.875,
426
+ "rewards/chosen": 0.0045069498009979725,
427
+ "rewards/margins": 0.02293361723423004,
428
+ "rewards/rejected": -0.018426666036248207,
429
+ "step": 20
430
+ },
431
+ {
432
+ "debug/policy_chosen_logits": 28.962617874145508,
433
+ "debug/policy_chosen_logps": -416.78582763671875,
434
+ "debug/policy_rejected_logits": 31.380332946777344,
435
+ "debug/policy_rejected_logps": -443.6494445800781,
436
+ "debug/reference_chosen_logps": -419.51043701171875,
437
+ "debug/reference_rejected_logps": -442.1171875,
438
+ "epoch": 0.44680851063829785,
439
+ "grad_norm": 5.2399641694392685,
440
+ "learning_rate": 1e-06,
441
+ "logits/chosen": 28.962617874145508,
442
+ "logits/rejected": 31.380332946777344,
443
+ "logps/chosen": -416.78582763671875,
444
+ "logps/rejected": -443.6494445800781,
445
+ "loss": 0.4861,
446
+ "rewards/accuracies": 0.75,
447
+ "rewards/chosen": 0.02724616974592209,
448
+ "rewards/margins": 0.04256858676671982,
449
+ "rewards/rejected": -0.015322417952120304,
450
+ "step": 21
451
+ },
452
+ {
453
+ "debug/policy_chosen_logits": 33.89327621459961,
454
+ "debug/policy_chosen_logps": -439.533203125,
455
+ "debug/policy_rejected_logits": 32.8599853515625,
456
+ "debug/policy_rejected_logps": -468.5189208984375,
457
+ "debug/reference_chosen_logps": -437.9319763183594,
458
+ "debug/reference_rejected_logps": -460.019287109375,
459
+ "epoch": 0.46808510638297873,
460
+ "grad_norm": 5.600180944400873,
461
+ "learning_rate": 1e-06,
462
+ "logits/chosen": 33.89327621459961,
463
+ "logits/rejected": 32.8599853515625,
464
+ "logps/chosen": -439.533203125,
465
+ "logps/rejected": -468.5189208984375,
466
+ "loss": 0.4779,
467
+ "rewards/accuracies": 0.75,
468
+ "rewards/chosen": -0.016012268140912056,
469
+ "rewards/margins": 0.06898414343595505,
470
+ "rewards/rejected": -0.08499641716480255,
471
+ "step": 22
472
+ },
473
+ {
474
+ "debug/policy_chosen_logits": 30.022546768188477,
475
+ "debug/policy_chosen_logps": -448.22540283203125,
476
+ "debug/policy_rejected_logits": 30.50183868408203,
477
+ "debug/policy_rejected_logps": -417.8683776855469,
478
+ "debug/reference_chosen_logps": -448.73858642578125,
479
+ "debug/reference_rejected_logps": -417.9635925292969,
480
+ "epoch": 0.48936170212765956,
481
+ "grad_norm": 5.053771531276715,
482
+ "learning_rate": 1e-06,
483
+ "logits/chosen": 30.022546768188477,
484
+ "logits/rejected": 30.50183868408203,
485
+ "logps/chosen": -448.22540283203125,
486
+ "logps/rejected": -417.8683776855469,
487
+ "loss": 0.4906,
488
+ "rewards/accuracies": 0.5,
489
+ "rewards/chosen": 0.0051317219622433186,
490
+ "rewards/margins": 0.004179535433650017,
491
+ "rewards/rejected": 0.0009521869942545891,
492
+ "step": 23
493
+ },
494
+ {
495
+ "debug/policy_chosen_logits": 31.068572998046875,
496
+ "debug/policy_chosen_logps": -408.3885192871094,
497
+ "debug/policy_rejected_logits": 30.79738426208496,
498
+ "debug/policy_rejected_logps": -432.73651123046875,
499
+ "debug/reference_chosen_logps": -406.73419189453125,
500
+ "debug/reference_rejected_logps": -432.0497131347656,
501
+ "epoch": 0.5106382978723404,
502
+ "grad_norm": 5.120783229464688,
503
+ "learning_rate": 1e-06,
504
+ "logits/chosen": 31.068572998046875,
505
+ "logits/rejected": 30.79738426208496,
506
+ "logps/chosen": -408.3885192871094,
507
+ "logps/rejected": -432.73651123046875,
508
+ "loss": 0.478,
509
+ "rewards/accuracies": 0.375,
510
+ "rewards/chosen": -0.016543272882699966,
511
+ "rewards/margins": -0.009675255045294762,
512
+ "rewards/rejected": -0.006868018768727779,
513
+ "step": 24
514
+ },
515
+ {
516
+ "debug/policy_chosen_logits": 28.878725051879883,
517
+ "debug/policy_chosen_logps": -434.04144287109375,
518
+ "debug/policy_rejected_logits": 30.279621124267578,
519
+ "debug/policy_rejected_logps": -457.3016357421875,
520
+ "debug/reference_chosen_logps": -433.21746826171875,
521
+ "debug/reference_rejected_logps": -453.06280517578125,
522
+ "epoch": 0.5319148936170213,
523
+ "grad_norm": 5.297697739276052,
524
+ "learning_rate": 1e-06,
525
+ "logits/chosen": 28.878725051879883,
526
+ "logits/rejected": 30.279621124267578,
527
+ "logps/chosen": -434.04144287109375,
528
+ "logps/rejected": -457.3016357421875,
529
+ "loss": 0.485,
530
+ "rewards/accuracies": 0.875,
531
+ "rewards/chosen": -0.00823978427797556,
532
+ "rewards/margins": 0.03414863348007202,
533
+ "rewards/rejected": -0.042388420552015305,
534
+ "step": 25
535
+ },
536
+ {
537
+ "debug/policy_chosen_logits": 30.609947204589844,
538
+ "debug/policy_chosen_logps": -402.73504638671875,
539
+ "debug/policy_rejected_logits": 29.12665367126465,
540
+ "debug/policy_rejected_logps": -411.1260986328125,
541
+ "debug/reference_chosen_logps": -404.96392822265625,
542
+ "debug/reference_rejected_logps": -411.49969482421875,
543
+ "epoch": 0.5531914893617021,
544
+ "grad_norm": 5.066192133102437,
545
+ "learning_rate": 1e-06,
546
+ "logits/chosen": 30.609947204589844,
547
+ "logits/rejected": 29.12665367126465,
548
+ "logps/chosen": -402.73504638671875,
549
+ "logps/rejected": -411.1260986328125,
550
+ "loss": 0.4781,
551
+ "rewards/accuracies": 0.75,
552
+ "rewards/chosen": 0.022288817912340164,
553
+ "rewards/margins": 0.01855243556201458,
554
+ "rewards/rejected": 0.003736380487680435,
555
+ "step": 26
556
+ },
557
+ {
558
+ "debug/policy_chosen_logits": 26.225852966308594,
559
+ "debug/policy_chosen_logps": -434.0633544921875,
560
+ "debug/policy_rejected_logits": 27.547882080078125,
561
+ "debug/policy_rejected_logps": -460.3682861328125,
562
+ "debug/reference_chosen_logps": -434.0380859375,
563
+ "debug/reference_rejected_logps": -457.2252197265625,
564
+ "epoch": 0.574468085106383,
565
+ "grad_norm": 5.199359660864606,
566
+ "learning_rate": 1e-06,
567
+ "logits/chosen": 26.225852966308594,
568
+ "logits/rejected": 27.547882080078125,
569
+ "logps/chosen": -434.0633544921875,
570
+ "logps/rejected": -460.3682861328125,
571
+ "loss": 0.4775,
572
+ "rewards/accuracies": 0.625,
573
+ "rewards/chosen": -0.0002528773620724678,
574
+ "rewards/margins": 0.031177710741758347,
575
+ "rewards/rejected": -0.03143058717250824,
576
+ "step": 27
577
+ },
578
+ {
579
+ "debug/policy_chosen_logits": 34.810482025146484,
580
+ "debug/policy_chosen_logps": -432.9176025390625,
581
+ "debug/policy_rejected_logits": 32.29673385620117,
582
+ "debug/policy_rejected_logps": -435.6657409667969,
583
+ "debug/reference_chosen_logps": -433.37603759765625,
584
+ "debug/reference_rejected_logps": -432.38958740234375,
585
+ "epoch": 0.5957446808510638,
586
+ "grad_norm": 5.456307945174751,
587
+ "learning_rate": 1e-06,
588
+ "logits/chosen": 34.810482025146484,
589
+ "logits/rejected": 32.29673385620117,
590
+ "logps/chosen": -432.9176025390625,
591
+ "logps/rejected": -435.6657409667969,
592
+ "loss": 0.4635,
593
+ "rewards/accuracies": 0.625,
594
+ "rewards/chosen": 0.004584426060318947,
595
+ "rewards/margins": 0.03734596073627472,
596
+ "rewards/rejected": -0.03276153653860092,
597
+ "step": 28
598
+ },
599
+ {
600
+ "debug/policy_chosen_logits": 28.973360061645508,
601
+ "debug/policy_chosen_logps": -432.0859375,
602
+ "debug/policy_rejected_logits": 27.616941452026367,
603
+ "debug/policy_rejected_logps": -419.5810546875,
604
+ "debug/reference_chosen_logps": -432.6524658203125,
605
+ "debug/reference_rejected_logps": -413.3448181152344,
606
+ "epoch": 0.6170212765957447,
607
+ "grad_norm": 4.889651674840413,
608
+ "learning_rate": 1e-06,
609
+ "logits/chosen": 28.973360061645508,
610
+ "logits/rejected": 27.616941452026367,
611
+ "logps/chosen": -432.0859375,
612
+ "logps/rejected": -419.5810546875,
613
+ "loss": 0.4789,
614
+ "rewards/accuracies": 0.625,
615
+ "rewards/chosen": 0.005664861761033535,
616
+ "rewards/margins": 0.06802742183208466,
617
+ "rewards/rejected": -0.062362559139728546,
618
+ "step": 29
619
+ },
620
+ {
621
+ "debug/policy_chosen_logits": 28.820457458496094,
622
+ "debug/policy_chosen_logps": -419.7233581542969,
623
+ "debug/policy_rejected_logits": 30.256000518798828,
624
+ "debug/policy_rejected_logps": -422.4107971191406,
625
+ "debug/reference_chosen_logps": -423.3680725097656,
626
+ "debug/reference_rejected_logps": -421.3091125488281,
627
+ "epoch": 0.6382978723404256,
628
+ "grad_norm": 5.137030977785722,
629
+ "learning_rate": 1e-06,
630
+ "logits/chosen": 28.820457458496094,
631
+ "logits/rejected": 30.256000518798828,
632
+ "logps/chosen": -419.7233581542969,
633
+ "logps/rejected": -422.4107971191406,
634
+ "loss": 0.4823,
635
+ "rewards/accuracies": 0.625,
636
+ "rewards/chosen": 0.03644702956080437,
637
+ "rewards/margins": 0.047463756054639816,
638
+ "rewards/rejected": -0.011016730219125748,
639
+ "step": 30
640
+ },
641
+ {
642
+ "debug/policy_chosen_logits": 29.283926010131836,
643
+ "debug/policy_chosen_logps": -392.496826171875,
644
+ "debug/policy_rejected_logits": 31.77328109741211,
645
+ "debug/policy_rejected_logps": -434.51806640625,
646
+ "debug/reference_chosen_logps": -395.81146240234375,
647
+ "debug/reference_rejected_logps": -434.8221435546875,
648
+ "epoch": 0.6595744680851063,
649
+ "grad_norm": 4.951189622094444,
650
+ "learning_rate": 1e-06,
651
+ "logits/chosen": 29.283926010131836,
652
+ "logits/rejected": 31.77328109741211,
653
+ "logps/chosen": -392.496826171875,
654
+ "logps/rejected": -434.51806640625,
655
+ "loss": 0.4638,
656
+ "rewards/accuracies": 0.5,
657
+ "rewards/chosen": 0.033146705478429794,
658
+ "rewards/margins": 0.03010578267276287,
659
+ "rewards/rejected": 0.003040926530957222,
660
+ "step": 31
661
+ },
662
+ {
663
+ "debug/policy_chosen_logits": 29.353422164916992,
664
+ "debug/policy_chosen_logps": -414.32415771484375,
665
+ "debug/policy_rejected_logits": 30.822248458862305,
666
+ "debug/policy_rejected_logps": -430.6376037597656,
667
+ "debug/reference_chosen_logps": -415.54888916015625,
668
+ "debug/reference_rejected_logps": -431.1400146484375,
669
+ "epoch": 0.6808510638297872,
670
+ "grad_norm": 4.999575032095535,
671
+ "learning_rate": 1e-06,
672
+ "logits/chosen": 29.353422164916992,
673
+ "logits/rejected": 30.822248458862305,
674
+ "logps/chosen": -414.32415771484375,
675
+ "logps/rejected": -430.6376037597656,
676
+ "loss": 0.4834,
677
+ "rewards/accuracies": 0.625,
678
+ "rewards/chosen": 0.012247240170836449,
679
+ "rewards/margins": 0.007223015185445547,
680
+ "rewards/rejected": 0.00502422172576189,
681
+ "step": 32
682
+ },
683
+ {
684
+ "debug/policy_chosen_logits": 27.81666374206543,
685
+ "debug/policy_chosen_logps": -437.5671081542969,
686
+ "debug/policy_rejected_logits": 29.937236785888672,
687
+ "debug/policy_rejected_logps": -429.474853515625,
688
+ "debug/reference_chosen_logps": -440.85504150390625,
689
+ "debug/reference_rejected_logps": -431.23309326171875,
690
+ "epoch": 0.7021276595744681,
691
+ "grad_norm": 5.116678809536897,
692
+ "learning_rate": 1e-06,
693
+ "logits/chosen": 27.81666374206543,
694
+ "logits/rejected": 29.937236785888672,
695
+ "logps/chosen": -437.5671081542969,
696
+ "logps/rejected": -429.474853515625,
697
+ "loss": 0.4783,
698
+ "rewards/accuracies": 0.5,
699
+ "rewards/chosen": 0.03287952393293381,
700
+ "rewards/margins": 0.01529712788760662,
701
+ "rewards/rejected": 0.017582397907972336,
702
+ "step": 33
703
+ },
704
+ {
705
+ "debug/policy_chosen_logits": 32.981014251708984,
706
+ "debug/policy_chosen_logps": -450.77899169921875,
707
+ "debug/policy_rejected_logits": 29.245454788208008,
708
+ "debug/policy_rejected_logps": -421.23468017578125,
709
+ "debug/reference_chosen_logps": -454.22735595703125,
710
+ "debug/reference_rejected_logps": -425.0584716796875,
711
+ "epoch": 0.723404255319149,
712
+ "grad_norm": 5.485713658124459,
713
+ "learning_rate": 1e-06,
714
+ "logits/chosen": 32.981014251708984,
715
+ "logits/rejected": 29.245454788208008,
716
+ "logps/chosen": -450.77899169921875,
717
+ "logps/rejected": -421.23468017578125,
718
+ "loss": 0.4736,
719
+ "rewards/accuracies": 0.625,
720
+ "rewards/chosen": 0.03448398783802986,
721
+ "rewards/margins": -0.003754120320081711,
722
+ "rewards/rejected": 0.038238104432821274,
723
+ "step": 34
724
+ },
725
+ {
726
+ "debug/policy_chosen_logits": 30.9548397064209,
727
+ "debug/policy_chosen_logps": -427.41632080078125,
728
+ "debug/policy_rejected_logits": 29.430871963500977,
729
+ "debug/policy_rejected_logps": -436.83050537109375,
730
+ "debug/reference_chosen_logps": -430.6944580078125,
731
+ "debug/reference_rejected_logps": -433.17041015625,
732
+ "epoch": 0.7446808510638298,
733
+ "grad_norm": 5.73163901812078,
734
+ "learning_rate": 1e-06,
735
+ "logits/chosen": 30.9548397064209,
736
+ "logits/rejected": 29.430871963500977,
737
+ "logps/chosen": -427.41632080078125,
738
+ "logps/rejected": -436.83050537109375,
739
+ "loss": 0.4459,
740
+ "rewards/accuracies": 0.875,
741
+ "rewards/chosen": 0.03278125822544098,
742
+ "rewards/margins": 0.06938225030899048,
743
+ "rewards/rejected": -0.0366009883582592,
744
+ "step": 35
745
+ },
746
+ {
747
+ "debug/policy_chosen_logits": 28.33571434020996,
748
+ "debug/policy_chosen_logps": -400.690673828125,
749
+ "debug/policy_rejected_logits": 24.813756942749023,
750
+ "debug/policy_rejected_logps": -445.54791259765625,
751
+ "debug/reference_chosen_logps": -404.2062072753906,
752
+ "debug/reference_rejected_logps": -433.4603576660156,
753
+ "epoch": 0.7659574468085106,
754
+ "grad_norm": 5.207209058021427,
755
+ "learning_rate": 1e-06,
756
+ "logits/chosen": 28.33571434020996,
757
+ "logits/rejected": 24.813756942749023,
758
+ "logps/chosen": -400.690673828125,
759
+ "logps/rejected": -445.54791259765625,
760
+ "loss": 0.4546,
761
+ "rewards/accuracies": 0.875,
762
+ "rewards/chosen": 0.03515518084168434,
763
+ "rewards/margins": 0.15603074431419373,
764
+ "rewards/rejected": -0.12087554484605789,
765
+ "step": 36
766
+ },
767
+ {
768
+ "debug/policy_chosen_logits": 29.32408332824707,
769
+ "debug/policy_chosen_logps": -427.9587707519531,
770
+ "debug/policy_rejected_logits": 27.91067123413086,
771
+ "debug/policy_rejected_logps": -405.4839172363281,
772
+ "debug/reference_chosen_logps": -427.9952392578125,
773
+ "debug/reference_rejected_logps": -407.0904846191406,
774
+ "epoch": 0.7872340425531915,
775
+ "grad_norm": 5.293688440117633,
776
+ "learning_rate": 1e-06,
777
+ "logits/chosen": 29.32408332824707,
778
+ "logits/rejected": 27.91067123413086,
779
+ "logps/chosen": -427.9587707519531,
780
+ "logps/rejected": -405.4839172363281,
781
+ "loss": 0.5106,
782
+ "rewards/accuracies": 0.625,
783
+ "rewards/chosen": 0.00036445818841457367,
784
+ "rewards/margins": -0.01570144295692444,
785
+ "rewards/rejected": 0.01606590300798416,
786
+ "step": 37
787
+ },
788
+ {
789
+ "debug/policy_chosen_logits": 28.01889419555664,
790
+ "debug/policy_chosen_logps": -404.5608825683594,
791
+ "debug/policy_rejected_logits": 28.02815055847168,
792
+ "debug/policy_rejected_logps": -413.2740173339844,
793
+ "debug/reference_chosen_logps": -405.4373779296875,
794
+ "debug/reference_rejected_logps": -406.2366027832031,
795
+ "epoch": 0.8085106382978723,
796
+ "grad_norm": 4.841695423817661,
797
+ "learning_rate": 1e-06,
798
+ "logits/chosen": 28.01889419555664,
799
+ "logits/rejected": 28.02815055847168,
800
+ "logps/chosen": -404.5608825683594,
801
+ "logps/rejected": -413.2740173339844,
802
+ "loss": 0.4719,
803
+ "rewards/accuracies": 0.875,
804
+ "rewards/chosen": 0.00876510702073574,
805
+ "rewards/margins": 0.07913925498723984,
806
+ "rewards/rejected": -0.07037414610385895,
807
+ "step": 38
808
+ },
809
+ {
810
+ "debug/policy_chosen_logits": 26.369901657104492,
811
+ "debug/policy_chosen_logps": -418.9932556152344,
812
+ "debug/policy_rejected_logits": 24.119754791259766,
813
+ "debug/policy_rejected_logps": -405.0431213378906,
814
+ "debug/reference_chosen_logps": -421.83203125,
815
+ "debug/reference_rejected_logps": -404.04913330078125,
816
+ "epoch": 0.8297872340425532,
817
+ "grad_norm": 5.464322991896784,
818
+ "learning_rate": 1e-06,
819
+ "logits/chosen": 26.369901657104492,
820
+ "logits/rejected": 24.119754791259766,
821
+ "logps/chosen": -418.9932556152344,
822
+ "logps/rejected": -405.0431213378906,
823
+ "loss": 0.4666,
824
+ "rewards/accuracies": 0.625,
825
+ "rewards/chosen": 0.028387565165758133,
826
+ "rewards/margins": 0.03832760080695152,
827
+ "rewards/rejected": -0.009940031915903091,
828
+ "step": 39
829
+ },
830
+ {
831
+ "debug/policy_chosen_logits": 31.222116470336914,
832
+ "debug/policy_chosen_logps": -408.92315673828125,
833
+ "debug/policy_rejected_logits": 30.565526962280273,
834
+ "debug/policy_rejected_logps": -438.00384521484375,
835
+ "debug/reference_chosen_logps": -410.3876037597656,
836
+ "debug/reference_rejected_logps": -428.6034851074219,
837
+ "epoch": 0.851063829787234,
838
+ "grad_norm": 5.087286245449022,
839
+ "learning_rate": 1e-06,
840
+ "logits/chosen": 31.222116470336914,
841
+ "logits/rejected": 30.565526962280273,
842
+ "logps/chosen": -408.92315673828125,
843
+ "logps/rejected": -438.00384521484375,
844
+ "loss": 0.4691,
845
+ "rewards/accuracies": 0.75,
846
+ "rewards/chosen": 0.014644507318735123,
847
+ "rewards/margins": 0.10864795744419098,
848
+ "rewards/rejected": -0.09400344640016556,
849
+ "step": 40
850
+ },
851
+ {
852
+ "debug/policy_chosen_logits": 29.859107971191406,
853
+ "debug/policy_chosen_logps": -403.02581787109375,
854
+ "debug/policy_rejected_logits": 23.891035079956055,
855
+ "debug/policy_rejected_logps": -395.781005859375,
856
+ "debug/reference_chosen_logps": -409.97967529296875,
857
+ "debug/reference_rejected_logps": -398.636474609375,
858
+ "epoch": 0.8723404255319149,
859
+ "grad_norm": 4.953135237737809,
860
+ "learning_rate": 1e-06,
861
+ "logits/chosen": 29.859107971191406,
862
+ "logits/rejected": 23.891035079956055,
863
+ "logps/chosen": -403.02581787109375,
864
+ "logps/rejected": -395.781005859375,
865
+ "loss": 0.458,
866
+ "rewards/accuracies": 0.75,
867
+ "rewards/chosen": 0.06953833997249603,
868
+ "rewards/margins": 0.04098331928253174,
869
+ "rewards/rejected": 0.02855503186583519,
870
+ "step": 41
871
+ },
872
+ {
873
+ "debug/policy_chosen_logits": 31.02197265625,
874
+ "debug/policy_chosen_logps": -424.50299072265625,
875
+ "debug/policy_rejected_logits": 29.632427215576172,
876
+ "debug/policy_rejected_logps": -422.94561767578125,
877
+ "debug/reference_chosen_logps": -429.08648681640625,
878
+ "debug/reference_rejected_logps": -424.3019104003906,
879
+ "epoch": 0.8936170212765957,
880
+ "grad_norm": 5.514556663745466,
881
+ "learning_rate": 1e-06,
882
+ "logits/chosen": 31.02197265625,
883
+ "logits/rejected": 29.632427215576172,
884
+ "logps/chosen": -424.50299072265625,
885
+ "logps/rejected": -422.94561767578125,
886
+ "loss": 0.4735,
887
+ "rewards/accuracies": 0.75,
888
+ "rewards/chosen": 0.04583461582660675,
889
+ "rewards/margins": 0.032271310687065125,
890
+ "rewards/rejected": 0.01356330793350935,
891
+ "step": 42
892
+ },
893
+ {
894
+ "debug/policy_chosen_logits": 32.29981231689453,
895
+ "debug/policy_chosen_logps": -437.9972229003906,
896
+ "debug/policy_rejected_logits": 30.15468978881836,
897
+ "debug/policy_rejected_logps": -440.30535888671875,
898
+ "debug/reference_chosen_logps": -436.08892822265625,
899
+ "debug/reference_rejected_logps": -434.7149353027344,
900
+ "epoch": 0.9148936170212766,
901
+ "grad_norm": 4.808288177536615,
902
+ "learning_rate": 1e-06,
903
+ "logits/chosen": 32.29981231689453,
904
+ "logits/rejected": 30.15468978881836,
905
+ "logps/chosen": -437.9972229003906,
906
+ "logps/rejected": -440.30535888671875,
907
+ "loss": 0.4598,
908
+ "rewards/accuracies": 0.375,
909
+ "rewards/chosen": -0.01908310130238533,
910
+ "rewards/margins": 0.03682101517915726,
911
+ "rewards/rejected": -0.055904120206832886,
912
+ "step": 43
913
+ },
914
+ {
915
+ "debug/policy_chosen_logits": 26.440486907958984,
916
+ "debug/policy_chosen_logps": -391.61572265625,
917
+ "debug/policy_rejected_logits": 29.678592681884766,
918
+ "debug/policy_rejected_logps": -430.9735412597656,
919
+ "debug/reference_chosen_logps": -396.64862060546875,
920
+ "debug/reference_rejected_logps": -434.947998046875,
921
+ "epoch": 0.9361702127659575,
922
+ "grad_norm": 5.150941817603919,
923
+ "learning_rate": 1e-06,
924
+ "logits/chosen": 26.440486907958984,
925
+ "logits/rejected": 29.678592681884766,
926
+ "logps/chosen": -391.61572265625,
927
+ "logps/rejected": -430.9735412597656,
928
+ "loss": 0.4568,
929
+ "rewards/accuracies": 0.75,
930
+ "rewards/chosen": 0.050329361110925674,
931
+ "rewards/margins": 0.010584792122244835,
932
+ "rewards/rejected": 0.03974456712603569,
933
+ "step": 44
934
+ },
935
+ {
936
+ "debug/policy_chosen_logits": 29.451526641845703,
937
+ "debug/policy_chosen_logps": -425.8896484375,
938
+ "debug/policy_rejected_logits": 32.46401596069336,
939
+ "debug/policy_rejected_logps": -428.1052551269531,
940
+ "debug/reference_chosen_logps": -424.65936279296875,
941
+ "debug/reference_rejected_logps": -427.16961669921875,
942
+ "epoch": 0.9574468085106383,
943
+ "grad_norm": 5.012447008649623,
944
+ "learning_rate": 1e-06,
945
+ "logits/chosen": 29.451526641845703,
946
+ "logits/rejected": 32.46401596069336,
947
+ "logps/chosen": -425.8896484375,
948
+ "logps/rejected": -428.1052551269531,
949
+ "loss": 0.4777,
950
+ "rewards/accuracies": 0.5,
951
+ "rewards/chosen": -0.012303046882152557,
952
+ "rewards/margins": -0.0029468159191310406,
953
+ "rewards/rejected": -0.009356231428682804,
954
+ "step": 45
955
+ },
956
+ {
957
+ "debug/policy_chosen_logits": 31.168346405029297,
958
+ "debug/policy_chosen_logps": -426.1267395019531,
959
+ "debug/policy_rejected_logits": 29.51166534423828,
960
+ "debug/policy_rejected_logps": -444.91766357421875,
961
+ "debug/reference_chosen_logps": -429.6617736816406,
962
+ "debug/reference_rejected_logps": -439.4256591796875,
963
+ "epoch": 0.9787234042553191,
964
+ "grad_norm": 5.030625016447312,
965
+ "learning_rate": 1e-06,
966
+ "logits/chosen": 31.168346405029297,
967
+ "logits/rejected": 29.51166534423828,
968
+ "logps/chosen": -426.1267395019531,
969
+ "logps/rejected": -444.91766357421875,
970
+ "loss": 0.4647,
971
+ "rewards/accuracies": 1.0,
972
+ "rewards/chosen": 0.03535018861293793,
973
+ "rewards/margins": 0.09027023613452911,
974
+ "rewards/rejected": -0.05492004379630089,
975
+ "step": 46
976
+ },
977
+ {
978
+ "debug/policy_chosen_logits": 28.43193244934082,
979
+ "debug/policy_chosen_logps": -419.4750671386719,
980
+ "debug/policy_rejected_logits": 27.273754119873047,
981
+ "debug/policy_rejected_logps": -438.4751892089844,
982
+ "debug/reference_chosen_logps": -421.2059326171875,
983
+ "debug/reference_rejected_logps": -437.2878723144531,
984
+ "epoch": 1.0,
985
+ "grad_norm": 5.486914446149956,
986
+ "learning_rate": 1e-06,
987
+ "logits/chosen": 28.43193244934082,
988
+ "logits/rejected": 27.273754119873047,
989
+ "logps/chosen": -419.4750671386719,
990
+ "logps/rejected": -438.4751892089844,
991
+ "loss": 0.4596,
992
+ "rewards/accuracies": 0.75,
993
+ "rewards/chosen": 0.01730876788496971,
994
+ "rewards/margins": 0.02918224036693573,
995
+ "rewards/rejected": -0.011873474344611168,
996
+ "step": 47
997
+ },
998
+ {
999
+ "epoch": 1.0,
1000
+ "step": 47,
1001
+ "total_flos": 0.0,
1002
+ "train_loss": 0.48215872493196044,
1003
+ "train_runtime": 474.4995,
1004
+ "train_samples_per_second": 6.327,
1005
+ "train_steps_per_second": 0.099
1006
+ }
1007
+ ],
1008
+ "logging_steps": 1,
1009
+ "max_steps": 47,
1010
+ "num_input_tokens_seen": 0,
1011
+ "num_train_epochs": 1,
1012
+ "save_steps": 500,
1013
+ "stateful_callbacks": {
1014
+ "TrainerControl": {
1015
+ "args": {
1016
+ "should_epoch_stop": false,
1017
+ "should_evaluate": false,
1018
+ "should_log": false,
1019
+ "should_save": true,
1020
+ "should_training_stop": true
1021
+ },
1022
+ "attributes": {}
1023
+ }
1024
+ },
1025
+ "total_flos": 0.0,
1026
+ "train_batch_size": 8,
1027
+ "trial_name": null,
1028
+ "trial_params": null
1029
+ }