yiran-wang3 committed on
Commit
791eae6
1 Parent(s): bd54ecb

End of training

Browse files
README.md ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ license: other
4
+ base_model: yiran-wang3/ds_coder6.7b_adamw_iter1
5
+ tags:
6
+ - alignment-handbook
7
+ - generated_from_trainer
8
+ - trl
9
+ - dpo
10
+ datasets:
11
+ - self-generate/ds_coder6.7b_sppo_hard_new_cn_mining_oj_iter1-binarized
12
+ model-index:
13
+ - name: ds_coder6.7b_adamw_iter2
14
+ results: []
15
+ ---
16
+
17
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
18
+ should probably proofread and complete it, then remove this comment. -->
19
+
20
+ # ds_coder6.7b_adamw_iter2
21
+
22
+ This model is a fine-tuned version of [yiran-wang3/ds_coder6.7b_adamw_iter1](https://huggingface.co/yiran-wang3/ds_coder6.7b_adamw_iter1) on the `self-generate/ds_coder6.7b_sppo_hard_new_cn_mining_oj_iter1-binarized` dataset.
23
+
24
+ ## Model description
25
+
26
+ More information needed
27
+
28
+ ## Intended uses & limitations
29
+
30
+ More information needed
31
+
32
+ ## Training and evaluation data
33
+
34
+ More information needed
35
+
36
+ ## Training procedure
37
+
38
+ ### Training hyperparameters
39
+
40
+ The following hyperparameters were used during training:
41
+ - learning_rate: 1e-06
42
+ - train_batch_size: 8
43
+ - eval_batch_size: 4
44
+ - seed: 42
45
+ - distributed_type: multi-GPU
46
+ - num_devices: 8
47
+ - total_train_batch_size: 64
48
+ - total_eval_batch_size: 32
49
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
50
+ - lr_scheduler_type: constant
51
+ - lr_scheduler_warmup_ratio: 0.1
52
+ - lr_scheduler_warmup_steps: 100
53
+ - num_epochs: 1.0
54
+
55
+ ### Training results
56
+
57
+
58
+
59
+ ### Framework versions
60
+
61
+ - Transformers 4.45.0
62
+ - Pytorch 2.4.0+cu121
63
+ - Datasets 2.14.6
64
+ - Tokenizers 0.20.3
all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "total_flos": 0.0,
4
+ "train_loss": 0.4809455207238595,
5
+ "train_runtime": 486.3354,
6
+ "train_samples": 3050,
7
+ "train_samples_per_second": 6.271,
8
+ "train_steps_per_second": 0.099
9
+ }
config.json CHANGED
@@ -25,6 +25,6 @@
25
  "tie_word_embeddings": false,
26
  "torch_dtype": "bfloat16",
27
  "transformers_version": "4.45.0",
28
- "use_cache": false,
29
  "vocab_size": 102400
30
  }
 
25
  "tie_word_embeddings": false,
26
  "torch_dtype": "bfloat16",
27
  "transformers_version": "4.45.0",
28
+ "use_cache": true,
29
  "vocab_size": 102400
30
  }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 100000,
4
+ "eos_token_id": 100015,
5
+ "transformers_version": "4.45.0"
6
+ }
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "total_flos": 0.0,
4
+ "train_loss": 0.4809455207238595,
5
+ "train_runtime": 486.3354,
6
+ "train_samples": 3050,
7
+ "train_samples_per_second": 6.271,
8
+ "train_steps_per_second": 0.099
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,1050 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
+ "eval_steps": 100,
6
+ "global_step": 48,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "debug/policy_chosen_logits": 32.32257843017578,
13
+ "debug/policy_chosen_logps": -428.3227844238281,
14
+ "debug/policy_rejected_logits": 30.82423210144043,
15
+ "debug/policy_rejected_logps": -426.3280029296875,
16
+ "debug/reference_chosen_logps": -428.3227844238281,
17
+ "debug/reference_rejected_logps": -426.3280029296875,
18
+ "epoch": 0.020833333333333332,
19
+ "grad_norm": 6.447932275019588,
20
+ "learning_rate": 1e-06,
21
+ "logits/chosen": 32.32257843017578,
22
+ "logits/rejected": 30.82423210144043,
23
+ "logps/chosen": -428.3227844238281,
24
+ "logps/rejected": -426.3280029296875,
25
+ "loss": 0.5,
26
+ "rewards/accuracies": 0.0,
27
+ "rewards/chosen": 0.0,
28
+ "rewards/margins": 0.0,
29
+ "rewards/rejected": 0.0,
30
+ "step": 1
31
+ },
32
+ {
33
+ "debug/policy_chosen_logits": 31.886442184448242,
34
+ "debug/policy_chosen_logps": -418.96539306640625,
35
+ "debug/policy_rejected_logits": 29.381662368774414,
36
+ "debug/policy_rejected_logps": -429.94775390625,
37
+ "debug/reference_chosen_logps": -418.5838623046875,
38
+ "debug/reference_rejected_logps": -429.845703125,
39
+ "epoch": 0.041666666666666664,
40
+ "grad_norm": 5.235639772033975,
41
+ "learning_rate": 1e-06,
42
+ "logits/chosen": 31.886442184448242,
43
+ "logits/rejected": 29.381662368774414,
44
+ "logps/chosen": -418.96539306640625,
45
+ "logps/rejected": -429.94775390625,
46
+ "loss": 0.4988,
47
+ "rewards/accuracies": 0.625,
48
+ "rewards/chosen": -0.00381515477783978,
49
+ "rewards/margins": -0.0027948375791311264,
50
+ "rewards/rejected": -0.001020316849462688,
51
+ "step": 2
52
+ },
53
+ {
54
+ "debug/policy_chosen_logits": 31.19687843322754,
55
+ "debug/policy_chosen_logps": -417.73297119140625,
56
+ "debug/policy_rejected_logits": 29.822526931762695,
57
+ "debug/policy_rejected_logps": -424.8887939453125,
58
+ "debug/reference_chosen_logps": -416.591796875,
59
+ "debug/reference_rejected_logps": -424.60333251953125,
60
+ "epoch": 0.0625,
61
+ "grad_norm": 5.035431279495221,
62
+ "learning_rate": 1e-06,
63
+ "logits/chosen": 31.19687843322754,
64
+ "logits/rejected": 29.822526931762695,
65
+ "logps/chosen": -417.73297119140625,
66
+ "logps/rejected": -424.8887939453125,
67
+ "loss": 0.5013,
68
+ "rewards/accuracies": 0.125,
69
+ "rewards/chosen": -0.011411934159696102,
70
+ "rewards/margins": -0.008557281456887722,
71
+ "rewards/rejected": -0.002854652237147093,
72
+ "step": 3
73
+ },
74
+ {
75
+ "debug/policy_chosen_logits": 31.30547523498535,
76
+ "debug/policy_chosen_logps": -409.65521240234375,
77
+ "debug/policy_rejected_logits": 31.427326202392578,
78
+ "debug/policy_rejected_logps": -419.1351013183594,
79
+ "debug/reference_chosen_logps": -410.0574951171875,
80
+ "debug/reference_rejected_logps": -419.167724609375,
81
+ "epoch": 0.08333333333333333,
82
+ "grad_norm": 4.947716251775631,
83
+ "learning_rate": 1e-06,
84
+ "logits/chosen": 31.30547523498535,
85
+ "logits/rejected": 31.427326202392578,
86
+ "logps/chosen": -409.65521240234375,
87
+ "logps/rejected": -419.1351013183594,
88
+ "loss": 0.4997,
89
+ "rewards/accuracies": 0.5,
90
+ "rewards/chosen": 0.0040228646248579025,
91
+ "rewards/margins": 0.0036964411847293377,
92
+ "rewards/rejected": 0.00032642344012856483,
93
+ "step": 4
94
+ },
95
+ {
96
+ "debug/policy_chosen_logits": 26.84766387939453,
97
+ "debug/policy_chosen_logps": -419.80255126953125,
98
+ "debug/policy_rejected_logits": 25.351024627685547,
99
+ "debug/policy_rejected_logps": -403.54278564453125,
100
+ "debug/reference_chosen_logps": -419.79803466796875,
101
+ "debug/reference_rejected_logps": -403.3204040527344,
102
+ "epoch": 0.10416666666666667,
103
+ "grad_norm": 5.437518779758296,
104
+ "learning_rate": 1e-06,
105
+ "logits/chosen": 26.84766387939453,
106
+ "logits/rejected": 25.351024627685547,
107
+ "logps/chosen": -419.80255126953125,
108
+ "logps/rejected": -403.54278564453125,
109
+ "loss": 0.5011,
110
+ "rewards/accuracies": 0.5,
111
+ "rewards/chosen": -4.535634070634842e-05,
112
+ "rewards/margins": 0.0021785348653793335,
113
+ "rewards/rejected": -0.0022238921374082565,
114
+ "step": 5
115
+ },
116
+ {
117
+ "debug/policy_chosen_logits": 28.140277862548828,
118
+ "debug/policy_chosen_logps": -435.39581298828125,
119
+ "debug/policy_rejected_logits": 29.35475730895996,
120
+ "debug/policy_rejected_logps": -438.25970458984375,
121
+ "debug/reference_chosen_logps": -435.1541748046875,
122
+ "debug/reference_rejected_logps": -436.6419677734375,
123
+ "epoch": 0.125,
124
+ "grad_norm": 5.3788783078874625,
125
+ "learning_rate": 1e-06,
126
+ "logits/chosen": 28.140277862548828,
127
+ "logits/rejected": 29.35475730895996,
128
+ "logps/chosen": -435.39581298828125,
129
+ "logps/rejected": -438.25970458984375,
130
+ "loss": 0.4935,
131
+ "rewards/accuracies": 0.875,
132
+ "rewards/chosen": -0.002416381612420082,
133
+ "rewards/margins": 0.013761366717517376,
134
+ "rewards/rejected": -0.016177749261260033,
135
+ "step": 6
136
+ },
137
+ {
138
+ "debug/policy_chosen_logits": 32.219444274902344,
139
+ "debug/policy_chosen_logps": -422.5959167480469,
140
+ "debug/policy_rejected_logits": 33.60882568359375,
141
+ "debug/policy_rejected_logps": -429.6781311035156,
142
+ "debug/reference_chosen_logps": -422.28350830078125,
143
+ "debug/reference_rejected_logps": -429.510009765625,
144
+ "epoch": 0.14583333333333334,
145
+ "grad_norm": 5.089230644656757,
146
+ "learning_rate": 1e-06,
147
+ "logits/chosen": 32.219444274902344,
148
+ "logits/rejected": 33.60882568359375,
149
+ "logps/chosen": -422.5959167480469,
150
+ "logps/rejected": -429.6781311035156,
151
+ "loss": 0.499,
152
+ "rewards/accuracies": 0.625,
153
+ "rewards/chosen": -0.0031237031798809767,
154
+ "rewards/margins": -0.0014425661647692323,
155
+ "rewards/rejected": -0.0016811370151117444,
156
+ "step": 7
157
+ },
158
+ {
159
+ "debug/policy_chosen_logits": 34.4580192565918,
160
+ "debug/policy_chosen_logps": -425.48052978515625,
161
+ "debug/policy_rejected_logits": 34.77037048339844,
162
+ "debug/policy_rejected_logps": -458.2679138183594,
163
+ "debug/reference_chosen_logps": -425.6324462890625,
164
+ "debug/reference_rejected_logps": -455.59991455078125,
165
+ "epoch": 0.16666666666666666,
166
+ "grad_norm": 6.722332782296524,
167
+ "learning_rate": 1e-06,
168
+ "logits/chosen": 34.4580192565918,
169
+ "logits/rejected": 34.77037048339844,
170
+ "logps/chosen": -425.48052978515625,
171
+ "logps/rejected": -458.2679138183594,
172
+ "loss": 0.4938,
173
+ "rewards/accuracies": 0.875,
174
+ "rewards/chosen": 0.0015194700099527836,
175
+ "rewards/margins": 0.02819930762052536,
176
+ "rewards/rejected": -0.026679838076233864,
177
+ "step": 8
178
+ },
179
+ {
180
+ "debug/policy_chosen_logits": 29.579063415527344,
181
+ "debug/policy_chosen_logps": -397.6627502441406,
182
+ "debug/policy_rejected_logits": 29.510604858398438,
183
+ "debug/policy_rejected_logps": -423.0687255859375,
184
+ "debug/reference_chosen_logps": -398.0777282714844,
185
+ "debug/reference_rejected_logps": -422.35186767578125,
186
+ "epoch": 0.1875,
187
+ "grad_norm": 5.150446773896013,
188
+ "learning_rate": 1e-06,
189
+ "logits/chosen": 29.579063415527344,
190
+ "logits/rejected": 29.510604858398438,
191
+ "logps/chosen": -397.6627502441406,
192
+ "logps/rejected": -423.0687255859375,
193
+ "loss": 0.4984,
194
+ "rewards/accuracies": 0.625,
195
+ "rewards/chosen": 0.004149818792939186,
196
+ "rewards/margins": 0.011318511329591274,
197
+ "rewards/rejected": -0.007168693467974663,
198
+ "step": 9
199
+ },
200
+ {
201
+ "debug/policy_chosen_logits": 30.510459899902344,
202
+ "debug/policy_chosen_logps": -406.46258544921875,
203
+ "debug/policy_rejected_logits": 32.81418228149414,
204
+ "debug/policy_rejected_logps": -420.8358154296875,
205
+ "debug/reference_chosen_logps": -405.9990539550781,
206
+ "debug/reference_rejected_logps": -419.8515930175781,
207
+ "epoch": 0.20833333333333334,
208
+ "grad_norm": 5.292871534847833,
209
+ "learning_rate": 1e-06,
210
+ "logits/chosen": 30.510459899902344,
211
+ "logits/rejected": 32.81418228149414,
212
+ "logps/chosen": -406.46258544921875,
213
+ "logps/rejected": -420.8358154296875,
214
+ "loss": 0.4965,
215
+ "rewards/accuracies": 0.375,
216
+ "rewards/chosen": -0.004634971730411053,
217
+ "rewards/margins": 0.005207213573157787,
218
+ "rewards/rejected": -0.009842186234891415,
219
+ "step": 10
220
+ },
221
+ {
222
+ "debug/policy_chosen_logits": 29.758987426757812,
223
+ "debug/policy_chosen_logps": -421.1949462890625,
224
+ "debug/policy_rejected_logits": 30.991361618041992,
225
+ "debug/policy_rejected_logps": -440.055908203125,
226
+ "debug/reference_chosen_logps": -420.9813537597656,
227
+ "debug/reference_rejected_logps": -440.38330078125,
228
+ "epoch": 0.22916666666666666,
229
+ "grad_norm": 5.257332824569119,
230
+ "learning_rate": 1e-06,
231
+ "logits/chosen": 29.758987426757812,
232
+ "logits/rejected": 30.991361618041992,
233
+ "logps/chosen": -421.1949462890625,
234
+ "logps/rejected": -440.055908203125,
235
+ "loss": 0.4898,
236
+ "rewards/accuracies": 0.5,
237
+ "rewards/chosen": -0.002135887276381254,
238
+ "rewards/margins": -0.005409927107393742,
239
+ "rewards/rejected": 0.003274040063843131,
240
+ "step": 11
241
+ },
242
+ {
243
+ "debug/policy_chosen_logits": 32.33553695678711,
244
+ "debug/policy_chosen_logps": -442.6135559082031,
245
+ "debug/policy_rejected_logits": 32.11320114135742,
246
+ "debug/policy_rejected_logps": -444.33697509765625,
247
+ "debug/reference_chosen_logps": -441.39215087890625,
248
+ "debug/reference_rejected_logps": -443.31390380859375,
249
+ "epoch": 0.25,
250
+ "grad_norm": 6.359103183966066,
251
+ "learning_rate": 1e-06,
252
+ "logits/chosen": 32.33553695678711,
253
+ "logits/rejected": 32.11320114135742,
254
+ "logps/chosen": -442.6135559082031,
255
+ "logps/rejected": -444.33697509765625,
256
+ "loss": 0.4997,
257
+ "rewards/accuracies": 0.625,
258
+ "rewards/chosen": -0.01221416424959898,
259
+ "rewards/margins": -0.001983833033591509,
260
+ "rewards/rejected": -0.010230331681668758,
261
+ "step": 12
262
+ },
263
+ {
264
+ "debug/policy_chosen_logits": 29.988950729370117,
265
+ "debug/policy_chosen_logps": -425.54913330078125,
266
+ "debug/policy_rejected_logits": 30.10309410095215,
267
+ "debug/policy_rejected_logps": -426.21905517578125,
268
+ "debug/reference_chosen_logps": -426.2005920410156,
269
+ "debug/reference_rejected_logps": -424.59246826171875,
270
+ "epoch": 0.2708333333333333,
271
+ "grad_norm": 5.288806638849553,
272
+ "learning_rate": 1e-06,
273
+ "logits/chosen": 29.988950729370117,
274
+ "logits/rejected": 30.10309410095215,
275
+ "logps/chosen": -425.54913330078125,
276
+ "logps/rejected": -426.21905517578125,
277
+ "loss": 0.4925,
278
+ "rewards/accuracies": 0.625,
279
+ "rewards/chosen": 0.006514891982078552,
280
+ "rewards/margins": 0.022780990228056908,
281
+ "rewards/rejected": -0.016266098245978355,
282
+ "step": 13
283
+ },
284
+ {
285
+ "debug/policy_chosen_logits": 31.317047119140625,
286
+ "debug/policy_chosen_logps": -419.84283447265625,
287
+ "debug/policy_rejected_logits": 32.35562515258789,
288
+ "debug/policy_rejected_logps": -399.378173828125,
289
+ "debug/reference_chosen_logps": -421.95867919921875,
290
+ "debug/reference_rejected_logps": -399.3148193359375,
291
+ "epoch": 0.2916666666666667,
292
+ "grad_norm": 5.154675497648274,
293
+ "learning_rate": 1e-06,
294
+ "logits/chosen": 31.317047119140625,
295
+ "logits/rejected": 32.35562515258789,
296
+ "logps/chosen": -419.84283447265625,
297
+ "logps/rejected": -399.378173828125,
298
+ "loss": 0.4906,
299
+ "rewards/accuracies": 0.75,
300
+ "rewards/chosen": 0.021158332005143166,
301
+ "rewards/margins": 0.021792029961943626,
302
+ "rewards/rejected": -0.0006336974911391735,
303
+ "step": 14
304
+ },
305
+ {
306
+ "debug/policy_chosen_logits": 29.34722328186035,
307
+ "debug/policy_chosen_logps": -394.68389892578125,
308
+ "debug/policy_rejected_logits": 27.714096069335938,
309
+ "debug/policy_rejected_logps": -414.51715087890625,
310
+ "debug/reference_chosen_logps": -395.9772033691406,
311
+ "debug/reference_rejected_logps": -414.28857421875,
312
+ "epoch": 0.3125,
313
+ "grad_norm": 5.191304228736527,
314
+ "learning_rate": 1e-06,
315
+ "logits/chosen": 29.34722328186035,
316
+ "logits/rejected": 27.714096069335938,
317
+ "logps/chosen": -394.68389892578125,
318
+ "logps/rejected": -414.51715087890625,
319
+ "loss": 0.4844,
320
+ "rewards/accuracies": 0.625,
321
+ "rewards/chosen": 0.012933043763041496,
322
+ "rewards/margins": 0.015218695625662804,
323
+ "rewards/rejected": -0.0022856518626213074,
324
+ "step": 15
325
+ },
326
+ {
327
+ "debug/policy_chosen_logits": 29.499605178833008,
328
+ "debug/policy_chosen_logps": -416.2149658203125,
329
+ "debug/policy_rejected_logits": 31.734092712402344,
330
+ "debug/policy_rejected_logps": -420.2775573730469,
331
+ "debug/reference_chosen_logps": -417.6040954589844,
332
+ "debug/reference_rejected_logps": -421.9405822753906,
333
+ "epoch": 0.3333333333333333,
334
+ "grad_norm": 5.336647699178971,
335
+ "learning_rate": 1e-06,
336
+ "logits/chosen": 29.499605178833008,
337
+ "logits/rejected": 31.734092712402344,
338
+ "logps/chosen": -416.2149658203125,
339
+ "logps/rejected": -420.2775573730469,
340
+ "loss": 0.4909,
341
+ "rewards/accuracies": 0.25,
342
+ "rewards/chosen": 0.013891411013901234,
343
+ "rewards/margins": -0.002739105373620987,
344
+ "rewards/rejected": 0.016630517318844795,
345
+ "step": 16
346
+ },
347
+ {
348
+ "debug/policy_chosen_logits": 30.53592872619629,
349
+ "debug/policy_chosen_logps": -424.2174987792969,
350
+ "debug/policy_rejected_logits": 28.613372802734375,
351
+ "debug/policy_rejected_logps": -424.45184326171875,
352
+ "debug/reference_chosen_logps": -425.0408020019531,
353
+ "debug/reference_rejected_logps": -424.396728515625,
354
+ "epoch": 0.3541666666666667,
355
+ "grad_norm": 5.069961718294066,
356
+ "learning_rate": 1e-06,
357
+ "logits/chosen": 30.53592872619629,
358
+ "logits/rejected": 28.613372802734375,
359
+ "logps/chosen": -424.2174987792969,
360
+ "logps/rejected": -424.45184326171875,
361
+ "loss": 0.4845,
362
+ "rewards/accuracies": 0.375,
363
+ "rewards/chosen": 0.008232764899730682,
364
+ "rewards/margins": 0.00878402590751648,
365
+ "rewards/rejected": -0.000551263801753521,
366
+ "step": 17
367
+ },
368
+ {
369
+ "debug/policy_chosen_logits": 32.6837272644043,
370
+ "debug/policy_chosen_logps": -439.52508544921875,
371
+ "debug/policy_rejected_logits": 30.927047729492188,
372
+ "debug/policy_rejected_logps": -429.91094970703125,
373
+ "debug/reference_chosen_logps": -441.05828857421875,
374
+ "debug/reference_rejected_logps": -431.7398376464844,
375
+ "epoch": 0.375,
376
+ "grad_norm": 5.035721183912941,
377
+ "learning_rate": 1e-06,
378
+ "logits/chosen": 32.6837272644043,
379
+ "logits/rejected": 30.927047729492188,
380
+ "logps/chosen": -439.52508544921875,
381
+ "logps/rejected": -429.91094970703125,
382
+ "loss": 0.5027,
383
+ "rewards/accuracies": 0.375,
384
+ "rewards/chosen": 0.015331955626606941,
385
+ "rewards/margins": -0.0029569631442427635,
386
+ "rewards/rejected": 0.01828891783952713,
387
+ "step": 18
388
+ },
389
+ {
390
+ "debug/policy_chosen_logits": 30.297531127929688,
391
+ "debug/policy_chosen_logps": -402.0390319824219,
392
+ "debug/policy_rejected_logits": 33.57741165161133,
393
+ "debug/policy_rejected_logps": -410.6739501953125,
394
+ "debug/reference_chosen_logps": -401.3625793457031,
395
+ "debug/reference_rejected_logps": -410.6376037597656,
396
+ "epoch": 0.3958333333333333,
397
+ "grad_norm": 5.398638220710165,
398
+ "learning_rate": 1e-06,
399
+ "logits/chosen": 30.297531127929688,
400
+ "logits/rejected": 33.57741165161133,
401
+ "logps/chosen": -402.0390319824219,
402
+ "logps/rejected": -410.6739501953125,
403
+ "loss": 0.4776,
404
+ "rewards/accuracies": 0.25,
405
+ "rewards/chosen": -0.00676448829472065,
406
+ "rewards/margins": -0.006401100195944309,
407
+ "rewards/rejected": -0.0003633880987763405,
408
+ "step": 19
409
+ },
410
+ {
411
+ "debug/policy_chosen_logits": 31.950498580932617,
412
+ "debug/policy_chosen_logps": -414.8414306640625,
413
+ "debug/policy_rejected_logits": 33.85417938232422,
414
+ "debug/policy_rejected_logps": -425.31268310546875,
415
+ "debug/reference_chosen_logps": -415.87957763671875,
416
+ "debug/reference_rejected_logps": -425.8223876953125,
417
+ "epoch": 0.4166666666666667,
418
+ "grad_norm": 5.034205804652484,
419
+ "learning_rate": 1e-06,
420
+ "logits/chosen": 31.950498580932617,
421
+ "logits/rejected": 33.85417938232422,
422
+ "logps/chosen": -414.8414306640625,
423
+ "logps/rejected": -425.31268310546875,
424
+ "loss": 0.4901,
425
+ "rewards/accuracies": 0.625,
426
+ "rewards/chosen": 0.010381774976849556,
427
+ "rewards/margins": 0.005284767597913742,
428
+ "rewards/rejected": 0.0050970083102583885,
429
+ "step": 20
430
+ },
431
+ {
432
+ "debug/policy_chosen_logits": 31.606367111206055,
433
+ "debug/policy_chosen_logps": -415.55767822265625,
434
+ "debug/policy_rejected_logits": 32.42195510864258,
435
+ "debug/policy_rejected_logps": -409.7369384765625,
436
+ "debug/reference_chosen_logps": -417.48565673828125,
437
+ "debug/reference_rejected_logps": -411.9869689941406,
438
+ "epoch": 0.4375,
439
+ "grad_norm": 4.815275729217639,
440
+ "learning_rate": 1e-06,
441
+ "logits/chosen": 31.606367111206055,
442
+ "logits/rejected": 32.42195510864258,
443
+ "logps/chosen": -415.55767822265625,
444
+ "logps/rejected": -409.7369384765625,
445
+ "loss": 0.4884,
446
+ "rewards/accuracies": 0.5,
447
+ "rewards/chosen": 0.01928028091788292,
448
+ "rewards/margins": -0.0032197567634284496,
449
+ "rewards/rejected": 0.022500038146972656,
450
+ "step": 21
451
+ },
452
+ {
453
+ "debug/policy_chosen_logits": 30.70886993408203,
454
+ "debug/policy_chosen_logps": -433.744873046875,
455
+ "debug/policy_rejected_logits": 26.203298568725586,
456
+ "debug/policy_rejected_logps": -415.767822265625,
457
+ "debug/reference_chosen_logps": -431.77203369140625,
458
+ "debug/reference_rejected_logps": -414.41796875,
459
+ "epoch": 0.4583333333333333,
460
+ "grad_norm": 5.219226631225296,
461
+ "learning_rate": 1e-06,
462
+ "logits/chosen": 30.70886993408203,
463
+ "logits/rejected": 26.203298568725586,
464
+ "logps/chosen": -433.744873046875,
465
+ "logps/rejected": -415.767822265625,
466
+ "loss": 0.4748,
467
+ "rewards/accuracies": 0.25,
468
+ "rewards/chosen": -0.019728660583496094,
469
+ "rewards/margins": -0.006230468861758709,
470
+ "rewards/rejected": -0.01349819079041481,
471
+ "step": 22
472
+ },
473
+ {
474
+ "debug/policy_chosen_logits": 30.573122024536133,
475
+ "debug/policy_chosen_logps": -420.0906677246094,
476
+ "debug/policy_rejected_logits": 31.557247161865234,
477
+ "debug/policy_rejected_logps": -405.26837158203125,
478
+ "debug/reference_chosen_logps": -422.8019714355469,
479
+ "debug/reference_rejected_logps": -406.9261474609375,
480
+ "epoch": 0.4791666666666667,
481
+ "grad_norm": 4.90573818429262,
482
+ "learning_rate": 1e-06,
483
+ "logits/chosen": 30.573122024536133,
484
+ "logits/rejected": 31.557247161865234,
485
+ "logps/chosen": -420.0906677246094,
486
+ "logps/rejected": -405.26837158203125,
487
+ "loss": 0.4845,
488
+ "rewards/accuracies": 0.625,
489
+ "rewards/chosen": 0.027113037183880806,
490
+ "rewards/margins": 0.010535049252212048,
491
+ "rewards/rejected": 0.016577987000346184,
492
+ "step": 23
493
+ },
494
+ {
495
+ "debug/policy_chosen_logits": 28.54072380065918,
496
+ "debug/policy_chosen_logps": -429.51544189453125,
497
+ "debug/policy_rejected_logits": 28.603498458862305,
498
+ "debug/policy_rejected_logps": -448.6463623046875,
499
+ "debug/reference_chosen_logps": -429.3318176269531,
500
+ "debug/reference_rejected_logps": -445.1684875488281,
501
+ "epoch": 0.5,
502
+ "grad_norm": 4.919240212142553,
503
+ "learning_rate": 1e-06,
504
+ "logits/chosen": 28.54072380065918,
505
+ "logits/rejected": 28.603498458862305,
506
+ "logps/chosen": -429.51544189453125,
507
+ "logps/rejected": -448.6463623046875,
508
+ "loss": 0.4835,
509
+ "rewards/accuracies": 0.75,
510
+ "rewards/chosen": -0.0018361280672252178,
511
+ "rewards/margins": 0.03294284641742706,
512
+ "rewards/rejected": -0.03477897495031357,
513
+ "step": 24
514
+ },
515
+ {
516
+ "debug/policy_chosen_logits": 32.81056594848633,
517
+ "debug/policy_chosen_logps": -443.39068603515625,
518
+ "debug/policy_rejected_logits": 30.637916564941406,
519
+ "debug/policy_rejected_logps": -425.4708251953125,
520
+ "debug/reference_chosen_logps": -439.9627685546875,
521
+ "debug/reference_rejected_logps": -420.35076904296875,
522
+ "epoch": 0.5208333333333334,
523
+ "grad_norm": 5.047798852836749,
524
+ "learning_rate": 1e-06,
525
+ "logits/chosen": 32.81056594848633,
526
+ "logits/rejected": 30.637916564941406,
527
+ "logps/chosen": -443.39068603515625,
528
+ "logps/rejected": -425.4708251953125,
529
+ "loss": 0.4832,
530
+ "rewards/accuracies": 0.625,
531
+ "rewards/chosen": -0.03427928686141968,
532
+ "rewards/margins": 0.016921542584896088,
533
+ "rewards/rejected": -0.051200829446315765,
534
+ "step": 25
535
+ },
536
+ {
537
+ "debug/policy_chosen_logits": 29.654489517211914,
538
+ "debug/policy_chosen_logps": -403.1080322265625,
539
+ "debug/policy_rejected_logits": 29.024642944335938,
540
+ "debug/policy_rejected_logps": -421.46270751953125,
541
+ "debug/reference_chosen_logps": -404.3522644042969,
542
+ "debug/reference_rejected_logps": -419.66693115234375,
543
+ "epoch": 0.5416666666666666,
544
+ "grad_norm": 4.941664000787305,
545
+ "learning_rate": 1e-06,
546
+ "logits/chosen": 29.654489517211914,
547
+ "logits/rejected": 29.024642944335938,
548
+ "logps/chosen": -403.1080322265625,
549
+ "logps/rejected": -421.46270751953125,
550
+ "loss": 0.4751,
551
+ "rewards/accuracies": 0.625,
552
+ "rewards/chosen": 0.012442246079444885,
553
+ "rewards/margins": 0.030400237068533897,
554
+ "rewards/rejected": -0.01795799285173416,
555
+ "step": 26
556
+ },
557
+ {
558
+ "debug/policy_chosen_logits": 28.045677185058594,
559
+ "debug/policy_chosen_logps": -403.36614990234375,
560
+ "debug/policy_rejected_logits": 30.181791305541992,
561
+ "debug/policy_rejected_logps": -423.99591064453125,
562
+ "debug/reference_chosen_logps": -405.4366149902344,
563
+ "debug/reference_rejected_logps": -424.10650634765625,
564
+ "epoch": 0.5625,
565
+ "grad_norm": 5.109404536265639,
566
+ "learning_rate": 1e-06,
567
+ "logits/chosen": 28.045677185058594,
568
+ "logits/rejected": 30.181791305541992,
569
+ "logps/chosen": -403.36614990234375,
570
+ "logps/rejected": -423.99591064453125,
571
+ "loss": 0.4726,
572
+ "rewards/accuracies": 0.625,
573
+ "rewards/chosen": 0.020704690366983414,
574
+ "rewards/margins": 0.019598999992012978,
575
+ "rewards/rejected": 0.0011056894436478615,
576
+ "step": 27
577
+ },
578
+ {
579
+ "debug/policy_chosen_logits": 30.849773406982422,
580
+ "debug/policy_chosen_logps": -421.11431884765625,
581
+ "debug/policy_rejected_logits": 30.653217315673828,
582
+ "debug/policy_rejected_logps": -440.32244873046875,
583
+ "debug/reference_chosen_logps": -418.60089111328125,
584
+ "debug/reference_rejected_logps": -439.1358642578125,
585
+ "epoch": 0.5833333333333334,
586
+ "grad_norm": 5.1018343354077205,
587
+ "learning_rate": 1e-06,
588
+ "logits/chosen": 30.849773406982422,
589
+ "logits/rejected": 30.653217315673828,
590
+ "logps/chosen": -421.11431884765625,
591
+ "logps/rejected": -440.32244873046875,
592
+ "loss": 0.4788,
593
+ "rewards/accuracies": 0.25,
594
+ "rewards/chosen": -0.025134162977337837,
595
+ "rewards/margins": -0.013268355280160904,
596
+ "rewards/rejected": -0.011865806765854359,
597
+ "step": 28
598
+ },
599
+ {
600
+ "debug/policy_chosen_logits": 32.95792770385742,
601
+ "debug/policy_chosen_logps": -438.89129638671875,
602
+ "debug/policy_rejected_logits": 32.24089813232422,
603
+ "debug/policy_rejected_logps": -445.365966796875,
604
+ "debug/reference_chosen_logps": -440.4767761230469,
605
+ "debug/reference_rejected_logps": -440.51837158203125,
606
+ "epoch": 0.6041666666666666,
607
+ "grad_norm": 4.632495681187026,
608
+ "learning_rate": 1e-06,
609
+ "logits/chosen": 32.95792770385742,
610
+ "logits/rejected": 32.24089813232422,
611
+ "logps/chosen": -438.89129638671875,
612
+ "logps/rejected": -445.365966796875,
613
+ "loss": 0.4649,
614
+ "rewards/accuracies": 0.75,
615
+ "rewards/chosen": 0.015854567289352417,
616
+ "rewards/margins": 0.06433063000440598,
617
+ "rewards/rejected": -0.04847606271505356,
618
+ "step": 29
619
+ },
620
+ {
621
+ "debug/policy_chosen_logits": 31.374120712280273,
622
+ "debug/policy_chosen_logps": -412.0467529296875,
623
+ "debug/policy_rejected_logits": 30.670644760131836,
624
+ "debug/policy_rejected_logps": -437.80059814453125,
625
+ "debug/reference_chosen_logps": -414.59442138671875,
626
+ "debug/reference_rejected_logps": -433.7508544921875,
627
+ "epoch": 0.625,
628
+ "grad_norm": 5.135137765814023,
629
+ "learning_rate": 1e-06,
630
+ "logits/chosen": 31.374120712280273,
631
+ "logits/rejected": 30.670644760131836,
632
+ "logps/chosen": -412.0467529296875,
633
+ "logps/rejected": -437.80059814453125,
634
+ "loss": 0.4928,
635
+ "rewards/accuracies": 0.875,
636
+ "rewards/chosen": 0.025476723909378052,
637
+ "rewards/margins": 0.06597384810447693,
638
+ "rewards/rejected": -0.040497127920389175,
639
+ "step": 30
640
+ },
641
+ {
642
+ "debug/policy_chosen_logits": 28.232507705688477,
643
+ "debug/policy_chosen_logps": -414.2076721191406,
644
+ "debug/policy_rejected_logits": 27.216352462768555,
645
+ "debug/policy_rejected_logps": -437.1291809082031,
646
+ "debug/reference_chosen_logps": -417.13665771484375,
647
+ "debug/reference_rejected_logps": -431.5104064941406,
648
+ "epoch": 0.6458333333333334,
649
+ "grad_norm": 9.586730066675056,
650
+ "learning_rate": 1e-06,
651
+ "logits/chosen": 28.232507705688477,
652
+ "logits/rejected": 27.216352462768555,
653
+ "logps/chosen": -414.2076721191406,
654
+ "logps/rejected": -437.1291809082031,
655
+ "loss": 0.4718,
656
+ "rewards/accuracies": 0.5,
657
+ "rewards/chosen": 0.029290199279785156,
658
+ "rewards/margins": 0.08547790348529816,
659
+ "rewards/rejected": -0.0561877004802227,
660
+ "step": 31
661
+ },
662
+ {
663
+ "debug/policy_chosen_logits": 31.265499114990234,
664
+ "debug/policy_chosen_logps": -415.2708740234375,
665
+ "debug/policy_rejected_logits": 30.586397171020508,
666
+ "debug/policy_rejected_logps": -420.286376953125,
667
+ "debug/reference_chosen_logps": -415.6477966308594,
668
+ "debug/reference_rejected_logps": -419.3028259277344,
669
+ "epoch": 0.6666666666666666,
670
+ "grad_norm": 5.088803834381108,
671
+ "learning_rate": 1e-06,
672
+ "logits/chosen": 31.265499114990234,
673
+ "logits/rejected": 30.586397171020508,
674
+ "logps/chosen": -415.2708740234375,
675
+ "logps/rejected": -420.286376953125,
676
+ "loss": 0.4781,
677
+ "rewards/accuracies": 0.5,
678
+ "rewards/chosen": 0.003769032657146454,
679
+ "rewards/margins": 0.013604126870632172,
680
+ "rewards/rejected": -0.00983509048819542,
681
+ "step": 32
682
+ },
683
+ {
684
+ "debug/policy_chosen_logits": 28.746450424194336,
685
+ "debug/policy_chosen_logps": -418.0946350097656,
686
+ "debug/policy_rejected_logits": 30.699132919311523,
687
+ "debug/policy_rejected_logps": -422.5447998046875,
688
+ "debug/reference_chosen_logps": -421.5987243652344,
689
+ "debug/reference_rejected_logps": -423.47491455078125,
690
+ "epoch": 0.6875,
691
+ "grad_norm": 5.2883474953854925,
692
+ "learning_rate": 1e-06,
693
+ "logits/chosen": 28.746450424194336,
694
+ "logits/rejected": 30.699132919311523,
695
+ "logps/chosen": -418.0946350097656,
696
+ "logps/rejected": -422.5447998046875,
697
+ "loss": 0.4925,
698
+ "rewards/accuracies": 0.5,
699
+ "rewards/chosen": 0.03504092991352081,
700
+ "rewards/margins": 0.02574004977941513,
701
+ "rewards/rejected": 0.009300878271460533,
702
+ "step": 33
703
+ },
704
+ {
705
+ "debug/policy_chosen_logits": 32.71680450439453,
706
+ "debug/policy_chosen_logps": -405.2165222167969,
707
+ "debug/policy_rejected_logits": 29.466472625732422,
708
+ "debug/policy_rejected_logps": -449.44921875,
709
+ "debug/reference_chosen_logps": -407.29217529296875,
710
+ "debug/reference_rejected_logps": -445.6148681640625,
711
+ "epoch": 0.7083333333333334,
712
+ "grad_norm": 5.307136992049977,
713
+ "learning_rate": 1e-06,
714
+ "logits/chosen": 32.71680450439453,
715
+ "logits/rejected": 29.466472625732422,
716
+ "logps/chosen": -405.2165222167969,
717
+ "logps/rejected": -449.44921875,
718
+ "loss": 0.4894,
719
+ "rewards/accuracies": 0.625,
720
+ "rewards/chosen": 0.020756645128130913,
721
+ "rewards/margins": 0.05910034105181694,
722
+ "rewards/rejected": -0.03834369778633118,
723
+ "step": 34
724
+ },
725
+ {
726
+ "debug/policy_chosen_logits": 25.695634841918945,
727
+ "debug/policy_chosen_logps": -404.29071044921875,
728
+ "debug/policy_rejected_logits": 27.330564498901367,
729
+ "debug/policy_rejected_logps": -454.2277526855469,
730
+ "debug/reference_chosen_logps": -405.4884338378906,
731
+ "debug/reference_rejected_logps": -449.79266357421875,
732
+ "epoch": 0.7291666666666666,
733
+ "grad_norm": 4.968733221673012,
734
+ "learning_rate": 1e-06,
735
+ "logits/chosen": 25.695634841918945,
736
+ "logits/rejected": 27.330564498901367,
737
+ "logps/chosen": -404.29071044921875,
738
+ "logps/rejected": -454.2277526855469,
739
+ "loss": 0.446,
740
+ "rewards/accuracies": 0.75,
741
+ "rewards/chosen": 0.011977346614003181,
742
+ "rewards/margins": 0.05632827803492546,
743
+ "rewards/rejected": -0.04435092955827713,
744
+ "step": 35
745
+ },
746
+ {
747
+ "debug/policy_chosen_logits": 27.627330780029297,
748
+ "debug/policy_chosen_logps": -420.744873046875,
749
+ "debug/policy_rejected_logits": 28.073627471923828,
750
+ "debug/policy_rejected_logps": -437.855224609375,
751
+ "debug/reference_chosen_logps": -425.2474060058594,
752
+ "debug/reference_rejected_logps": -429.3760681152344,
753
+ "epoch": 0.75,
754
+ "grad_norm": 5.074316911246501,
755
+ "learning_rate": 1e-06,
756
+ "logits/chosen": 27.627330780029297,
757
+ "logits/rejected": 28.073627471923828,
758
+ "logps/chosen": -420.744873046875,
759
+ "logps/rejected": -437.855224609375,
760
+ "loss": 0.4719,
761
+ "rewards/accuracies": 0.625,
762
+ "rewards/chosen": 0.04502537101507187,
763
+ "rewards/margins": 0.12981674075126648,
764
+ "rewards/rejected": -0.08479136973619461,
765
+ "step": 36
766
+ },
767
+ {
768
+ "debug/policy_chosen_logits": 26.058317184448242,
769
+ "debug/policy_chosen_logps": -418.7844543457031,
770
+ "debug/policy_rejected_logits": 27.601564407348633,
771
+ "debug/policy_rejected_logps": -430.4002380371094,
772
+ "debug/reference_chosen_logps": -421.88189697265625,
773
+ "debug/reference_rejected_logps": -429.6895446777344,
774
+ "epoch": 0.7708333333333334,
775
+ "grad_norm": 5.110386744264913,
776
+ "learning_rate": 1e-06,
777
+ "logits/chosen": 26.058317184448242,
778
+ "logits/rejected": 27.601564407348633,
779
+ "logps/chosen": -418.7844543457031,
780
+ "logps/rejected": -430.4002380371094,
781
+ "loss": 0.5027,
782
+ "rewards/accuracies": 0.75,
783
+ "rewards/chosen": 0.030974578112363815,
784
+ "rewards/margins": 0.03808154910802841,
785
+ "rewards/rejected": -0.0071069709956645966,
786
+ "step": 37
787
+ },
788
+ {
789
+ "debug/policy_chosen_logits": 32.79731369018555,
790
+ "debug/policy_chosen_logps": -430.58941650390625,
791
+ "debug/policy_rejected_logits": 34.05510330200195,
792
+ "debug/policy_rejected_logps": -450.1502685546875,
793
+ "debug/reference_chosen_logps": -432.3368225097656,
794
+ "debug/reference_rejected_logps": -443.3255615234375,
795
+ "epoch": 0.7916666666666666,
796
+ "grad_norm": 4.973892389169695,
797
+ "learning_rate": 1e-06,
798
+ "logits/chosen": 32.79731369018555,
799
+ "logits/rejected": 34.05510330200195,
800
+ "logps/chosen": -430.58941650390625,
801
+ "logps/rejected": -450.1502685546875,
802
+ "loss": 0.4642,
803
+ "rewards/accuracies": 0.75,
804
+ "rewards/chosen": 0.017473945394158363,
805
+ "rewards/margins": 0.08572139590978622,
806
+ "rewards/rejected": -0.06824745237827301,
807
+ "step": 38
808
+ },
809
+ {
810
+ "debug/policy_chosen_logits": 32.02721405029297,
811
+ "debug/policy_chosen_logps": -420.63739013671875,
812
+ "debug/policy_rejected_logits": 32.61655807495117,
813
+ "debug/policy_rejected_logps": -422.09625244140625,
814
+ "debug/reference_chosen_logps": -425.75048828125,
815
+ "debug/reference_rejected_logps": -412.3810729980469,
816
+ "epoch": 0.8125,
817
+ "grad_norm": 5.161463337049576,
818
+ "learning_rate": 1e-06,
819
+ "logits/chosen": 32.02721405029297,
820
+ "logits/rejected": 32.61655807495117,
821
+ "logps/chosen": -420.63739013671875,
822
+ "logps/rejected": -422.09625244140625,
823
+ "loss": 0.4611,
824
+ "rewards/accuracies": 0.75,
825
+ "rewards/chosen": 0.0511307530105114,
826
+ "rewards/margins": 0.1482827365398407,
827
+ "rewards/rejected": -0.0971519872546196,
828
+ "step": 39
829
+ },
830
+ {
831
+ "debug/policy_chosen_logits": 30.953702926635742,
832
+ "debug/policy_chosen_logps": -415.52679443359375,
833
+ "debug/policy_rejected_logits": 30.896102905273438,
834
+ "debug/policy_rejected_logps": -432.7928161621094,
835
+ "debug/reference_chosen_logps": -418.53839111328125,
836
+ "debug/reference_rejected_logps": -433.76123046875,
837
+ "epoch": 0.8333333333333334,
838
+ "grad_norm": 5.104987202858757,
839
+ "learning_rate": 1e-06,
840
+ "logits/chosen": 30.953702926635742,
841
+ "logits/rejected": 30.896102905273438,
842
+ "logps/chosen": -415.52679443359375,
843
+ "logps/rejected": -432.7928161621094,
844
+ "loss": 0.4728,
845
+ "rewards/accuracies": 0.625,
846
+ "rewards/chosen": 0.03011600486934185,
847
+ "rewards/margins": 0.020431898534297943,
848
+ "rewards/rejected": 0.009684105403721333,
849
+ "step": 40
850
+ },
851
+ {
852
+ "debug/policy_chosen_logits": 31.386384963989258,
853
+ "debug/policy_chosen_logps": -416.5565185546875,
854
+ "debug/policy_rejected_logits": 30.050193786621094,
855
+ "debug/policy_rejected_logps": -419.0783996582031,
856
+ "debug/reference_chosen_logps": -419.27752685546875,
857
+ "debug/reference_rejected_logps": -417.3736267089844,
858
+ "epoch": 0.8541666666666666,
859
+ "grad_norm": 4.8124075843398275,
860
+ "learning_rate": 1e-06,
861
+ "logits/chosen": 31.386384963989258,
862
+ "logits/rejected": 30.050193786621094,
863
+ "logps/chosen": -416.5565185546875,
864
+ "logps/rejected": -419.0783996582031,
865
+ "loss": 0.4822,
866
+ "rewards/accuracies": 0.625,
867
+ "rewards/chosen": 0.027210043743252754,
868
+ "rewards/margins": 0.04425777494907379,
869
+ "rewards/rejected": -0.017047729343175888,
870
+ "step": 41
871
+ },
872
+ {
873
+ "debug/policy_chosen_logits": 27.5343074798584,
874
+ "debug/policy_chosen_logps": -416.6922607421875,
875
+ "debug/policy_rejected_logits": 29.844606399536133,
876
+ "debug/policy_rejected_logps": -427.3104553222656,
877
+ "debug/reference_chosen_logps": -422.4092102050781,
878
+ "debug/reference_rejected_logps": -425.1396484375,
879
+ "epoch": 0.875,
880
+ "grad_norm": 5.155061770516332,
881
+ "learning_rate": 1e-06,
882
+ "logits/chosen": 27.5343074798584,
883
+ "logits/rejected": 29.844606399536133,
884
+ "logps/chosen": -416.6922607421875,
885
+ "logps/rejected": -427.3104553222656,
886
+ "loss": 0.4577,
887
+ "rewards/accuracies": 0.75,
888
+ "rewards/chosen": 0.05716957151889801,
889
+ "rewards/margins": 0.07887779176235199,
890
+ "rewards/rejected": -0.02170822024345398,
891
+ "step": 42
892
+ },
893
+ {
894
+ "debug/policy_chosen_logits": 27.6116943359375,
895
+ "debug/policy_chosen_logps": -421.576416015625,
896
+ "debug/policy_rejected_logits": 26.841312408447266,
897
+ "debug/policy_rejected_logps": -442.26361083984375,
898
+ "debug/reference_chosen_logps": -428.953125,
899
+ "debug/reference_rejected_logps": -441.0887756347656,
900
+ "epoch": 0.8958333333333334,
901
+ "grad_norm": 4.824458757028807,
902
+ "learning_rate": 1e-06,
903
+ "logits/chosen": 27.6116943359375,
904
+ "logits/rejected": 26.841312408447266,
905
+ "logps/chosen": -421.576416015625,
906
+ "logps/rejected": -442.26361083984375,
907
+ "loss": 0.4393,
908
+ "rewards/accuracies": 0.75,
909
+ "rewards/chosen": 0.07376720011234283,
910
+ "rewards/margins": 0.08551543951034546,
911
+ "rewards/rejected": -0.01174823846668005,
912
+ "step": 43
913
+ },
914
+ {
915
+ "debug/policy_chosen_logits": 28.251888275146484,
916
+ "debug/policy_chosen_logps": -413.2276611328125,
917
+ "debug/policy_rejected_logits": 27.731847763061523,
918
+ "debug/policy_rejected_logps": -431.4485168457031,
919
+ "debug/reference_chosen_logps": -421.6210632324219,
920
+ "debug/reference_rejected_logps": -428.9674072265625,
921
+ "epoch": 0.9166666666666666,
922
+ "grad_norm": 4.765787027247756,
923
+ "learning_rate": 1e-06,
924
+ "logits/chosen": 28.251888275146484,
925
+ "logits/rejected": 27.731847763061523,
926
+ "logps/chosen": -413.2276611328125,
927
+ "logps/rejected": -431.4485168457031,
928
+ "loss": 0.4476,
929
+ "rewards/accuracies": 0.75,
930
+ "rewards/chosen": 0.08393420279026031,
931
+ "rewards/margins": 0.10874545574188232,
932
+ "rewards/rejected": -0.02481124736368656,
933
+ "step": 44
934
+ },
935
+ {
936
+ "debug/policy_chosen_logits": 27.165687561035156,
937
+ "debug/policy_chosen_logps": -416.60101318359375,
938
+ "debug/policy_rejected_logits": 26.6442928314209,
939
+ "debug/policy_rejected_logps": -421.559326171875,
940
+ "debug/reference_chosen_logps": -422.03057861328125,
941
+ "debug/reference_rejected_logps": -421.613037109375,
942
+ "epoch": 0.9375,
943
+ "grad_norm": 5.601545923208478,
944
+ "learning_rate": 1e-06,
945
+ "logits/chosen": 27.165687561035156,
946
+ "logits/rejected": 26.6442928314209,
947
+ "logps/chosen": -416.60101318359375,
948
+ "logps/rejected": -421.559326171875,
949
+ "loss": 0.4536,
950
+ "rewards/accuracies": 0.75,
951
+ "rewards/chosen": 0.05429550260305405,
952
+ "rewards/margins": 0.05375823751091957,
953
+ "rewards/rejected": 0.0005372613668441772,
954
+ "step": 45
955
+ },
956
+ {
957
+ "debug/policy_chosen_logits": 30.84551429748535,
958
+ "debug/policy_chosen_logps": -420.135498046875,
959
+ "debug/policy_rejected_logits": 31.129411697387695,
960
+ "debug/policy_rejected_logps": -423.7474365234375,
961
+ "debug/reference_chosen_logps": -425.0477294921875,
962
+ "debug/reference_rejected_logps": -426.3204345703125,
963
+ "epoch": 0.9583333333333334,
964
+ "grad_norm": 5.0077997901080655,
965
+ "learning_rate": 1e-06,
966
+ "logits/chosen": 30.84551429748535,
967
+ "logits/rejected": 31.129411697387695,
968
+ "logps/chosen": -420.135498046875,
969
+ "logps/rejected": -423.7474365234375,
970
+ "loss": 0.4604,
971
+ "rewards/accuracies": 0.625,
972
+ "rewards/chosen": 0.04912235215306282,
973
+ "rewards/margins": 0.02339225634932518,
974
+ "rewards/rejected": 0.025730092078447342,
975
+ "step": 46
976
+ },
977
+ {
978
+ "debug/policy_chosen_logits": 25.565441131591797,
979
+ "debug/policy_chosen_logps": -390.5962829589844,
980
+ "debug/policy_rejected_logits": 31.802610397338867,
981
+ "debug/policy_rejected_logps": -443.27691650390625,
982
+ "debug/reference_chosen_logps": -394.64532470703125,
983
+ "debug/reference_rejected_logps": -439.7212219238281,
984
+ "epoch": 0.9791666666666666,
985
+ "grad_norm": 5.104824777439459,
986
+ "learning_rate": 1e-06,
987
+ "logits/chosen": 25.565441131591797,
988
+ "logits/rejected": 31.802610397338867,
989
+ "logps/chosen": -390.5962829589844,
990
+ "logps/rejected": -443.27691650390625,
991
+ "loss": 0.4627,
992
+ "rewards/accuracies": 0.625,
993
+ "rewards/chosen": 0.04049030318856239,
994
+ "rewards/margins": 0.07604698091745377,
995
+ "rewards/rejected": -0.03555667772889137,
996
+ "step": 47
997
+ },
998
+ {
999
+ "debug/policy_chosen_logits": 30.14870262145996,
1000
+ "debug/policy_chosen_logps": -410.57965087890625,
1001
+ "debug/policy_rejected_logits": 30.24129295349121,
1002
+ "debug/policy_rejected_logps": -411.0328369140625,
1003
+ "debug/reference_chosen_logps": -416.5841979980469,
1004
+ "debug/reference_rejected_logps": -415.076904296875,
1005
+ "epoch": 1.0,
1006
+ "grad_norm": 5.41196633275443,
1007
+ "learning_rate": 1e-06,
1008
+ "logits/chosen": 30.14870262145996,
1009
+ "logits/rejected": 30.24129295349121,
1010
+ "logps/chosen": -410.57965087890625,
1011
+ "logps/rejected": -411.0328369140625,
1012
+ "loss": 0.4479,
1013
+ "rewards/accuracies": 0.375,
1014
+ "rewards/chosen": 0.06004543602466583,
1015
+ "rewards/margins": 0.01960449479520321,
1016
+ "rewards/rejected": 0.040440939366817474,
1017
+ "step": 48
1018
+ },
1019
+ {
1020
+ "epoch": 1.0,
1021
+ "step": 48,
1022
+ "total_flos": 0.0,
1023
+ "train_loss": 0.4809455207238595,
1024
+ "train_runtime": 486.3354,
1025
+ "train_samples_per_second": 6.271,
1026
+ "train_steps_per_second": 0.099
1027
+ }
1028
+ ],
1029
+ "logging_steps": 1,
1030
+ "max_steps": 48,
1031
+ "num_input_tokens_seen": 0,
1032
+ "num_train_epochs": 1,
1033
+ "save_steps": 500,
1034
+ "stateful_callbacks": {
1035
+ "TrainerControl": {
1036
+ "args": {
1037
+ "should_epoch_stop": false,
1038
+ "should_evaluate": false,
1039
+ "should_log": false,
1040
+ "should_save": true,
1041
+ "should_training_stop": true
1042
+ },
1043
+ "attributes": {}
1044
+ }
1045
+ },
1046
+ "total_flos": 0.0,
1047
+ "train_batch_size": 8,
1048
+ "trial_name": null,
1049
+ "trial_params": null
1050
+ }