jiuhai committed on
Commit
21f6f7c
1 Parent(s): c461c51

Model save

Files changed (5)
  1. README.md +18 -19
  2. all_results.json +16 -16
  3. eval_results.json +12 -12
  4. train_results.json +5 -5
  5. trainer_state.json +1896 -506
README.md CHANGED
@@ -1,11 +1,8 @@
1
  ---
2
- license: apache-2.0
3
- base_model: alignment-handbook/zephyr-7b-sft-full
4
  tags:
5
- - alignment-handbook
6
  - generated_from_trainer
7
- datasets:
8
- - HuggingFaceH4/ultrafeedback_binarized
9
  model-index:
10
  - name: zephyr-7b-dpo-lora
11
  results: []
@@ -16,17 +13,17 @@ should probably proofread and complete it, then remove this comment. -->
16
 
17
  # zephyr-7b-dpo-lora
18
 
19
- This model is a fine-tuned version of [alignment-handbook/zephyr-7b-sft-full](https://huggingface.co/alignment-handbook/zephyr-7b-sft-full) on the HuggingFaceH4/ultrafeedback_binarized dataset.
20
  It achieves the following results on the evaluation set:
21
- - Loss: 0.5650
22
- - Rewards/chosen: 0.0816
23
- - Rewards/rejected: -0.2564
24
- - Rewards/accuracies: 0.7695
25
- - Rewards/margins: 0.3380
26
- - Logps/rejected: -175.5244
27
- - Logps/chosen: -271.4002
28
- - Logits/rejected: -3.0699
29
- - Logits/chosen: -3.0344
30
 
31
  ## Model description
32
 
@@ -45,7 +42,7 @@ More information needed
45
  ### Training hyperparameters
46
 
47
  The following hyperparameters were used during training:
48
- - learning_rate: 5e-07
49
  - train_batch_size: 16
50
  - eval_batch_size: 16
51
  - seed: 42
@@ -56,18 +53,20 @@ The following hyperparameters were used during training:
56
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
57
  - lr_scheduler_type: linear
58
  - lr_scheduler_warmup_ratio: 0.1
59
- - num_epochs: 1.0
60
 
61
  ### Training results
62
 
63
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
64
  |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
65
- | 0.482 | 1.0 | 485 | 0.5650 | 0.0816 | -0.2564 | 0.7695 | 0.3380 | -175.5244 | -271.4002 | -3.0699 | -3.0344 |
 
 
66
 
67
 
68
  ### Framework versions
69
 
70
  - Transformers 4.35.0
71
- - Pytorch 2.1.1+cu121
72
  - Datasets 2.14.6
73
  - Tokenizers 0.14.1
 
1
  ---
2
+ license: mit
3
+ base_model: HuggingFaceH4/mistral-7b-sft-beta
4
  tags:
 
5
  - generated_from_trainer
 
 
6
  model-index:
7
  - name: zephyr-7b-dpo-lora
8
  results: []
 
13
 
14
  # zephyr-7b-dpo-lora
15
 
16
+ This model is a fine-tuned version of [HuggingFaceH4/mistral-7b-sft-beta](https://huggingface.co/HuggingFaceH4/mistral-7b-sft-beta) on the None dataset.
17
  It achieves the following results on the evaluation set:
18
+ - Loss: 0.4553
19
+ - Rewards/chosen: -0.5876
20
+ - Rewards/rejected: -2.1911
21
+ - Rewards/accuracies: 0.8359
22
+ - Rewards/margins: 1.6035
23
+ - Logps/rejected: -246.6992
24
+ - Logps/chosen: -279.5245
25
+ - Logits/rejected: -2.8331
26
+ - Logits/chosen: -2.8422
27
 
28
  ## Model description
29
 
 
42
  ### Training hyperparameters
43
 
44
  The following hyperparameters were used during training:
45
+ - learning_rate: 2e-05
46
  - train_batch_size: 16
47
  - eval_batch_size: 16
48
  - seed: 42
 
53
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
54
  - lr_scheduler_type: linear
55
  - lr_scheduler_warmup_ratio: 0.1
56
+ - num_epochs: 3.0
57
 
58
  ### Training results
59
 
60
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
61
  |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
62
+ | 0.4771 | 1.0 | 485 | 0.4617 | -0.0843 | -1.3520 | 0.7891 | 1.2677 | -238.3082 | -274.4911 | -2.8501 | -2.8796 |
63
+ | 0.4124 | 2.0 | 970 | 0.4545 | -0.3551 | -1.7590 | 0.8164 | 1.4038 | -242.3781 | -277.1996 | -2.8563 | -2.8659 |
64
+ | 0.3549 | 3.0 | 1455 | 0.4553 | -0.5876 | -2.1911 | 0.8359 | 1.6035 | -246.6992 | -279.5245 | -2.8331 | -2.8422 |
65
 
66
 
67
  ### Framework versions
68
 
69
  - Transformers 4.35.0
70
+ - Pytorch 2.1.0+cu121
71
  - Datasets 2.14.6
72
  - Tokenizers 0.14.1
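
> Usage sketch (not part of the committed card): the diff above names the base model and the DPO LoRA run but includes no loading example. The snippet below is a minimal sketch that assumes this commit stores a PEFT LoRA adapter and that the repository id is `jiuhai/zephyr-7b-dpo-lora` (inferred from this page, not confirmed by the diff); it attaches the adapter to the base model listed in the card and runs a short generation.

```python
# Minimal usage sketch -- repo id and adapter layout are assumptions, not confirmed by this commit.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_id = "HuggingFaceH4/mistral-7b-sft-beta"  # base model named in the card
adapter_id = "jiuhai/zephyr-7b-dpo-lora"       # assumed repository id for this adapter

tokenizer = AutoTokenizer.from_pretrained(base_id)
base_model = AutoModelForCausalLM.from_pretrained(base_id, torch_dtype=torch.bfloat16)
model = PeftModel.from_pretrained(base_model, adapter_id)  # attach the DPO LoRA weights

prompt = "Explain direct preference optimization in one sentence."
inputs = tokenizer(prompt, return_tensors="pt")
output_ids = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```

> If the commit instead contains fully merged weights rather than an adapter, loading the repository id directly with `AutoModelForCausalLM.from_pretrained` would be the simpler path.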
all_results.json CHANGED
@@ -1,21 +1,21 @@
1
  {
2
- "epoch": 1.0,
3
- "eval_logits/chosen": -3.034407377243042,
4
- "eval_logits/rejected": -3.069913864135742,
5
- "eval_logps/chosen": -271.40020751953125,
6
- "eval_logps/rejected": -175.5244140625,
7
- "eval_loss": 0.5650191903114319,
8
- "eval_rewards/accuracies": 0.76953125,
9
- "eval_rewards/chosen": 0.08157022297382355,
10
- "eval_rewards/margins": 0.33799096941947937,
11
- "eval_rewards/rejected": -0.25642073154449463,
12
- "eval_runtime": 254.1285,
13
  "eval_samples": 2000,
14
- "eval_samples_per_second": 7.87,
15
  "eval_steps_per_second": 0.063,
16
- "train_loss": 0.5539181610972611,
17
- "train_runtime": 15602.6148,
18
- "train_samples": 62064,
19
- "train_samples_per_second": 3.978,
20
  "train_steps_per_second": 0.031
21
  }
 
1
  {
2
+ "epoch": 3.0,
3
+ "eval_logits/chosen": -2.842160701751709,
4
+ "eval_logits/rejected": -2.833141326904297,
5
+ "eval_logps/chosen": -279.5245056152344,
6
+ "eval_logps/rejected": -246.69915771484375,
7
+ "eval_loss": 0.45531293749809265,
8
+ "eval_rewards/accuracies": 0.8359375,
9
+ "eval_rewards/chosen": -0.5876308083534241,
10
+ "eval_rewards/margins": 1.6034575700759888,
11
+ "eval_rewards/rejected": -2.1910881996154785,
12
+ "eval_runtime": 252.3832,
13
  "eval_samples": 2000,
14
+ "eval_samples_per_second": 7.924,
15
  "eval_steps_per_second": 0.063,
16
+ "train_loss": 0.43281792414557074,
17
+ "train_runtime": 46468.4841,
18
+ "train_samples": 61966,
19
+ "train_samples_per_second": 4.001,
20
  "train_steps_per_second": 0.031
21
  }
eval_results.json CHANGED
@@ -1,16 +1,16 @@
1
  {
2
- "epoch": 1.0,
3
- "eval_logits/chosen": -3.034407377243042,
4
- "eval_logits/rejected": -3.069913864135742,
5
- "eval_logps/chosen": -271.40020751953125,
6
- "eval_logps/rejected": -175.5244140625,
7
- "eval_loss": 0.5650191903114319,
8
- "eval_rewards/accuracies": 0.76953125,
9
- "eval_rewards/chosen": 0.08157022297382355,
10
- "eval_rewards/margins": 0.33799096941947937,
11
- "eval_rewards/rejected": -0.25642073154449463,
12
- "eval_runtime": 254.1285,
13
  "eval_samples": 2000,
14
- "eval_samples_per_second": 7.87,
15
  "eval_steps_per_second": 0.063
16
  }
 
1
  {
2
+ "epoch": 3.0,
3
+ "eval_logits/chosen": -2.842160701751709,
4
+ "eval_logits/rejected": -2.833141326904297,
5
+ "eval_logps/chosen": -279.5245056152344,
6
+ "eval_logps/rejected": -246.69915771484375,
7
+ "eval_loss": 0.45531293749809265,
8
+ "eval_rewards/accuracies": 0.8359375,
9
+ "eval_rewards/chosen": -0.5876308083534241,
10
+ "eval_rewards/margins": 1.6034575700759888,
11
+ "eval_rewards/rejected": -2.1910881996154785,
12
+ "eval_runtime": 252.3832,
13
  "eval_samples": 2000,
14
+ "eval_samples_per_second": 7.924,
15
  "eval_steps_per_second": 0.063
16
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 1.0,
3
- "train_loss": 0.5539181610972611,
4
- "train_runtime": 15602.6148,
5
- "train_samples": 62064,
6
- "train_samples_per_second": 3.978,
7
  "train_steps_per_second": 0.031
8
  }
 
1
  {
2
+ "epoch": 3.0,
3
+ "train_loss": 0.43281792414557074,
4
+ "train_runtime": 46468.4841,
5
+ "train_samples": 61966,
6
+ "train_samples_per_second": 4.001,
7
  "train_steps_per_second": 0.031
8
  }
trainer_state.json CHANGED
@@ -1,20 +1,20 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.0,
5
  "eval_steps": 100,
6
- "global_step": 485,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
- "learning_rate": 1.020408163265306e-08,
14
- "logits/chosen": -3.094454526901245,
15
- "logits/rejected": -3.0498220920562744,
16
- "logps/chosen": -242.99183654785156,
17
- "logps/rejected": -74.66817474365234,
18
  "loss": 0.6931,
19
  "rewards/accuracies": 0.0,
20
  "rewards/chosen": 0.0,
@@ -24,705 +24,2095 @@
24
  },
25
  {
26
  "epoch": 0.02,
27
- "learning_rate": 1.0204081632653061e-07,
28
- "logits/chosen": -3.032047986984253,
29
- "logits/rejected": -3.029446840286255,
30
- "logps/chosen": -290.1824645996094,
31
- "logps/rejected": -75.82839965820312,
32
- "loss": 0.6935,
33
- "rewards/accuracies": 0.4027777910232544,
34
- "rewards/chosen": -0.007104851305484772,
35
- "rewards/margins": -0.0044839149340987206,
36
- "rewards/rejected": -0.0026209354400634766,
37
  "step": 10
38
  },
39
  {
40
  "epoch": 0.04,
41
- "learning_rate": 2.0408163265306121e-07,
42
- "logits/chosen": -2.9773757457733154,
43
- "logits/rejected": -2.967517852783203,
44
- "logps/chosen": -297.57342529296875,
45
- "logps/rejected": -77.62318420410156,
46
- "loss": 0.692,
47
- "rewards/accuracies": 0.5625,
48
- "rewards/chosen": 0.00020697650325018913,
49
- "rewards/margins": 0.003021990181878209,
50
- "rewards/rejected": -0.0028150142170488834,
51
  "step": 20
52
  },
53
  {
54
  "epoch": 0.06,
55
- "learning_rate": 3.0612244897959183e-07,
56
- "logits/chosen": -2.983607769012451,
57
- "logits/rejected": -2.9363152980804443,
58
- "logps/chosen": -288.51458740234375,
59
- "logps/rejected": -75.65086364746094,
60
- "loss": 0.6892,
61
- "rewards/accuracies": 0.5687500238418579,
62
- "rewards/chosen": -0.0037677965592592955,
63
- "rewards/margins": 0.004846884869039059,
64
- "rewards/rejected": -0.008614679798483849,
65
  "step": 30
66
  },
67
  {
68
  "epoch": 0.08,
69
- "learning_rate": 4.0816326530612243e-07,
70
- "logits/chosen": -3.0467514991760254,
71
- "logits/rejected": -3.010239362716675,
72
- "logps/chosen": -243.7971954345703,
73
- "logps/rejected": -81.06056213378906,
74
- "loss": 0.685,
75
- "rewards/accuracies": 0.6499999761581421,
76
- "rewards/chosen": 0.0063628097996115685,
77
- "rewards/margins": 0.02118637040257454,
78
- "rewards/rejected": -0.014823561534285545,
79
  "step": 40
80
  },
81
  {
82
  "epoch": 0.1,
83
- "learning_rate": 4.988532110091743e-07,
84
- "logits/chosen": -3.0095317363739014,
85
- "logits/rejected": -3.0367846488952637,
86
- "logps/chosen": -251.5819854736328,
87
- "logps/rejected": -78.19547271728516,
88
- "loss": 0.6784,
89
- "rewards/accuracies": 0.6499999761581421,
90
- "rewards/chosen": 0.005416669882833958,
91
- "rewards/margins": 0.023932188749313354,
92
- "rewards/rejected": -0.018515516072511673,
93
  "step": 50
94
  },
95
  {
96
  "epoch": 0.12,
97
- "learning_rate": 4.873853211009174e-07,
98
- "logits/chosen": -3.0116028785705566,
99
- "logits/rejected": -3.0300631523132324,
100
- "logps/chosen": -281.01361083984375,
101
- "logps/rejected": -75.49365997314453,
102
- "loss": 0.6715,
103
- "rewards/accuracies": 0.8125,
104
- "rewards/chosen": 0.015385298058390617,
105
- "rewards/margins": 0.050571341067552567,
106
- "rewards/rejected": -0.0351860448718071,
107
  "step": 60
108
  },
109
  {
110
  "epoch": 0.14,
111
- "learning_rate": 4.7591743119266054e-07,
112
- "logits/chosen": -3.0327250957489014,
113
- "logits/rejected": -3.0184121131896973,
114
- "logps/chosen": -262.8722229003906,
115
- "logps/rejected": -71.65990447998047,
116
- "loss": 0.6649,
117
- "rewards/accuracies": 0.831250011920929,
118
- "rewards/chosen": 0.016824517399072647,
119
- "rewards/margins": 0.06025807186961174,
120
- "rewards/rejected": -0.043433547019958496,
121
  "step": 70
122
  },
123
  {
124
  "epoch": 0.16,
125
- "learning_rate": 4.644495412844037e-07,
126
- "logits/chosen": -3.0364532470703125,
127
- "logits/rejected": -2.988002300262451,
128
- "logps/chosen": -254.49423217773438,
129
- "logps/rejected": -70.27412414550781,
130
- "loss": 0.6556,
131
- "rewards/accuracies": 0.8500000238418579,
132
- "rewards/chosen": 0.022701723501086235,
133
- "rewards/margins": 0.07623252272605896,
134
- "rewards/rejected": -0.05353079363703728,
135
  "step": 80
136
  },
137
  {
138
  "epoch": 0.19,
139
- "learning_rate": 4.5298165137614677e-07,
140
- "logits/chosen": -3.068497657775879,
141
- "logits/rejected": -3.0402565002441406,
142
- "logps/chosen": -266.61614990234375,
143
- "logps/rejected": -81.87393951416016,
144
- "loss": 0.6455,
145
- "rewards/accuracies": 0.8687499761581421,
146
- "rewards/chosen": 0.026070792227983475,
147
- "rewards/margins": 0.10358123481273651,
148
- "rewards/rejected": -0.07751044631004333,
149
  "step": 90
150
  },
151
  {
152
  "epoch": 0.21,
153
- "learning_rate": 4.4151376146788986e-07,
154
- "logits/chosen": -3.0521655082702637,
155
- "logits/rejected": -3.057821750640869,
156
- "logps/chosen": -286.0577087402344,
157
- "logps/rejected": -77.96414947509766,
158
- "loss": 0.6336,
159
- "rewards/accuracies": 0.949999988079071,
160
- "rewards/chosen": 0.033475782722234726,
161
- "rewards/margins": 0.14013811945915222,
162
- "rewards/rejected": -0.10666234791278839,
163
  "step": 100
164
  },
165
  {
166
  "epoch": 0.23,
167
- "learning_rate": 4.30045871559633e-07,
168
- "logits/chosen": -3.003532886505127,
169
- "logits/rejected": -2.995978355407715,
170
- "logps/chosen": -276.5457458496094,
171
- "logps/rejected": -80.02079010009766,
172
- "loss": 0.6234,
173
- "rewards/accuracies": 0.9375,
174
- "rewards/chosen": 0.0331401564180851,
175
- "rewards/margins": 0.14480046927928925,
176
- "rewards/rejected": -0.11166031658649445,
177
  "step": 110
178
  },
179
  {
180
  "epoch": 0.25,
181
- "learning_rate": 4.1857798165137613e-07,
182
- "logits/chosen": -3.0330376625061035,
183
- "logits/rejected": -3.030214548110962,
184
- "logps/chosen": -276.41632080078125,
185
- "logps/rejected": -77.67643737792969,
186
- "loss": 0.6164,
187
- "rewards/accuracies": 0.9375,
188
- "rewards/chosen": 0.043682295829057693,
189
- "rewards/margins": 0.177944153547287,
190
- "rewards/rejected": -0.1342618763446808,
191
  "step": 120
192
  },
193
  {
194
  "epoch": 0.27,
195
- "learning_rate": 4.071100917431192e-07,
196
- "logits/chosen": -2.9754703044891357,
197
- "logits/rejected": -2.9898681640625,
198
- "logps/chosen": -283.3277587890625,
199
- "logps/rejected": -83.87138366699219,
200
- "loss": 0.6121,
201
- "rewards/accuracies": 0.9312499761581421,
202
- "rewards/chosen": 0.048630841076374054,
203
- "rewards/margins": 0.19439519941806793,
204
- "rewards/rejected": -0.14576435089111328,
205
  "step": 130
206
  },
207
  {
208
  "epoch": 0.29,
209
- "learning_rate": 3.9564220183486236e-07,
210
- "logits/chosen": -3.0477757453918457,
211
- "logits/rejected": -3.0237550735473633,
212
- "logps/chosen": -291.98065185546875,
213
- "logps/rejected": -82.53144073486328,
214
- "loss": 0.5997,
215
- "rewards/accuracies": 0.925000011920929,
216
- "rewards/chosen": 0.034745730459690094,
217
- "rewards/margins": 0.20989501476287842,
218
- "rewards/rejected": -0.17514929175376892,
219
  "step": 140
220
  },
221
  {
222
  "epoch": 0.31,
223
- "learning_rate": 3.841743119266055e-07,
224
- "logits/chosen": -3.033001661300659,
225
- "logits/rejected": -3.015845775604248,
226
- "logps/chosen": -289.15582275390625,
227
- "logps/rejected": -76.08447265625,
228
- "loss": 0.5925,
229
- "rewards/accuracies": 0.9437500238418579,
230
- "rewards/chosen": 0.0425817035138607,
231
- "rewards/margins": 0.21189098060131073,
232
- "rewards/rejected": -0.16930925846099854,
233
  "step": 150
234
  },
235
  {
236
  "epoch": 0.33,
237
- "learning_rate": 3.7270642201834864e-07,
238
- "logits/chosen": -3.0720551013946533,
239
- "logits/rejected": -3.0518932342529297,
240
- "logps/chosen": -271.08258056640625,
241
- "logps/rejected": -75.97576141357422,
242
- "loss": 0.5874,
243
- "rewards/accuracies": 0.956250011920929,
244
- "rewards/chosen": 0.03000471368432045,
245
- "rewards/margins": 0.20934228599071503,
246
- "rewards/rejected": -0.17933759093284607,
247
  "step": 160
248
  },
249
  {
250
  "epoch": 0.35,
251
- "learning_rate": 3.612385321100918e-07,
252
- "logits/chosen": -3.026865243911743,
253
- "logits/rejected": -3.030813455581665,
254
- "logps/chosen": -287.5133361816406,
255
- "logps/rejected": -77.84892272949219,
256
- "loss": 0.5811,
257
- "rewards/accuracies": 0.949999988079071,
258
- "rewards/chosen": 0.050167638808488846,
259
- "rewards/margins": 0.24577708542346954,
260
- "rewards/rejected": -0.1956094205379486,
261
  "step": 170
262
  },
263
  {
264
  "epoch": 0.37,
265
- "learning_rate": 3.497706422018348e-07,
266
- "logits/chosen": -3.064037322998047,
267
- "logits/rejected": -3.0434131622314453,
268
- "logps/chosen": -270.81378173828125,
269
- "logps/rejected": -78.64222717285156,
270
- "loss": 0.5708,
271
- "rewards/accuracies": 0.9750000238418579,
272
- "rewards/chosen": 0.0572846345603466,
273
- "rewards/margins": 0.27750909328460693,
274
- "rewards/rejected": -0.2202244997024536,
275
  "step": 180
276
  },
277
  {
278
  "epoch": 0.39,
279
- "learning_rate": 3.3830275229357795e-07,
280
- "logits/chosen": -3.0381369590759277,
281
- "logits/rejected": -3.031832456588745,
282
- "logps/chosen": -273.7306823730469,
283
- "logps/rejected": -79.31744384765625,
284
- "loss": 0.5604,
285
- "rewards/accuracies": 0.96875,
286
- "rewards/chosen": 0.05553610250353813,
287
- "rewards/margins": 0.29081013798713684,
288
- "rewards/rejected": -0.2352740317583084,
289
  "step": 190
290
  },
291
  {
292
  "epoch": 0.41,
293
- "learning_rate": 3.268348623853211e-07,
294
- "logits/chosen": -3.036811113357544,
295
- "logits/rejected": -3.0287680625915527,
296
- "logps/chosen": -266.4691467285156,
297
- "logps/rejected": -77.38215637207031,
298
- "loss": 0.5504,
299
- "rewards/accuracies": 0.949999988079071,
300
- "rewards/chosen": 0.08118367195129395,
301
- "rewards/margins": 0.3425747752189636,
302
- "rewards/rejected": -0.2613911032676697,
303
  "step": 200
304
  },
305
  {
306
  "epoch": 0.43,
307
- "learning_rate": 3.1536697247706423e-07,
308
- "logits/chosen": -3.061699867248535,
309
- "logits/rejected": -3.042888641357422,
310
- "logps/chosen": -269.961181640625,
311
- "logps/rejected": -89.21647644042969,
312
- "loss": 0.5501,
313
- "rewards/accuracies": 0.956250011920929,
314
- "rewards/chosen": 0.07142322510480881,
315
- "rewards/margins": 0.3240587115287781,
316
- "rewards/rejected": -0.25263547897338867,
317
  "step": 210
318
  },
319
  {
320
  "epoch": 0.45,
321
- "learning_rate": 3.038990825688073e-07,
322
- "logits/chosen": -3.04771089553833,
323
- "logits/rejected": -3.018721103668213,
324
- "logps/chosen": -250.44091796875,
325
- "logps/rejected": -72.33317565917969,
326
- "loss": 0.5488,
327
- "rewards/accuracies": 0.9624999761581421,
328
- "rewards/chosen": 0.06637217104434967,
329
- "rewards/margins": 0.3276647627353668,
330
- "rewards/rejected": -0.26129260659217834,
331
  "step": 220
332
  },
333
  {
334
  "epoch": 0.47,
335
- "learning_rate": 2.9243119266055045e-07,
336
- "logits/chosen": -2.9626972675323486,
337
- "logits/rejected": -2.9827158451080322,
338
- "logps/chosen": -293.9212646484375,
339
- "logps/rejected": -72.2821044921875,
340
- "loss": 0.5313,
341
- "rewards/accuracies": 0.981249988079071,
342
- "rewards/chosen": 0.08349540829658508,
343
- "rewards/margins": 0.3892216682434082,
344
- "rewards/rejected": -0.30572623014450073,
345
  "step": 230
346
  },
347
  {
348
  "epoch": 0.49,
349
- "learning_rate": 2.809633027522936e-07,
350
- "logits/chosen": -3.034790277481079,
351
- "logits/rejected": -3.016634225845337,
352
- "logps/chosen": -280.6105651855469,
353
- "logps/rejected": -76.09197235107422,
354
- "loss": 0.5333,
355
- "rewards/accuracies": 0.9375,
356
- "rewards/chosen": 0.08378176391124725,
357
- "rewards/margins": 0.4068339467048645,
358
- "rewards/rejected": -0.32305219769477844,
359
  "step": 240
360
  },
361
  {
362
  "epoch": 0.52,
363
- "learning_rate": 2.6949541284403673e-07,
364
- "logits/chosen": -3.0789849758148193,
365
- "logits/rejected": -3.0785841941833496,
366
- "logps/chosen": -264.5536804199219,
367
- "logps/rejected": -82.22047424316406,
368
- "loss": 0.5282,
369
- "rewards/accuracies": 0.9624999761581421,
370
- "rewards/chosen": 0.06328760087490082,
371
- "rewards/margins": 0.40200409293174744,
372
- "rewards/rejected": -0.3387165069580078,
373
  "step": 250
374
  },
375
  {
376
  "epoch": 0.54,
377
- "learning_rate": 2.5802752293577976e-07,
378
- "logits/chosen": -2.9741625785827637,
379
- "logits/rejected": -2.9866743087768555,
380
- "logps/chosen": -282.30902099609375,
381
- "logps/rejected": -70.76858520507812,
382
- "loss": 0.5277,
383
- "rewards/accuracies": 0.9312499761581421,
384
- "rewards/chosen": 0.10191468149423599,
385
- "rewards/margins": 0.39590951800346375,
386
- "rewards/rejected": -0.29399481415748596,
387
  "step": 260
388
  },
389
  {
390
  "epoch": 0.56,
391
- "learning_rate": 2.465596330275229e-07,
392
- "logits/chosen": -3.032557964324951,
393
- "logits/rejected": -3.03240704536438,
394
- "logps/chosen": -274.0851135253906,
395
- "logps/rejected": -86.98384094238281,
396
- "loss": 0.5135,
397
- "rewards/accuracies": 0.9375,
398
- "rewards/chosen": 0.07479412853717804,
399
- "rewards/margins": 0.4109489321708679,
400
- "rewards/rejected": -0.3361548185348511,
401
  "step": 270
402
  },
403
  {
404
  "epoch": 0.58,
405
- "learning_rate": 2.3509174311926604e-07,
406
- "logits/chosen": -3.060285806655884,
407
- "logits/rejected": -2.9775302410125732,
408
- "logps/chosen": -253.785888671875,
409
- "logps/rejected": -70.39444732666016,
410
- "loss": 0.5183,
411
- "rewards/accuracies": 0.9624999761581421,
412
- "rewards/chosen": 0.07235217839479446,
413
- "rewards/margins": 0.3860532343387604,
414
- "rewards/rejected": -0.31370100378990173,
415
  "step": 280
416
  },
417
  {
418
  "epoch": 0.6,
419
- "learning_rate": 2.2362385321100916e-07,
420
- "logits/chosen": -3.029343843460083,
421
- "logits/rejected": -3.0406129360198975,
422
- "logps/chosen": -276.57196044921875,
423
- "logps/rejected": -84.54597473144531,
424
- "loss": 0.5107,
425
- "rewards/accuracies": 0.9437500238418579,
426
- "rewards/chosen": 0.08857797086238861,
427
- "rewards/margins": 0.4803849756717682,
428
- "rewards/rejected": -0.3918069899082184,
429
  "step": 290
430
  },
431
  {
432
  "epoch": 0.62,
433
- "learning_rate": 2.121559633027523e-07,
434
- "logits/chosen": -2.9938578605651855,
435
- "logits/rejected": -2.9954426288604736,
436
- "logps/chosen": -273.7822265625,
437
- "logps/rejected": -77.98421478271484,
438
- "loss": 0.5079,
439
- "rewards/accuracies": 0.956250011920929,
440
- "rewards/chosen": 0.08799968659877777,
441
- "rewards/margins": 0.40502768754959106,
442
- "rewards/rejected": -0.3170279860496521,
443
  "step": 300
444
  },
445
  {
446
  "epoch": 0.64,
447
- "learning_rate": 2.0068807339449538e-07,
448
- "logits/chosen": -3.052614212036133,
449
- "logits/rejected": -3.0461201667785645,
450
- "logps/chosen": -281.28814697265625,
451
- "logps/rejected": -81.84606170654297,
452
- "loss": 0.5038,
453
- "rewards/accuracies": 0.956250011920929,
454
- "rewards/chosen": 0.05326849967241287,
455
- "rewards/margins": 0.46244749426841736,
456
- "rewards/rejected": -0.4091789722442627,
457
  "step": 310
458
  },
459
  {
460
  "epoch": 0.66,
461
- "learning_rate": 1.8922018348623852e-07,
462
- "logits/chosen": -3.031501054763794,
463
- "logits/rejected": -3.042961597442627,
464
- "logps/chosen": -271.274658203125,
465
- "logps/rejected": -87.3827133178711,
466
- "loss": 0.5003,
467
- "rewards/accuracies": 0.9624999761581421,
468
- "rewards/chosen": 0.07084844261407852,
469
- "rewards/margins": 0.445441871881485,
470
- "rewards/rejected": -0.37459343671798706,
471
  "step": 320
472
  },
473
  {
474
  "epoch": 0.68,
475
- "learning_rate": 1.7775229357798163e-07,
476
- "logits/chosen": -3.0476019382476807,
477
- "logits/rejected": -3.0447893142700195,
478
- "logps/chosen": -249.735595703125,
479
- "logps/rejected": -73.10395812988281,
480
- "loss": 0.4976,
481
- "rewards/accuracies": 0.925000011920929,
482
- "rewards/chosen": 0.06198754906654358,
483
- "rewards/margins": 0.43834322690963745,
484
- "rewards/rejected": -0.37635567784309387,
485
  "step": 330
486
  },
487
  {
488
  "epoch": 0.7,
489
- "learning_rate": 1.6628440366972477e-07,
490
- "logits/chosen": -3.055901288986206,
491
- "logits/rejected": -3.0517029762268066,
492
- "logps/chosen": -273.3477478027344,
493
- "logps/rejected": -85.53290557861328,
494
- "loss": 0.496,
495
- "rewards/accuracies": 0.987500011920929,
496
- "rewards/chosen": 0.08338963240385056,
497
- "rewards/margins": 0.5042273998260498,
498
- "rewards/rejected": -0.42083778977394104,
499
  "step": 340
500
  },
501
  {
502
  "epoch": 0.72,
503
- "learning_rate": 1.5481651376146786e-07,
504
- "logits/chosen": -3.063744306564331,
505
- "logits/rejected": -3.066366195678711,
506
- "logps/chosen": -277.1488952636719,
507
- "logps/rejected": -88.2572250366211,
508
- "loss": 0.4931,
509
- "rewards/accuracies": 0.987500011920929,
510
- "rewards/chosen": 0.07289155572652817,
511
- "rewards/margins": 0.5126849412918091,
512
- "rewards/rejected": -0.4397934079170227,
513
  "step": 350
514
  },
515
  {
516
  "epoch": 0.74,
517
- "learning_rate": 1.43348623853211e-07,
518
- "logits/chosen": -3.0237436294555664,
519
- "logits/rejected": -3.0258359909057617,
520
- "logps/chosen": -292.0096740722656,
521
- "logps/rejected": -81.93167114257812,
522
- "loss": 0.4951,
523
- "rewards/accuracies": 0.9937499761581421,
524
- "rewards/chosen": 0.07367613166570663,
525
- "rewards/margins": 0.49797001481056213,
526
- "rewards/rejected": -0.4242939352989197,
527
  "step": 360
528
  },
529
  {
530
  "epoch": 0.76,
531
- "learning_rate": 1.318807339449541e-07,
532
- "logits/chosen": -2.9882092475891113,
533
- "logits/rejected": -2.9637956619262695,
534
- "logps/chosen": -274.551513671875,
535
- "logps/rejected": -73.8973388671875,
536
- "loss": 0.496,
537
- "rewards/accuracies": 0.949999988079071,
538
- "rewards/chosen": 0.0880483016371727,
539
- "rewards/margins": 0.49274787306785583,
540
- "rewards/rejected": -0.4046996533870697,
541
  "step": 370
542
  },
543
  {
544
  "epoch": 0.78,
545
- "learning_rate": 1.2041284403669725e-07,
546
- "logits/chosen": -3.070621967315674,
547
- "logits/rejected": -3.0683789253234863,
548
- "logps/chosen": -266.607177734375,
549
- "logps/rejected": -81.02775573730469,
550
- "loss": 0.493,
551
- "rewards/accuracies": 0.9750000238418579,
552
- "rewards/chosen": 0.10891600698232651,
553
- "rewards/margins": 0.5303564071655273,
554
- "rewards/rejected": -0.42144036293029785,
555
  "step": 380
556
  },
557
  {
558
  "epoch": 0.8,
559
- "learning_rate": 1.0894495412844036e-07,
560
- "logits/chosen": -3.0497114658355713,
561
- "logits/rejected": -3.053192615509033,
562
- "logps/chosen": -280.43218994140625,
563
- "logps/rejected": -80.42735290527344,
564
- "loss": 0.4892,
565
- "rewards/accuracies": 0.9375,
566
- "rewards/chosen": 0.10893626511096954,
567
- "rewards/margins": 0.5605167746543884,
568
- "rewards/rejected": -0.4515805244445801,
569
  "step": 390
570
  },
571
  {
572
  "epoch": 0.82,
573
- "learning_rate": 9.747706422018348e-08,
574
- "logits/chosen": -3.002933979034424,
575
- "logits/rejected": -3.0063657760620117,
576
- "logps/chosen": -241.24276733398438,
577
- "logps/rejected": -75.92924499511719,
578
- "loss": 0.4833,
579
- "rewards/accuracies": 0.956250011920929,
580
- "rewards/chosen": 0.07781459391117096,
581
- "rewards/margins": 0.46425342559814453,
582
- "rewards/rejected": -0.38643890619277954,
583
  "step": 400
584
  },
585
  {
586
  "epoch": 0.85,
587
- "learning_rate": 8.60091743119266e-08,
588
- "logits/chosen": -3.0454163551330566,
589
- "logits/rejected": -3.035583972930908,
590
- "logps/chosen": -264.18585205078125,
591
- "logps/rejected": -78.031982421875,
592
- "loss": 0.4744,
593
- "rewards/accuracies": 0.987500011920929,
594
- "rewards/chosen": 0.09802711009979248,
595
- "rewards/margins": 0.5436574816703796,
596
- "rewards/rejected": -0.44563040137290955,
597
  "step": 410
598
  },
599
  {
600
  "epoch": 0.87,
601
- "learning_rate": 7.454128440366971e-08,
602
- "logits/chosen": -3.0196666717529297,
603
- "logits/rejected": -3.0026302337646484,
604
- "logps/chosen": -272.02630615234375,
605
- "logps/rejected": -82.01240539550781,
606
- "loss": 0.481,
607
- "rewards/accuracies": 0.956250011920929,
608
- "rewards/chosen": 0.08279488980770111,
609
- "rewards/margins": 0.5704164505004883,
610
- "rewards/rejected": -0.48762160539627075,
611
  "step": 420
612
  },
613
  {
614
  "epoch": 0.89,
615
- "learning_rate": 6.307339449541284e-08,
616
- "logits/chosen": -3.0509345531463623,
617
- "logits/rejected": -3.0137345790863037,
618
- "logps/chosen": -262.2018127441406,
619
- "logps/rejected": -77.63418579101562,
620
- "loss": 0.4731,
621
- "rewards/accuracies": 0.9750000238418579,
622
- "rewards/chosen": 0.1073322519659996,
623
- "rewards/margins": 0.5776056051254272,
624
- "rewards/rejected": -0.4702734053134918,
625
  "step": 430
626
  },
627
  {
628
  "epoch": 0.91,
629
- "learning_rate": 5.1605504587155966e-08,
630
- "logits/chosen": -3.0285000801086426,
631
- "logits/rejected": -3.0236475467681885,
632
- "logps/chosen": -266.83599853515625,
633
- "logps/rejected": -77.38362121582031,
634
- "loss": 0.476,
635
- "rewards/accuracies": 0.9437500238418579,
636
- "rewards/chosen": 0.08291526138782501,
637
- "rewards/margins": 0.4984784722328186,
638
- "rewards/rejected": -0.41556310653686523,
639
  "step": 440
640
  },
641
  {
642
  "epoch": 0.93,
643
- "learning_rate": 4.0137614678899086e-08,
644
- "logits/chosen": -3.02640438079834,
645
- "logits/rejected": -3.011373996734619,
646
- "logps/chosen": -295.5868835449219,
647
- "logps/rejected": -80.76414489746094,
648
- "loss": 0.4707,
649
- "rewards/accuracies": 0.96875,
650
- "rewards/chosen": 0.09663239866495132,
651
- "rewards/margins": 0.5815601944923401,
652
- "rewards/rejected": -0.48492780327796936,
653
  "step": 450
654
  },
655
  {
656
  "epoch": 0.95,
657
- "learning_rate": 2.86697247706422e-08,
658
- "logits/chosen": -3.0195059776306152,
659
- "logits/rejected": -2.988323926925659,
660
- "logps/chosen": -300.5026550292969,
661
- "logps/rejected": -86.79838562011719,
662
- "loss": 0.4808,
663
- "rewards/accuracies": 0.96875,
664
- "rewards/chosen": 0.11054690927267075,
665
- "rewards/margins": 0.5899176001548767,
666
- "rewards/rejected": -0.47937074303627014,
667
  "step": 460
668
  },
669
  {
670
  "epoch": 0.97,
671
- "learning_rate": 1.720183486238532e-08,
672
- "logits/chosen": -3.0426931381225586,
673
- "logits/rejected": -3.0394179821014404,
674
- "logps/chosen": -235.52706909179688,
675
- "logps/rejected": -73.9857406616211,
676
- "loss": 0.4819,
677
- "rewards/accuracies": 0.956250011920929,
678
- "rewards/chosen": 0.08785500377416611,
679
- "rewards/margins": 0.5274263620376587,
680
- "rewards/rejected": -0.4395713806152344,
681
  "step": 470
682
  },
683
  {
684
  "epoch": 0.99,
685
- "learning_rate": 5.73394495412844e-09,
686
- "logits/chosen": -3.0092616081237793,
687
- "logits/rejected": -2.972731590270996,
688
- "logps/chosen": -249.88876342773438,
689
- "logps/rejected": -85.80451965332031,
690
- "loss": 0.482,
691
- "rewards/accuracies": 0.96875,
692
- "rewards/chosen": 0.07512323558330536,
693
- "rewards/margins": 0.5230099558830261,
694
- "rewards/rejected": -0.44788676500320435,
695
  "step": 480
696
  },
697
  {
698
  "epoch": 1.0,
699
- "eval_logits/chosen": -3.034407377243042,
700
- "eval_logits/rejected": -3.069913864135742,
701
- "eval_logps/chosen": -271.40020751953125,
702
- "eval_logps/rejected": -175.5244140625,
703
- "eval_loss": 0.5650191903114319,
704
- "eval_rewards/accuracies": 0.76953125,
705
- "eval_rewards/chosen": 0.08157022297382355,
706
- "eval_rewards/margins": 0.33799096941947937,
707
- "eval_rewards/rejected": -0.25642073154449463,
708
- "eval_runtime": 256.4523,
709
- "eval_samples_per_second": 7.799,
710
- "eval_steps_per_second": 0.062,
711
  "step": 485
712
  },
713
  {
714
- "epoch": 1.0,
715
- "step": 485,
 
716
  "total_flos": 0.0,
717
- "train_loss": 0.5539181610972611,
718
- "train_runtime": 15602.6148,
719
- "train_samples_per_second": 3.978,
720
  "train_steps_per_second": 0.031
721
  }
722
  ],
723
  "logging_steps": 10,
724
- "max_steps": 485,
725
- "num_train_epochs": 1,
726
  "save_steps": 500,
727
  "total_flos": 0.0,
728
  "trial_name": null,
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 3.0,
5
  "eval_steps": 100,
6
+ "global_step": 1455,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
+ "learning_rate": 1.36986301369863e-07,
14
+ "logits/chosen": -2.6635093688964844,
15
+ "logits/rejected": -2.7324111461639404,
16
+ "logps/chosen": -135.12002563476562,
17
+ "logps/rejected": -103.28743743896484,
18
  "loss": 0.6931,
19
  "rewards/accuracies": 0.0,
20
  "rewards/chosen": 0.0,
 
24
  },
25
  {
26
  "epoch": 0.02,
27
+ "learning_rate": 1.3698630136986302e-06,
28
+ "logits/chosen": -2.783435583114624,
29
+ "logits/rejected": -2.754120111465454,
30
+ "logps/chosen": -311.785400390625,
31
+ "logps/rejected": -273.2391357421875,
32
+ "loss": 0.6925,
33
+ "rewards/accuracies": 0.4513888955116272,
34
+ "rewards/chosen": -0.005032465327531099,
35
+ "rewards/margins": -0.004827913362532854,
36
+ "rewards/rejected": -0.00020455113553907722,
37
  "step": 10
38
  },
39
  {
40
  "epoch": 0.04,
41
+ "learning_rate": 2.7397260273972604e-06,
42
+ "logits/chosen": -2.8342247009277344,
43
+ "logits/rejected": -2.8470585346221924,
44
+ "logps/chosen": -283.9891662597656,
45
+ "logps/rejected": -250.61019897460938,
46
+ "loss": 0.6861,
47
+ "rewards/accuracies": 0.574999988079071,
48
+ "rewards/chosen": 0.006812130566686392,
49
+ "rewards/margins": 0.014938007108867168,
50
+ "rewards/rejected": -0.008125877007842064,
51
  "step": 20
52
  },
53
  {
54
  "epoch": 0.06,
55
+ "learning_rate": 4.109589041095891e-06,
56
+ "logits/chosen": -2.816066026687622,
57
+ "logits/rejected": -2.8595542907714844,
58
+ "logps/chosen": -282.41351318359375,
59
+ "logps/rejected": -200.7602081298828,
60
+ "loss": 0.6719,
61
+ "rewards/accuracies": 0.643750011920929,
62
+ "rewards/chosen": 0.009923343546688557,
63
+ "rewards/margins": 0.052515141665935516,
64
+ "rewards/rejected": -0.04259180277585983,
65
  "step": 30
66
  },
67
  {
68
  "epoch": 0.08,
69
+ "learning_rate": 5.479452054794521e-06,
70
+ "logits/chosen": -2.8367600440979004,
71
+ "logits/rejected": -2.777651309967041,
72
+ "logps/chosen": -292.46136474609375,
73
+ "logps/rejected": -239.9720001220703,
74
+ "loss": 0.6352,
75
+ "rewards/accuracies": 0.668749988079071,
76
+ "rewards/chosen": 0.041534725576639175,
77
+ "rewards/margins": 0.1540035903453827,
78
+ "rewards/rejected": -0.11246886104345322,
79
  "step": 40
80
  },
81
  {
82
  "epoch": 0.1,
83
+ "learning_rate": 6.849315068493151e-06,
84
+ "logits/chosen": -2.789390802383423,
85
+ "logits/rejected": -2.7937839031219482,
86
+ "logps/chosen": -296.8035583496094,
87
+ "logps/rejected": -220.61669921875,
88
+ "loss": 0.5922,
89
+ "rewards/accuracies": 0.8062499761581421,
90
+ "rewards/chosen": 0.07272736728191376,
91
+ "rewards/margins": 0.3851665258407593,
92
+ "rewards/rejected": -0.3124391436576843,
93
  "step": 50
94
  },
95
  {
96
  "epoch": 0.12,
97
+ "learning_rate": 8.219178082191782e-06,
98
+ "logits/chosen": -2.8027291297912598,
99
+ "logits/rejected": -2.8161094188690186,
100
+ "logps/chosen": -298.8342590332031,
101
+ "logps/rejected": -248.74533081054688,
102
+ "loss": 0.5742,
103
+ "rewards/accuracies": 0.7124999761581421,
104
+ "rewards/chosen": -0.023758064955472946,
105
+ "rewards/margins": 0.40244102478027344,
106
+ "rewards/rejected": -0.4261991083621979,
107
  "step": 60
108
  },
109
  {
110
  "epoch": 0.14,
111
+ "learning_rate": 9.589041095890411e-06,
112
+ "logits/chosen": -2.8271241188049316,
113
+ "logits/rejected": -2.833664894104004,
114
+ "logps/chosen": -282.1733703613281,
115
+ "logps/rejected": -261.71844482421875,
116
+ "loss": 0.5511,
117
+ "rewards/accuracies": 0.7124999761581421,
118
+ "rewards/chosen": -0.05103034898638725,
119
+ "rewards/margins": 0.4532381594181061,
120
+ "rewards/rejected": -0.5042685270309448,
121
  "step": 70
122
  },
123
  {
124
  "epoch": 0.16,
125
+ "learning_rate": 1.0958904109589042e-05,
126
+ "logits/chosen": -2.8094613552093506,
127
+ "logits/rejected": -2.8167314529418945,
128
+ "logps/chosen": -302.01190185546875,
129
+ "logps/rejected": -241.74282836914062,
130
+ "loss": 0.538,
131
+ "rewards/accuracies": 0.8062499761581421,
132
+ "rewards/chosen": -0.05076650530099869,
133
+ "rewards/margins": 0.729707658290863,
134
+ "rewards/rejected": -0.7804741263389587,
135
  "step": 80
136
  },
137
  {
138
  "epoch": 0.19,
139
+ "learning_rate": 1.2328767123287673e-05,
140
+ "logits/chosen": -2.7917096614837646,
141
+ "logits/rejected": -2.8236160278320312,
142
+ "logps/chosen": -255.85238647460938,
143
+ "logps/rejected": -210.2568359375,
144
+ "loss": 0.5137,
145
+ "rewards/accuracies": 0.75,
146
+ "rewards/chosen": -0.12663120031356812,
147
+ "rewards/margins": 0.7104201316833496,
148
+ "rewards/rejected": -0.8370513916015625,
149
  "step": 90
150
  },
151
  {
152
  "epoch": 0.21,
153
+ "learning_rate": 1.3698630136986302e-05,
154
+ "logits/chosen": -2.8499982357025146,
155
+ "logits/rejected": -2.8203647136688232,
156
+ "logps/chosen": -261.9218444824219,
157
+ "logps/rejected": -243.7086639404297,
158
+ "loss": 0.5222,
159
+ "rewards/accuracies": 0.8125,
160
+ "rewards/chosen": -0.13231949508190155,
161
+ "rewards/margins": 0.815433144569397,
162
+ "rewards/rejected": -0.9477526545524597,
163
  "step": 100
164
  },
165
  {
166
  "epoch": 0.23,
167
+ "learning_rate": 1.5068493150684933e-05,
168
+ "logits/chosen": -2.8208096027374268,
169
+ "logits/rejected": -2.8195183277130127,
170
+ "logps/chosen": -278.68377685546875,
171
+ "logps/rejected": -239.5231170654297,
172
+ "loss": 0.4973,
173
+ "rewards/accuracies": 0.7562500238418579,
174
+ "rewards/chosen": -0.1940581500530243,
175
+ "rewards/margins": 0.9298496246337891,
176
+ "rewards/rejected": -1.1239076852798462,
177
  "step": 110
178
  },
179
  {
180
  "epoch": 0.25,
181
+ "learning_rate": 1.6438356164383563e-05,
182
+ "logits/chosen": -2.8535244464874268,
183
+ "logits/rejected": -2.8457090854644775,
184
+ "logps/chosen": -299.8865661621094,
185
+ "logps/rejected": -211.28964233398438,
186
+ "loss": 0.503,
187
+ "rewards/accuracies": 0.762499988079071,
188
+ "rewards/chosen": -0.1717432737350464,
189
+ "rewards/margins": 0.9522634744644165,
190
+ "rewards/rejected": -1.124006748199463,
191
  "step": 120
192
  },
193
  {
194
  "epoch": 0.27,
195
+ "learning_rate": 1.7808219178082194e-05,
196
+ "logits/chosen": -2.806631565093994,
197
+ "logits/rejected": -2.8321399688720703,
198
+ "logps/chosen": -273.78619384765625,
199
+ "logps/rejected": -232.2666015625,
200
+ "loss": 0.5137,
201
+ "rewards/accuracies": 0.7875000238418579,
202
+ "rewards/chosen": -0.3173758387565613,
203
+ "rewards/margins": 0.8668048977851868,
204
+ "rewards/rejected": -1.1841806173324585,
205
  "step": 130
206
  },
207
  {
208
  "epoch": 0.29,
209
+ "learning_rate": 1.9178082191780822e-05,
210
+ "logits/chosen": -2.8575327396392822,
211
+ "logits/rejected": -2.8858590126037598,
212
+ "logps/chosen": -291.01837158203125,
213
+ "logps/rejected": -233.05001831054688,
214
+ "loss": 0.528,
215
+ "rewards/accuracies": 0.706250011920929,
216
+ "rewards/chosen": -0.21833661198616028,
217
+ "rewards/margins": 0.7500525712966919,
218
+ "rewards/rejected": -0.9683893322944641,
219
  "step": 140
220
  },
221
  {
222
  "epoch": 0.31,
223
+ "learning_rate": 1.9938884644767e-05,
224
+ "logits/chosen": -2.8324332237243652,
225
+ "logits/rejected": -2.8657970428466797,
226
+ "logps/chosen": -263.90875244140625,
227
+ "logps/rejected": -248.7552490234375,
228
+ "loss": 0.5066,
229
+ "rewards/accuracies": 0.6812499761581421,
230
+ "rewards/chosen": -0.226444810628891,
231
+ "rewards/margins": 0.6770893335342407,
232
+ "rewards/rejected": -0.9035340547561646,
233
  "step": 150
234
  },
235
  {
236
  "epoch": 0.33,
237
+ "learning_rate": 1.9786096256684494e-05,
238
+ "logits/chosen": -2.85012149810791,
239
+ "logits/rejected": -2.920163869857788,
240
+ "logps/chosen": -313.2906494140625,
241
+ "logps/rejected": -240.2455596923828,
242
+ "loss": 0.514,
243
+ "rewards/accuracies": 0.75,
244
+ "rewards/chosen": -0.15915456414222717,
245
+ "rewards/margins": 0.9070942997932434,
246
+ "rewards/rejected": -1.066248893737793,
247
  "step": 160
248
  },
249
  {
250
  "epoch": 0.35,
251
+ "learning_rate": 1.9633307868601987e-05,
252
+ "logits/chosen": -2.8070321083068848,
253
+ "logits/rejected": -2.8099472522735596,
254
+ "logps/chosen": -302.35736083984375,
255
+ "logps/rejected": -241.8223419189453,
256
+ "loss": 0.5346,
257
+ "rewards/accuracies": 0.6937500238418579,
258
+ "rewards/chosen": -0.07760269939899445,
259
+ "rewards/margins": 0.7661724090576172,
260
+ "rewards/rejected": -0.8437750935554504,
261
  "step": 170
262
  },
263
  {
264
  "epoch": 0.37,
265
+ "learning_rate": 1.9480519480519483e-05,
266
+ "logits/chosen": -2.7994544506073,
267
+ "logits/rejected": -2.8084394931793213,
268
+ "logps/chosen": -302.2978820800781,
269
+ "logps/rejected": -248.54586791992188,
270
+ "loss": 0.4909,
271
+ "rewards/accuracies": 0.7562500238418579,
272
+ "rewards/chosen": -0.09264491498470306,
273
+ "rewards/margins": 0.8108696937561035,
274
+ "rewards/rejected": -0.9035146832466125,
275
  "step": 180
276
  },
277
  {
278
  "epoch": 0.39,
279
+ "learning_rate": 1.9327731092436976e-05,
280
+ "logits/chosen": -2.867279052734375,
281
+ "logits/rejected": -2.8662352561950684,
282
+ "logps/chosen": -294.74139404296875,
283
+ "logps/rejected": -247.1249542236328,
284
+ "loss": 0.5157,
285
+ "rewards/accuracies": 0.831250011920929,
286
+ "rewards/chosen": -0.023864692077040672,
287
+ "rewards/margins": 1.0177843570709229,
288
+ "rewards/rejected": -1.0416491031646729,
289
  "step": 190
290
  },
291
  {
292
  "epoch": 0.41,
293
+ "learning_rate": 1.9174942704354472e-05,
294
+ "logits/chosen": -2.7849340438842773,
295
+ "logits/rejected": -2.8029227256774902,
296
+ "logps/chosen": -274.12408447265625,
297
+ "logps/rejected": -234.2401885986328,
298
+ "loss": 0.4979,
299
+ "rewards/accuracies": 0.7437499761581421,
300
+ "rewards/chosen": -0.18910327553749084,
301
+ "rewards/margins": 0.9559990763664246,
302
+ "rewards/rejected": -1.1451025009155273,
303
  "step": 200
304
  },
305
  {
306
  "epoch": 0.43,
307
+ "learning_rate": 1.9022154316271965e-05,
308
+ "logits/chosen": -2.886434555053711,
309
+ "logits/rejected": -2.8876872062683105,
310
+ "logps/chosen": -285.2301940917969,
311
+ "logps/rejected": -270.3388977050781,
312
+ "loss": 0.5246,
313
+ "rewards/accuracies": 0.800000011920929,
314
+ "rewards/chosen": -0.0919012501835823,
315
+ "rewards/margins": 1.0152003765106201,
316
+ "rewards/rejected": -1.1071016788482666,
317
  "step": 210
318
  },
319
  {
320
  "epoch": 0.45,
321
+ "learning_rate": 1.8869365928189458e-05,
322
+ "logits/chosen": -2.8153045177459717,
323
+ "logits/rejected": -2.8722925186157227,
324
+ "logps/chosen": -295.7647399902344,
325
+ "logps/rejected": -241.56723022460938,
326
+ "loss": 0.4931,
327
+ "rewards/accuracies": 0.762499988079071,
328
+ "rewards/chosen": -0.16732807457447052,
329
+ "rewards/margins": 0.9812033772468567,
330
+ "rewards/rejected": -1.1485313177108765,
331
  "step": 220
332
  },
333
  {
334
  "epoch": 0.47,
335
+ "learning_rate": 1.8716577540106954e-05,
336
+ "logits/chosen": -2.890263080596924,
337
+ "logits/rejected": -2.870927333831787,
338
+ "logps/chosen": -300.0990905761719,
339
+ "logps/rejected": -243.5838165283203,
340
+ "loss": 0.5021,
341
+ "rewards/accuracies": 0.75,
342
+ "rewards/chosen": -0.1739131659269333,
343
+ "rewards/margins": 0.8733876943588257,
344
+ "rewards/rejected": -1.047300934791565,
345
  "step": 230
346
  },
347
  {
348
  "epoch": 0.49,
349
+ "learning_rate": 1.8563789152024447e-05,
350
+ "logits/chosen": -2.792865753173828,
351
+ "logits/rejected": -2.8081324100494385,
352
+ "logps/chosen": -249.1541748046875,
353
+ "logps/rejected": -221.9134979248047,
354
+ "loss": 0.4763,
355
+ "rewards/accuracies": 0.8062499761581421,
356
+ "rewards/chosen": -0.2561655640602112,
357
+ "rewards/margins": 1.2132002115249634,
358
+ "rewards/rejected": -1.4693658351898193,
359
  "step": 240
360
  },
361
  {
362
  "epoch": 0.52,
363
+ "learning_rate": 1.8411000763941943e-05,
364
+ "logits/chosen": -2.780810594558716,
365
+ "logits/rejected": -2.8007028102874756,
366
+ "logps/chosen": -289.1332702636719,
367
+ "logps/rejected": -259.22491455078125,
368
+ "loss": 0.488,
369
+ "rewards/accuracies": 0.7437499761581421,
370
+ "rewards/chosen": -0.20490197837352753,
371
+ "rewards/margins": 0.9276278614997864,
372
+ "rewards/rejected": -1.132529854774475,
373
  "step": 250
374
  },
375
  {
376
  "epoch": 0.54,
377
+ "learning_rate": 1.8258212375859436e-05,
378
+ "logits/chosen": -2.7935876846313477,
379
+ "logits/rejected": -2.795382022857666,
380
+ "logps/chosen": -300.5179748535156,
381
+ "logps/rejected": -260.3360900878906,
382
+ "loss": 0.4823,
383
+ "rewards/accuracies": 0.75,
384
+ "rewards/chosen": -0.22746071219444275,
385
+ "rewards/margins": 1.0280539989471436,
386
+ "rewards/rejected": -1.2555148601531982,
387
  "step": 260
388
  },
389
  {
390
  "epoch": 0.56,
391
+ "learning_rate": 1.8105423987776932e-05,
392
+ "logits/chosen": -2.8180222511291504,
393
+ "logits/rejected": -2.7675962448120117,
394
+ "logps/chosen": -264.4036560058594,
395
+ "logps/rejected": -242.5054931640625,
396
+ "loss": 0.4994,
397
+ "rewards/accuracies": 0.8062499761581421,
398
+ "rewards/chosen": -0.12344682216644287,
399
+ "rewards/margins": 1.0876801013946533,
400
+ "rewards/rejected": -1.2111269235610962,
401
  "step": 270
402
  },
403
  {
404
  "epoch": 0.58,
405
+ "learning_rate": 1.7952635599694425e-05,
406
+ "logits/chosen": -2.859412908554077,
407
+ "logits/rejected": -2.7997498512268066,
408
+ "logps/chosen": -274.4200744628906,
409
+ "logps/rejected": -253.32235717773438,
410
+ "loss": 0.4921,
411
+ "rewards/accuracies": 0.7437499761581421,
412
+ "rewards/chosen": -0.08681371808052063,
413
+ "rewards/margins": 0.9918006658554077,
414
+ "rewards/rejected": -1.078614354133606,
415
  "step": 280
416
  },
417
  {
418
  "epoch": 0.6,
419
+ "learning_rate": 1.7799847211611917e-05,
420
+ "logits/chosen": -2.8540239334106445,
421
+ "logits/rejected": -2.8323752880096436,
422
+ "logps/chosen": -297.4104309082031,
423
+ "logps/rejected": -258.0458984375,
424
+ "loss": 0.4741,
425
+ "rewards/accuracies": 0.71875,
426
+ "rewards/chosen": -0.1035405844449997,
427
+ "rewards/margins": 1.009918451309204,
428
+ "rewards/rejected": -1.1134589910507202,
429
  "step": 290
430
  },
431
  {
432
  "epoch": 0.62,
433
+ "learning_rate": 1.7647058823529414e-05,
434
+ "logits/chosen": -2.835106372833252,
435
+ "logits/rejected": -2.8273653984069824,
436
+ "logps/chosen": -251.33984375,
437
+ "logps/rejected": -243.59890747070312,
438
+ "loss": 0.4957,
439
+ "rewards/accuracies": 0.7562500238418579,
440
+ "rewards/chosen": -0.1399732530117035,
441
+ "rewards/margins": 0.9917305111885071,
442
+ "rewards/rejected": -1.1317037343978882,
443
  "step": 300
444
  },
445
  {
446
  "epoch": 0.64,
447
+ "learning_rate": 1.7494270435446906e-05,
448
+ "logits/chosen": -2.8415441513061523,
449
+ "logits/rejected": -2.7994866371154785,
450
+ "logps/chosen": -260.0638732910156,
451
+ "logps/rejected": -227.3611602783203,
452
+ "loss": 0.5106,
453
+ "rewards/accuracies": 0.7875000238418579,
454
+ "rewards/chosen": -0.05403571575880051,
455
+ "rewards/margins": 1.2989342212677002,
456
+ "rewards/rejected": -1.3529701232910156,
457
  "step": 310
458
  },
459
  {
460
  "epoch": 0.66,
461
+ "learning_rate": 1.7341482047364403e-05,
462
+ "logits/chosen": -2.870739698410034,
463
+ "logits/rejected": -2.808722496032715,
464
+ "logps/chosen": -252.57870483398438,
465
+ "logps/rejected": -238.3760528564453,
466
+ "loss": 0.4714,
467
+ "rewards/accuracies": 0.7562500238418579,
468
+ "rewards/chosen": -0.3325572907924652,
469
+ "rewards/margins": 1.0449466705322266,
470
+ "rewards/rejected": -1.3775039911270142,
471
  "step": 320
472
  },
473
  {
474
  "epoch": 0.68,
475
+ "learning_rate": 1.7188693659281895e-05,
476
+ "logits/chosen": -2.870849132537842,
477
+ "logits/rejected": -2.8693222999572754,
478
+ "logps/chosen": -296.0721435546875,
479
+ "logps/rejected": -252.4921875,
480
+ "loss": 0.4776,
481
+ "rewards/accuracies": 0.8062499761581421,
482
+ "rewards/chosen": -0.14332377910614014,
483
+ "rewards/margins": 1.1690990924835205,
484
+ "rewards/rejected": -1.312422752380371,
485
  "step": 330
486
  },
487
  {
488
  "epoch": 0.7,
489
+ "learning_rate": 1.703590527119939e-05,
490
+ "logits/chosen": -2.82586669921875,
491
+ "logits/rejected": -2.780189037322998,
492
+ "logps/chosen": -315.26727294921875,
493
+ "logps/rejected": -268.7083740234375,
494
+ "loss": 0.5107,
495
+ "rewards/accuracies": 0.75,
496
+ "rewards/chosen": -0.26427751779556274,
497
+ "rewards/margins": 0.9479537010192871,
498
+ "rewards/rejected": -1.2122312784194946,
499
  "step": 340
500
  },
501
  {
502
  "epoch": 0.72,
503
+ "learning_rate": 1.6883116883116884e-05,
504
+ "logits/chosen": -2.8107895851135254,
505
+ "logits/rejected": -2.8060848712921143,
506
+ "logps/chosen": -242.031494140625,
507
+ "logps/rejected": -227.6324920654297,
508
+ "loss": 0.5167,
509
+ "rewards/accuracies": 0.78125,
510
+ "rewards/chosen": -0.1813223510980606,
511
+ "rewards/margins": 0.9393804669380188,
512
+ "rewards/rejected": -1.1207029819488525,
513
  "step": 350
514
  },
515
  {
516
  "epoch": 0.74,
517
+ "learning_rate": 1.6730328495034377e-05,
518
+ "logits/chosen": -2.8176794052124023,
519
+ "logits/rejected": -2.76552152633667,
520
+ "logps/chosen": -263.55047607421875,
521
+ "logps/rejected": -240.71484375,
522
+ "loss": 0.5146,
523
+ "rewards/accuracies": 0.75,
524
+ "rewards/chosen": -0.1424541026353836,
525
+ "rewards/margins": 0.9593822360038757,
526
+ "rewards/rejected": -1.1018364429473877,
527
  "step": 360
528
  },
529
  {
530
  "epoch": 0.76,
531
+ "learning_rate": 1.6577540106951873e-05,
532
+ "logits/chosen": -2.8674864768981934,
533
+ "logits/rejected": -2.8338735103607178,
534
+ "logps/chosen": -288.48687744140625,
535
+ "logps/rejected": -261.1170349121094,
536
+ "loss": 0.4945,
537
+ "rewards/accuracies": 0.793749988079071,
538
+ "rewards/chosen": -0.24135860800743103,
539
+ "rewards/margins": 0.9816256761550903,
540
+ "rewards/rejected": -1.2229843139648438,
541
  "step": 370
542
  },
543
  {
544
  "epoch": 0.78,
545
+ "learning_rate": 1.6424751718869366e-05,
546
+ "logits/chosen": -2.8083739280700684,
547
+ "logits/rejected": -2.7698397636413574,
548
+ "logps/chosen": -243.06356811523438,
549
+ "logps/rejected": -258.90496826171875,
550
+ "loss": 0.4858,
551
+ "rewards/accuracies": 0.731249988079071,
552
+ "rewards/chosen": -0.4036439061164856,
553
+ "rewards/margins": 1.0016567707061768,
554
+ "rewards/rejected": -1.4053006172180176,
555
  "step": 380
556
  },
557
  {
558
  "epoch": 0.8,
559
+ "learning_rate": 1.6271963330786862e-05,
560
+ "logits/chosen": -2.8200416564941406,
561
+ "logits/rejected": -2.809105634689331,
562
+ "logps/chosen": -317.0741882324219,
563
+ "logps/rejected": -248.5664825439453,
564
+ "loss": 0.4763,
565
+ "rewards/accuracies": 0.793749988079071,
566
+ "rewards/chosen": -0.23517903685569763,
567
+ "rewards/margins": 1.1138803958892822,
568
+ "rewards/rejected": -1.3490597009658813,
569
  "step": 390
570
  },
571
  {
572
  "epoch": 0.82,
573
+ "learning_rate": 1.6119174942704355e-05,
574
+ "logits/chosen": -2.860297679901123,
575
+ "logits/rejected": -2.8406786918640137,
576
+ "logps/chosen": -281.96527099609375,
577
+ "logps/rejected": -261.6462707519531,
578
+ "loss": 0.4637,
579
+ "rewards/accuracies": 0.7562500238418579,
580
+ "rewards/chosen": -0.20879487693309784,
581
+ "rewards/margins": 1.0736795663833618,
582
+ "rewards/rejected": -1.2824745178222656,
583
  "step": 400
584
  },
585
  {
586
  "epoch": 0.85,
587
+ "learning_rate": 1.596638655462185e-05,
588
+ "logits/chosen": -2.8639817237854004,
589
+ "logits/rejected": -2.8506665229797363,
590
+ "logps/chosen": -295.0474853515625,
591
+ "logps/rejected": -243.6042938232422,
592
+ "loss": 0.4449,
593
+ "rewards/accuracies": 0.800000011920929,
594
+ "rewards/chosen": -0.20277197659015656,
595
+ "rewards/margins": 1.3167582750320435,
596
+ "rewards/rejected": -1.519530177116394,
597
  "step": 410
598
  },
599
  {
600
  "epoch": 0.87,
601
+ "learning_rate": 1.5813598166539344e-05,
602
+ "logits/chosen": -2.83903431892395,
603
+ "logits/rejected": -2.8222603797912598,
604
+ "logps/chosen": -279.1824035644531,
605
+ "logps/rejected": -243.2361297607422,
606
+ "loss": 0.4928,
607
+ "rewards/accuracies": 0.75,
608
+ "rewards/chosen": -0.4378887712955475,
609
+ "rewards/margins": 1.1546775102615356,
610
+ "rewards/rejected": -1.5925662517547607,
611
  "step": 420
612
  },
613
  {
614
  "epoch": 0.89,
615
+ "learning_rate": 1.5660809778456837e-05,
616
+ "logits/chosen": -2.8503293991088867,
617
+ "logits/rejected": -2.8215270042419434,
618
+ "logps/chosen": -260.7596130371094,
619
+ "logps/rejected": -272.5841979980469,
620
+ "loss": 0.49,
621
+ "rewards/accuracies": 0.706250011920929,
622
+ "rewards/chosen": -0.2955819368362427,
623
+ "rewards/margins": 0.9733622670173645,
624
+ "rewards/rejected": -1.268944263458252,
625
  "step": 430
626
  },
627
  {
628
  "epoch": 0.91,
629
+ "learning_rate": 1.5508021390374333e-05,
630
+ "logits/chosen": -2.820071220397949,
631
+ "logits/rejected": -2.7907521724700928,
632
+ "logps/chosen": -307.09576416015625,
633
+ "logps/rejected": -249.6613006591797,
634
+ "loss": 0.4867,
635
+ "rewards/accuracies": 0.762499988079071,
636
+ "rewards/chosen": -0.3214530348777771,
637
+ "rewards/margins": 1.281368613243103,
638
+ "rewards/rejected": -1.602821707725525,
639
  "step": 440
640
  },
641
  {
642
  "epoch": 0.93,
643
+ "learning_rate": 1.5355233002291826e-05,
644
+ "logits/chosen": -2.8464276790618896,
645
+ "logits/rejected": -2.7966537475585938,
646
+ "logps/chosen": -296.7311706542969,
647
+ "logps/rejected": -254.079833984375,
648
+ "loss": 0.4991,
649
+ "rewards/accuracies": 0.793749988079071,
650
+ "rewards/chosen": -0.35059598088264465,
651
+ "rewards/margins": 1.165145993232727,
652
+ "rewards/rejected": -1.5157420635223389,
653
  "step": 450
654
  },
655
  {
656
  "epoch": 0.95,
657
+ "learning_rate": 1.5202444614209322e-05,
658
+ "logits/chosen": -2.8699872493743896,
659
+ "logits/rejected": -2.8504984378814697,
660
+ "logps/chosen": -262.0877990722656,
661
+ "logps/rejected": -247.0086212158203,
662
+ "loss": 0.4908,
663
+ "rewards/accuracies": 0.7562500238418579,
664
+ "rewards/chosen": -0.2521810531616211,
665
+ "rewards/margins": 1.0113656520843506,
666
+ "rewards/rejected": -1.2635467052459717,
667
  "step": 460
668
  },
669
  {
670
  "epoch": 0.97,
671
+ "learning_rate": 1.5049656226126816e-05,
672
+ "logits/chosen": -2.846945285797119,
673
+ "logits/rejected": -2.849860191345215,
674
+ "logps/chosen": -292.128173828125,
675
+ "logps/rejected": -255.2592315673828,
676
+ "loss": 0.48,
677
+ "rewards/accuracies": 0.8062499761581421,
678
+ "rewards/chosen": -0.06887436658143997,
679
+ "rewards/margins": 1.2138417959213257,
680
+ "rewards/rejected": -1.282716155052185,
681
  "step": 470
682
  },
683
  {
684
  "epoch": 0.99,
685
+ "learning_rate": 1.489686783804431e-05,
686
+ "logits/chosen": -2.872967481613159,
687
+ "logits/rejected": -2.8441576957702637,
688
+ "logps/chosen": -263.8342590332031,
689
+ "logps/rejected": -239.5003204345703,
690
+ "loss": 0.4771,
691
+ "rewards/accuracies": 0.7749999761581421,
692
+ "rewards/chosen": -0.3044741153717041,
693
+ "rewards/margins": 0.980197548866272,
694
+ "rewards/rejected": -1.2846715450286865,
695
  "step": 480
696
  },
697
  {
698
  "epoch": 1.0,
699
+ "eval_logits/chosen": -2.8796305656433105,
700
+ "eval_logits/rejected": -2.8501133918762207,
701
+ "eval_logps/chosen": -274.4910583496094,
702
+ "eval_logps/rejected": -238.3082275390625,
703
+ "eval_loss": 0.4616946280002594,
704
+ "eval_rewards/accuracies": 0.7890625,
705
+ "eval_rewards/chosen": -0.0842861607670784,
706
+ "eval_rewards/margins": 1.2677102088928223,
707
+ "eval_rewards/rejected": -1.3519961833953857,
708
+ "eval_runtime": 253.7733,
709
+ "eval_samples_per_second": 7.881,
710
+ "eval_steps_per_second": 0.063,
711
  "step": 485
712
  },
713
  {
714
+ "epoch": 1.01,
715
+ "learning_rate": 1.4744079449961804e-05,
716
+ "logits/chosen": -2.8916447162628174,
717
+ "logits/rejected": -2.836836576461792,
718
+ "logps/chosen": -317.30328369140625,
719
+ "logps/rejected": -261.12030029296875,
720
+ "loss": 0.4271,
721
+ "rewards/accuracies": 0.8125,
722
+ "rewards/chosen": -0.1437714546918869,
723
+ "rewards/margins": 1.377268671989441,
724
+ "rewards/rejected": -1.5210400819778442,
725
+ "step": 490
726
+ },
727
+ {
728
+ "epoch": 1.03,
729
+ "learning_rate": 1.4591291061879298e-05,
730
+ "logits/chosen": -2.8427810668945312,
731
+ "logits/rejected": -2.8195979595184326,
732
+ "logps/chosen": -262.42047119140625,
733
+ "logps/rejected": -256.4164733886719,
734
+ "loss": 0.4627,
735
+ "rewards/accuracies": 0.768750011920929,
736
+ "rewards/chosen": -0.3208313584327698,
737
+ "rewards/margins": 1.1283810138702393,
738
+ "rewards/rejected": -1.4492123126983643,
739
+ "step": 500
740
+ },
741
+ {
742
+ "epoch": 1.05,
743
+ "learning_rate": 1.4438502673796793e-05,
744
+ "logits/chosen": -2.8156418800354004,
745
+ "logits/rejected": -2.843306064605713,
746
+ "logps/chosen": -267.56536865234375,
747
+ "logps/rejected": -224.3167266845703,
748
+ "loss": 0.4201,
749
+ "rewards/accuracies": 0.800000011920929,
750
+ "rewards/chosen": -0.24222290515899658,
751
+ "rewards/margins": 1.2655701637268066,
752
+ "rewards/rejected": -1.5077931880950928,
753
+ "step": 510
754
+ },
755
+ {
756
+ "epoch": 1.07,
757
+ "learning_rate": 1.4285714285714287e-05,
758
+ "logits/chosen": -2.7805769443511963,
759
+ "logits/rejected": -2.8112196922302246,
760
+ "logps/chosen": -287.56451416015625,
761
+ "logps/rejected": -262.3890075683594,
762
+ "loss": 0.4203,
763
+ "rewards/accuracies": 0.737500011920929,
764
+ "rewards/chosen": -0.2784760892391205,
765
+ "rewards/margins": 1.08200204372406,
766
+ "rewards/rejected": -1.360478162765503,
767
+ "step": 520
768
+ },
769
+ {
770
+ "epoch": 1.09,
771
+ "learning_rate": 1.4132925897631782e-05,
772
+ "logits/chosen": -2.815563678741455,
773
+ "logits/rejected": -2.839988946914673,
774
+ "logps/chosen": -295.6175842285156,
775
+ "logps/rejected": -235.90164184570312,
776
+ "loss": 0.4009,
777
+ "rewards/accuracies": 0.8062499761581421,
778
+ "rewards/chosen": -0.23429016768932343,
779
+ "rewards/margins": 1.343583106994629,
780
+ "rewards/rejected": -1.5778734683990479,
781
+ "step": 530
782
+ },
783
+ {
784
+ "epoch": 1.11,
785
+ "learning_rate": 1.3980137509549276e-05,
786
+ "logits/chosen": -2.8498940467834473,
787
+ "logits/rejected": -2.830718517303467,
788
+ "logps/chosen": -318.00909423828125,
789
+ "logps/rejected": -257.63348388671875,
790
+ "loss": 0.4186,
791
+ "rewards/accuracies": 0.793749988079071,
792
+ "rewards/chosen": -0.3423912525177002,
793
+ "rewards/margins": 1.4794024229049683,
794
+ "rewards/rejected": -1.821793794631958,
795
+ "step": 540
796
+ },
797
+ {
798
+ "epoch": 1.13,
799
+ "learning_rate": 1.3827349121466769e-05,
800
+ "logits/chosen": -2.7992899417877197,
801
+ "logits/rejected": -2.8059680461883545,
802
+ "logps/chosen": -258.46514892578125,
803
+ "logps/rejected": -210.663818359375,
804
+ "loss": 0.426,
805
+ "rewards/accuracies": 0.7562500238418579,
806
+ "rewards/chosen": -0.40961843729019165,
807
+ "rewards/margins": 1.0720117092132568,
808
+ "rewards/rejected": -1.4816303253173828,
809
+ "step": 550
810
+ },
811
+ {
812
+ "epoch": 1.15,
813
+ "learning_rate": 1.3674560733384263e-05,
814
+ "logits/chosen": -2.862682580947876,
815
+ "logits/rejected": -2.865583658218384,
816
+ "logps/chosen": -281.8017883300781,
817
+ "logps/rejected": -257.26690673828125,
818
+ "loss": 0.4007,
819
+ "rewards/accuracies": 0.8687499761581421,
820
+ "rewards/chosen": -0.2565487325191498,
821
+ "rewards/margins": 1.5748827457427979,
822
+ "rewards/rejected": -1.8314317464828491,
823
+ "step": 560
824
+ },
825
+ {
826
+ "epoch": 1.18,
827
+ "learning_rate": 1.3521772345301758e-05,
828
+ "logits/chosen": -2.8742198944091797,
829
+ "logits/rejected": -2.849208354949951,
830
+ "logps/chosen": -240.13259887695312,
831
+ "logps/rejected": -242.2512664794922,
832
+ "loss": 0.3911,
833
+ "rewards/accuracies": 0.800000011920929,
834
+ "rewards/chosen": -0.2923469543457031,
835
+ "rewards/margins": 1.5814087390899658,
836
+ "rewards/rejected": -1.873755693435669,
837
+ "step": 570
838
+ },
839
+ {
840
+ "epoch": 1.2,
841
+ "learning_rate": 1.3368983957219252e-05,
842
+ "logits/chosen": -2.8210272789001465,
843
+ "logits/rejected": -2.8307366371154785,
844
+ "logps/chosen": -277.41375732421875,
845
+ "logps/rejected": -270.2271728515625,
846
+ "loss": 0.4217,
847
+ "rewards/accuracies": 0.8187500238418579,
848
+ "rewards/chosen": -0.5435327887535095,
849
+ "rewards/margins": 1.276086688041687,
850
+ "rewards/rejected": -1.8196194171905518,
851
+ "step": 580
852
+ },
853
+ {
854
+ "epoch": 1.22,
855
+ "learning_rate": 1.3216195569136747e-05,
856
+ "logits/chosen": -2.8047261238098145,
857
+ "logits/rejected": -2.8207616806030273,
858
+ "logps/chosen": -268.05999755859375,
859
+ "logps/rejected": -250.09188842773438,
860
+ "loss": 0.4192,
861
+ "rewards/accuracies": 0.887499988079071,
862
+ "rewards/chosen": -0.2236345261335373,
863
+ "rewards/margins": 1.6592124700546265,
864
+ "rewards/rejected": -1.8828470706939697,
865
+ "step": 590
866
+ },
867
+ {
868
+ "epoch": 1.24,
869
+ "learning_rate": 1.3063407181054241e-05,
870
+ "logits/chosen": -2.8321800231933594,
871
+ "logits/rejected": -2.813894748687744,
872
+ "logps/chosen": -247.3390655517578,
873
+ "logps/rejected": -245.7059326171875,
874
+ "loss": 0.4081,
875
+ "rewards/accuracies": 0.793749988079071,
876
+ "rewards/chosen": -0.6002678871154785,
877
+ "rewards/margins": 1.3836045265197754,
878
+ "rewards/rejected": -1.983872652053833,
879
+ "step": 600
880
+ },
881
+ {
882
+ "epoch": 1.26,
883
+ "learning_rate": 1.2910618792971734e-05,
884
+ "logits/chosen": -2.8052244186401367,
885
+ "logits/rejected": -2.7987864017486572,
886
+ "logps/chosen": -252.0263214111328,
887
+ "logps/rejected": -265.32666015625,
888
+ "loss": 0.4056,
889
+ "rewards/accuracies": 0.8062499761581421,
890
+ "rewards/chosen": -0.32010939717292786,
891
+ "rewards/margins": 1.5006351470947266,
892
+ "rewards/rejected": -1.820744514465332,
893
+ "step": 610
894
+ },
895
+ {
896
+ "epoch": 1.28,
897
+ "learning_rate": 1.2757830404889229e-05,
898
+ "logits/chosen": -2.852949619293213,
899
+ "logits/rejected": -2.810314655303955,
900
+ "logps/chosen": -305.781982421875,
901
+ "logps/rejected": -255.346435546875,
902
+ "loss": 0.44,
903
+ "rewards/accuracies": 0.7875000238418579,
904
+ "rewards/chosen": -0.2824147343635559,
905
+ "rewards/margins": 1.5166089534759521,
906
+ "rewards/rejected": -1.7990238666534424,
907
+ "step": 620
908
+ },
909
+ {
910
+ "epoch": 1.3,
911
+ "learning_rate": 1.2605042016806723e-05,
912
+ "logits/chosen": -2.8488757610321045,
913
+ "logits/rejected": -2.8255667686462402,
914
+ "logps/chosen": -272.29388427734375,
915
+ "logps/rejected": -245.6341552734375,
916
+ "loss": 0.42,
917
+ "rewards/accuracies": 0.800000011920929,
918
+ "rewards/chosen": -0.3796336054801941,
919
+ "rewards/margins": 1.4470628499984741,
920
+ "rewards/rejected": -1.8266966342926025,
921
+ "step": 630
922
+ },
923
+ {
924
+ "epoch": 1.32,
925
+ "learning_rate": 1.2452253628724218e-05,
926
+ "logits/chosen": -2.8429629802703857,
927
+ "logits/rejected": -2.837463140487671,
928
+ "logps/chosen": -262.43292236328125,
929
+ "logps/rejected": -261.91644287109375,
930
+ "loss": 0.4354,
931
+ "rewards/accuracies": 0.831250011920929,
932
+ "rewards/chosen": -0.4664459228515625,
933
+ "rewards/margins": 1.388948917388916,
934
+ "rewards/rejected": -1.855394721031189,
935
+ "step": 640
936
+ },
937
+ {
938
+ "epoch": 1.34,
939
+ "learning_rate": 1.2299465240641712e-05,
940
+ "logits/chosen": -2.829761028289795,
941
+ "logits/rejected": -2.799344062805176,
942
+ "logps/chosen": -266.4898986816406,
943
+ "logps/rejected": -228.8533935546875,
944
+ "loss": 0.4393,
945
+ "rewards/accuracies": 0.762499988079071,
946
+ "rewards/chosen": -0.6215249300003052,
947
+ "rewards/margins": 1.1069129705429077,
948
+ "rewards/rejected": -1.7284377813339233,
949
+ "step": 650
950
+ },
951
+ {
952
+ "epoch": 1.36,
953
+ "learning_rate": 1.2146676852559206e-05,
954
+ "logits/chosen": -2.8082187175750732,
955
+ "logits/rejected": -2.785370111465454,
956
+ "logps/chosen": -272.84552001953125,
957
+ "logps/rejected": -257.3983154296875,
958
+ "loss": 0.417,
959
+ "rewards/accuracies": 0.762499988079071,
960
+ "rewards/chosen": -0.5420676469802856,
961
+ "rewards/margins": 1.0937607288360596,
962
+ "rewards/rejected": -1.6358283758163452,
963
+ "step": 660
964
+ },
965
+ {
966
+ "epoch": 1.38,
967
+ "learning_rate": 1.1993888464476701e-05,
968
+ "logits/chosen": -2.796501636505127,
969
+ "logits/rejected": -2.753633975982666,
970
+ "logps/chosen": -275.73297119140625,
971
+ "logps/rejected": -261.51495361328125,
972
+ "loss": 0.4048,
973
+ "rewards/accuracies": 0.831250011920929,
974
+ "rewards/chosen": -0.6535481214523315,
975
+ "rewards/margins": 1.5823442935943604,
976
+ "rewards/rejected": -2.2358925342559814,
977
+ "step": 670
978
+ },
979
+ {
980
+ "epoch": 1.4,
981
+ "learning_rate": 1.1841100076394194e-05,
982
+ "logits/chosen": -2.8197669982910156,
983
+ "logits/rejected": -2.8018298149108887,
984
+ "logps/chosen": -279.07794189453125,
985
+ "logps/rejected": -264.37908935546875,
986
+ "loss": 0.4476,
987
+ "rewards/accuracies": 0.78125,
988
+ "rewards/chosen": -0.5116819143295288,
989
+ "rewards/margins": 1.3447668552398682,
990
+ "rewards/rejected": -1.856448769569397,
991
+ "step": 680
992
+ },
993
+ {
994
+ "epoch": 1.42,
995
+ "learning_rate": 1.1688311688311688e-05,
996
+ "logits/chosen": -2.844113826751709,
997
+ "logits/rejected": -2.844442844390869,
998
+ "logps/chosen": -252.5892791748047,
999
+ "logps/rejected": -259.8851623535156,
1000
+ "loss": 0.4359,
1001
+ "rewards/accuracies": 0.762499988079071,
1002
+ "rewards/chosen": -0.6136052012443542,
1003
+ "rewards/margins": 1.2718920707702637,
1004
+ "rewards/rejected": -1.8854974508285522,
1005
+ "step": 690
1006
+ },
1007
+ {
1008
+ "epoch": 1.44,
1009
+ "learning_rate": 1.1535523300229183e-05,
1010
+ "logits/chosen": -2.8703908920288086,
1011
+ "logits/rejected": -2.8482773303985596,
1012
+ "logps/chosen": -300.6127624511719,
1013
+ "logps/rejected": -267.99188232421875,
1014
+ "loss": 0.4299,
1015
+ "rewards/accuracies": 0.800000011920929,
1016
+ "rewards/chosen": -0.30981573462486267,
1017
+ "rewards/margins": 1.310162901878357,
1018
+ "rewards/rejected": -1.619978666305542,
1019
+ "step": 700
1020
+ },
1021
+ {
1022
+ "epoch": 1.46,
1023
+ "learning_rate": 1.1382734912146677e-05,
1024
+ "logits/chosen": -2.807915449142456,
1025
+ "logits/rejected": -2.7632646560668945,
1026
+ "logps/chosen": -244.17822265625,
1027
+ "logps/rejected": -210.9264678955078,
1028
+ "loss": 0.4146,
1029
+ "rewards/accuracies": 0.793749988079071,
1030
+ "rewards/chosen": -0.32631585001945496,
1031
+ "rewards/margins": 1.2793928384780884,
1032
+ "rewards/rejected": -1.6057088375091553,
1033
+ "step": 710
1034
+ },
1035
+ {
1036
+ "epoch": 1.48,
1037
+ "learning_rate": 1.1229946524064172e-05,
1038
+ "logits/chosen": -2.7481131553649902,
1039
+ "logits/rejected": -2.7189228534698486,
1040
+ "logps/chosen": -271.4867858886719,
1041
+ "logps/rejected": -256.009521484375,
1042
+ "loss": 0.4123,
1043
+ "rewards/accuracies": 0.8125,
1044
+ "rewards/chosen": -0.31545475125312805,
1045
+ "rewards/margins": 1.4961215257644653,
1046
+ "rewards/rejected": -1.811576247215271,
1047
+ "step": 720
1048
+ },
1049
+ {
1050
+ "epoch": 1.51,
1051
+ "learning_rate": 1.1077158135981668e-05,
1052
+ "logits/chosen": -2.80031681060791,
1053
+ "logits/rejected": -2.7459282875061035,
1054
+ "logps/chosen": -268.36669921875,
1055
+ "logps/rejected": -244.3325958251953,
1056
+ "loss": 0.4444,
1057
+ "rewards/accuracies": 0.768750011920929,
1058
+ "rewards/chosen": -0.3755728006362915,
1059
+ "rewards/margins": 1.379817008972168,
1060
+ "rewards/rejected": -1.755389928817749,
1061
+ "step": 730
1062
+ },
1063
+ {
1064
+ "epoch": 1.53,
1065
+ "learning_rate": 1.0924369747899159e-05,
1066
+ "logits/chosen": -2.809633255004883,
1067
+ "logits/rejected": -2.85695219039917,
1068
+ "logps/chosen": -293.177734375,
1069
+ "logps/rejected": -231.4840087890625,
1070
+ "loss": 0.4198,
1071
+ "rewards/accuracies": 0.800000011920929,
1072
+ "rewards/chosen": -0.2760697901248932,
1073
+ "rewards/margins": 1.443703293800354,
1074
+ "rewards/rejected": -1.7197730541229248,
1075
+ "step": 740
1076
+ },
1077
+ {
1078
+ "epoch": 1.55,
1079
+ "learning_rate": 1.0771581359816653e-05,
1080
+ "logits/chosen": -2.7558960914611816,
1081
+ "logits/rejected": -2.803246021270752,
1082
+ "logps/chosen": -261.3629455566406,
1083
+ "logps/rejected": -246.388671875,
1084
+ "loss": 0.39,
1085
+ "rewards/accuracies": 0.84375,
1086
+ "rewards/chosen": -0.4588204026222229,
1087
+ "rewards/margins": 1.728859305381775,
1088
+ "rewards/rejected": -2.1876797676086426,
1089
+ "step": 750
1090
+ },
1091
+ {
1092
+ "epoch": 1.57,
1093
+ "learning_rate": 1.0618792971734148e-05,
1094
+ "logits/chosen": -2.78913950920105,
1095
+ "logits/rejected": -2.7678422927856445,
1096
+ "logps/chosen": -256.0678405761719,
1097
+ "logps/rejected": -234.4947509765625,
1098
+ "loss": 0.4066,
1099
+ "rewards/accuracies": 0.8125,
1100
+ "rewards/chosen": -0.471910297870636,
1101
+ "rewards/margins": 1.3505375385284424,
1102
+ "rewards/rejected": -1.8224480152130127,
1103
+ "step": 760
1104
+ },
1105
+ {
1106
+ "epoch": 1.59,
1107
+ "learning_rate": 1.0466004583651644e-05,
1108
+ "logits/chosen": -2.808622121810913,
1109
+ "logits/rejected": -2.8128342628479004,
1110
+ "logps/chosen": -263.07305908203125,
1111
+ "logps/rejected": -242.57357788085938,
1112
+ "loss": 0.4032,
1113
+ "rewards/accuracies": 0.824999988079071,
1114
+ "rewards/chosen": -0.24994757771492004,
1115
+ "rewards/margins": 1.3186602592468262,
1116
+ "rewards/rejected": -1.5686078071594238,
1117
+ "step": 770
1118
+ },
1119
+ {
1120
+ "epoch": 1.61,
1121
+ "learning_rate": 1.0313216195569139e-05,
1122
+ "logits/chosen": -2.8431007862091064,
1123
+ "logits/rejected": -2.8500876426696777,
1124
+ "logps/chosen": -289.2240295410156,
1125
+ "logps/rejected": -240.93447875976562,
1126
+ "loss": 0.3989,
1127
+ "rewards/accuracies": 0.887499988079071,
1128
+ "rewards/chosen": -0.1801755428314209,
1129
+ "rewards/margins": 1.5674879550933838,
1130
+ "rewards/rejected": -1.7476632595062256,
1131
+ "step": 780
1132
+ },
1133
+ {
1134
+ "epoch": 1.63,
1135
+ "learning_rate": 1.0160427807486633e-05,
1136
+ "logits/chosen": -2.896554470062256,
1137
+ "logits/rejected": -2.8891143798828125,
1138
+ "logps/chosen": -304.2243957519531,
1139
+ "logps/rejected": -279.3757019042969,
1140
+ "loss": 0.3938,
1141
+ "rewards/accuracies": 0.78125,
1142
+ "rewards/chosen": -0.3372874855995178,
1143
+ "rewards/margins": 1.5321928262710571,
1144
+ "rewards/rejected": -1.8694803714752197,
1145
+ "step": 790
1146
+ },
1147
+ {
1148
+ "epoch": 1.65,
1149
+ "learning_rate": 1.0007639419404128e-05,
1150
+ "logits/chosen": -2.8317697048187256,
1151
+ "logits/rejected": -2.7945523262023926,
1152
+ "logps/chosen": -292.1405029296875,
1153
+ "logps/rejected": -234.8839569091797,
1154
+ "loss": 0.4294,
1155
+ "rewards/accuracies": 0.8125,
1156
+ "rewards/chosen": -0.5130658149719238,
1157
+ "rewards/margins": 1.4586106538772583,
1158
+ "rewards/rejected": -1.9716764688491821,
1159
+ "step": 800
1160
+ },
1161
+ {
1162
+ "epoch": 1.67,
1163
+ "learning_rate": 9.85485103132162e-06,
1164
+ "logits/chosen": -2.875779151916504,
1165
+ "logits/rejected": -2.8391237258911133,
1166
+ "logps/chosen": -298.0716857910156,
1167
+ "logps/rejected": -242.75662231445312,
1168
+ "loss": 0.4371,
1169
+ "rewards/accuracies": 0.831250011920929,
1170
+ "rewards/chosen": -0.41954541206359863,
1171
+ "rewards/margins": 1.413554072380066,
1172
+ "rewards/rejected": -1.833099603652954,
1173
+ "step": 810
1174
+ },
1175
+ {
1176
+ "epoch": 1.69,
1177
+ "learning_rate": 9.702062643239115e-06,
1178
+ "logits/chosen": -2.842196464538574,
1179
+ "logits/rejected": -2.824432849884033,
1180
+ "logps/chosen": -255.82620239257812,
1181
+ "logps/rejected": -256.314453125,
1182
+ "loss": 0.4196,
1183
+ "rewards/accuracies": 0.8125,
1184
+ "rewards/chosen": -0.4787816107273102,
1185
+ "rewards/margins": 1.262540578842163,
1186
+ "rewards/rejected": -1.7413221597671509,
1187
+ "step": 820
1188
+ },
1189
+ {
1190
+ "epoch": 1.71,
1191
+ "learning_rate": 9.54927425515661e-06,
1192
+ "logits/chosen": -2.8649404048919678,
1193
+ "logits/rejected": -2.807797908782959,
1194
+ "logps/chosen": -296.7332458496094,
1195
+ "logps/rejected": -261.4539489746094,
1196
+ "loss": 0.4274,
1197
+ "rewards/accuracies": 0.768750011920929,
1198
+ "rewards/chosen": -0.34954220056533813,
1199
+ "rewards/margins": 1.1616686582565308,
1200
+ "rewards/rejected": -1.5112109184265137,
1201
+ "step": 830
1202
+ },
1203
+ {
1204
+ "epoch": 1.73,
1205
+ "learning_rate": 9.396485867074104e-06,
1206
+ "logits/chosen": -2.8419387340545654,
1207
+ "logits/rejected": -2.7805838584899902,
1208
+ "logps/chosen": -271.58807373046875,
1209
+ "logps/rejected": -239.6482391357422,
1210
+ "loss": 0.4106,
1211
+ "rewards/accuracies": 0.793749988079071,
1212
+ "rewards/chosen": -0.35656026005744934,
1213
+ "rewards/margins": 1.2432340383529663,
1214
+ "rewards/rejected": -1.5997945070266724,
1215
+ "step": 840
1216
+ },
1217
+ {
1218
+ "epoch": 1.75,
1219
+ "learning_rate": 9.243697478991598e-06,
1220
+ "logits/chosen": -2.851105213165283,
1221
+ "logits/rejected": -2.8537163734436035,
1222
+ "logps/chosen": -285.19293212890625,
1223
+ "logps/rejected": -249.2686004638672,
1224
+ "loss": 0.4067,
1225
+ "rewards/accuracies": 0.7875000238418579,
1226
+ "rewards/chosen": -0.4058365225791931,
1227
+ "rewards/margins": 1.1938354969024658,
1228
+ "rewards/rejected": -1.5996720790863037,
1229
+ "step": 850
1230
+ },
1231
+ {
1232
+ "epoch": 1.77,
1233
+ "learning_rate": 9.090909090909091e-06,
1234
+ "logits/chosen": -2.8415451049804688,
1235
+ "logits/rejected": -2.816315174102783,
1236
+ "logps/chosen": -246.9827117919922,
1237
+ "logps/rejected": -244.11172485351562,
1238
+ "loss": 0.4166,
1239
+ "rewards/accuracies": 0.7437499761581421,
1240
+ "rewards/chosen": -0.5251684784889221,
1241
+ "rewards/margins": 1.321463942527771,
1242
+ "rewards/rejected": -1.8466323614120483,
1243
+ "step": 860
1244
+ },
1245
+ {
1246
+ "epoch": 1.79,
1247
+ "learning_rate": 8.938120702826586e-06,
1248
+ "logits/chosen": -2.8182404041290283,
1249
+ "logits/rejected": -2.7850711345672607,
1250
+ "logps/chosen": -320.24407958984375,
1251
+ "logps/rejected": -251.82949829101562,
1252
+ "loss": 0.409,
1253
+ "rewards/accuracies": 0.78125,
1254
+ "rewards/chosen": -0.2882133424282074,
1255
+ "rewards/margins": 1.439748764038086,
1256
+ "rewards/rejected": -1.7279622554779053,
1257
+ "step": 870
1258
+ },
1259
+ {
1260
+ "epoch": 1.81,
1261
+ "learning_rate": 8.78533231474408e-06,
1262
+ "logits/chosen": -2.859550952911377,
1263
+ "logits/rejected": -2.8387062549591064,
1264
+ "logps/chosen": -251.30764770507812,
1265
+ "logps/rejected": -253.18179321289062,
1266
+ "loss": 0.4071,
1267
+ "rewards/accuracies": 0.793749988079071,
1268
+ "rewards/chosen": -0.4950861930847168,
1269
+ "rewards/margins": 1.2707871198654175,
1270
+ "rewards/rejected": -1.7658733129501343,
1271
+ "step": 880
1272
+ },
1273
+ {
1274
+ "epoch": 1.84,
1275
+ "learning_rate": 8.632543926661574e-06,
1276
+ "logits/chosen": -2.8016982078552246,
1277
+ "logits/rejected": -2.7528939247131348,
1278
+ "logps/chosen": -277.7475891113281,
1279
+ "logps/rejected": -262.52215576171875,
1280
+ "loss": 0.444,
1281
+ "rewards/accuracies": 0.8062499761581421,
1282
+ "rewards/chosen": -0.487938791513443,
1283
+ "rewards/margins": 1.4403693675994873,
1284
+ "rewards/rejected": -1.928308129310608,
1285
+ "step": 890
1286
+ },
1287
+ {
1288
+ "epoch": 1.86,
1289
+ "learning_rate": 8.479755538579069e-06,
1290
+ "logits/chosen": -2.832099199295044,
1291
+ "logits/rejected": -2.786726951599121,
1292
+ "logps/chosen": -232.41787719726562,
1293
+ "logps/rejected": -241.41775512695312,
1294
+ "loss": 0.4149,
1295
+ "rewards/accuracies": 0.793749988079071,
1296
+ "rewards/chosen": -0.5055242776870728,
1297
+ "rewards/margins": 1.4381215572357178,
1298
+ "rewards/rejected": -1.9436458349227905,
1299
+ "step": 900
1300
+ },
1301
+ {
1302
+ "epoch": 1.88,
1303
+ "learning_rate": 8.326967150496563e-06,
1304
+ "logits/chosen": -2.793572425842285,
1305
+ "logits/rejected": -2.8044943809509277,
1306
+ "logps/chosen": -293.1367492675781,
1307
+ "logps/rejected": -241.0701904296875,
1308
+ "loss": 0.4144,
1309
+ "rewards/accuracies": 0.793749988079071,
1310
+ "rewards/chosen": -0.40146392583847046,
1311
+ "rewards/margins": 1.4329584836959839,
1312
+ "rewards/rejected": -1.8344223499298096,
1313
+ "step": 910
1314
+ },
1315
+ {
1316
+ "epoch": 1.9,
1317
+ "learning_rate": 8.174178762414056e-06,
1318
+ "logits/chosen": -2.861027240753174,
1319
+ "logits/rejected": -2.802192449569702,
1320
+ "logps/chosen": -302.29095458984375,
1321
+ "logps/rejected": -264.388916015625,
1322
+ "loss": 0.3921,
1323
+ "rewards/accuracies": 0.7875000238418579,
1324
+ "rewards/chosen": -0.377704381942749,
1325
+ "rewards/margins": 1.487374186515808,
1326
+ "rewards/rejected": -1.8650786876678467,
1327
+ "step": 920
1328
+ },
1329
+ {
1330
+ "epoch": 1.92,
1331
+ "learning_rate": 8.02139037433155e-06,
1332
+ "logits/chosen": -2.856374502182007,
1333
+ "logits/rejected": -2.875044584274292,
1334
+ "logps/chosen": -282.31585693359375,
1335
+ "logps/rejected": -235.76748657226562,
1336
+ "loss": 0.4205,
1337
+ "rewards/accuracies": 0.7875000238418579,
1338
+ "rewards/chosen": -0.39734259247779846,
1339
+ "rewards/margins": 1.3766673803329468,
1340
+ "rewards/rejected": -1.7740100622177124,
1341
+ "step": 930
1342
+ },
1343
+ {
1344
+ "epoch": 1.94,
1345
+ "learning_rate": 7.868601986249045e-06,
1346
+ "logits/chosen": -2.8075997829437256,
1347
+ "logits/rejected": -2.77669620513916,
1348
+ "logps/chosen": -256.1700134277344,
1349
+ "logps/rejected": -247.21896362304688,
1350
+ "loss": 0.4157,
1351
+ "rewards/accuracies": 0.7875000238418579,
1352
+ "rewards/chosen": -0.3420465886592865,
1353
+ "rewards/margins": 1.5288991928100586,
1354
+ "rewards/rejected": -1.8709455728530884,
1355
+ "step": 940
1356
+ },
1357
+ {
1358
+ "epoch": 1.96,
1359
+ "learning_rate": 7.71581359816654e-06,
1360
+ "logits/chosen": -2.8214058876037598,
1361
+ "logits/rejected": -2.812399387359619,
1362
+ "logps/chosen": -271.9144592285156,
1363
+ "logps/rejected": -248.6433563232422,
1364
+ "loss": 0.4086,
1365
+ "rewards/accuracies": 0.8187500238418579,
1366
+ "rewards/chosen": -0.5668919682502747,
1367
+ "rewards/margins": 1.462934970855713,
1368
+ "rewards/rejected": -2.0298266410827637,
1369
+ "step": 950
1370
+ },
1371
+ {
1372
+ "epoch": 1.98,
1373
+ "learning_rate": 7.563025210084034e-06,
1374
+ "logits/chosen": -2.855318069458008,
1375
+ "logits/rejected": -2.8766427040100098,
1376
+ "logps/chosen": -305.3939514160156,
1377
+ "logps/rejected": -259.01739501953125,
1378
+ "loss": 0.4274,
1379
+ "rewards/accuracies": 0.8125,
1380
+ "rewards/chosen": -0.4556516706943512,
1381
+ "rewards/margins": 1.3332924842834473,
1382
+ "rewards/rejected": -1.7889440059661865,
1383
+ "step": 960
1384
+ },
1385
+ {
1386
+ "epoch": 2.0,
1387
+ "learning_rate": 7.410236822001529e-06,
1388
+ "logits/chosen": -2.804309368133545,
1389
+ "logits/rejected": -2.7984023094177246,
1390
+ "logps/chosen": -269.3817443847656,
1391
+ "logps/rejected": -236.9586639404297,
1392
+ "loss": 0.4124,
1393
+ "rewards/accuracies": 0.824999988079071,
1394
+ "rewards/chosen": -0.4466172754764557,
1395
+ "rewards/margins": 1.4143751859664917,
1396
+ "rewards/rejected": -1.860992431640625,
1397
+ "step": 970
1398
+ },
1399
+ {
1400
+ "epoch": 2.0,
1401
+ "eval_logits/chosen": -2.865886688232422,
1402
+ "eval_logits/rejected": -2.8562960624694824,
1403
+ "eval_logps/chosen": -277.1995849609375,
1404
+ "eval_logps/rejected": -242.37806701660156,
1405
+ "eval_loss": 0.4544542133808136,
1406
+ "eval_rewards/accuracies": 0.81640625,
1407
+ "eval_rewards/chosen": -0.3551396429538727,
1408
+ "eval_rewards/margins": 1.4038398265838623,
1409
+ "eval_rewards/rejected": -1.7589795589447021,
1410
+ "eval_runtime": 253.9486,
1411
+ "eval_samples_per_second": 7.876,
1412
+ "eval_steps_per_second": 0.063,
1413
+ "step": 970
1414
+ },
1415
+ {
1416
+ "epoch": 2.02,
1417
+ "learning_rate": 7.257448433919023e-06,
1418
+ "logits/chosen": -2.8543519973754883,
1419
+ "logits/rejected": -2.8081583976745605,
1420
+ "logps/chosen": -279.414306640625,
1421
+ "logps/rejected": -276.77752685546875,
1422
+ "loss": 0.3522,
1423
+ "rewards/accuracies": 0.8187500238418579,
1424
+ "rewards/chosen": -0.36832618713378906,
1425
+ "rewards/margins": 1.5619885921478271,
1426
+ "rewards/rejected": -1.9303147792816162,
1427
+ "step": 980
1428
+ },
1429
+ {
1430
+ "epoch": 2.04,
1431
+ "learning_rate": 7.104660045836517e-06,
1432
+ "logits/chosen": -2.8730249404907227,
1433
+ "logits/rejected": -2.8948845863342285,
1434
+ "logps/chosen": -318.85736083984375,
1435
+ "logps/rejected": -257.4644470214844,
1436
+ "loss": 0.3859,
1437
+ "rewards/accuracies": 0.8500000238418579,
1438
+ "rewards/chosen": -0.29034391045570374,
1439
+ "rewards/margins": 1.8795220851898193,
1440
+ "rewards/rejected": -2.1698660850524902,
1441
+ "step": 990
1442
+ },
1443
+ {
1444
+ "epoch": 2.06,
1445
+ "learning_rate": 6.951871657754011e-06,
1446
+ "logits/chosen": -2.8562302589416504,
1447
+ "logits/rejected": -2.834265947341919,
1448
+ "logps/chosen": -259.0556640625,
1449
+ "logps/rejected": -247.9607696533203,
1450
+ "loss": 0.3666,
1451
+ "rewards/accuracies": 0.831250011920929,
1452
+ "rewards/chosen": -0.5741233229637146,
1453
+ "rewards/margins": 1.5469454526901245,
1454
+ "rewards/rejected": -2.1210689544677734,
1455
+ "step": 1000
1456
+ },
1457
+ {
1458
+ "epoch": 2.08,
1459
+ "learning_rate": 6.799083269671506e-06,
1460
+ "logits/chosen": -2.905282974243164,
1461
+ "logits/rejected": -2.836683988571167,
1462
+ "logps/chosen": -293.37139892578125,
1463
+ "logps/rejected": -253.64810180664062,
1464
+ "loss": 0.3664,
1465
+ "rewards/accuracies": 0.8374999761581421,
1466
+ "rewards/chosen": -0.5017197728157043,
1467
+ "rewards/margins": 1.8150691986083984,
1468
+ "rewards/rejected": -2.316789150238037,
1469
+ "step": 1010
1470
+ },
1471
+ {
1472
+ "epoch": 2.1,
1473
+ "learning_rate": 6.646294881588999e-06,
1474
+ "logits/chosen": -2.8647093772888184,
1475
+ "logits/rejected": -2.8602373600006104,
1476
+ "logps/chosen": -253.71450805664062,
1477
+ "logps/rejected": -248.3654022216797,
1478
+ "loss": 0.369,
1479
+ "rewards/accuracies": 0.831250011920929,
1480
+ "rewards/chosen": -0.581963300704956,
1481
+ "rewards/margins": 1.589311957359314,
1482
+ "rewards/rejected": -2.1712751388549805,
1483
+ "step": 1020
1484
+ },
1485
+ {
1486
+ "epoch": 2.12,
1487
+ "learning_rate": 6.493506493506494e-06,
1488
+ "logits/chosen": -2.837463617324829,
1489
+ "logits/rejected": -2.817621946334839,
1490
+ "logps/chosen": -285.40142822265625,
1491
+ "logps/rejected": -252.75448608398438,
1492
+ "loss": 0.354,
1493
+ "rewards/accuracies": 0.84375,
1494
+ "rewards/chosen": -0.5188383460044861,
1495
+ "rewards/margins": 1.7174046039581299,
1496
+ "rewards/rejected": -2.2362427711486816,
1497
+ "step": 1030
1498
+ },
1499
+ {
1500
+ "epoch": 2.14,
1501
+ "learning_rate": 6.340718105423988e-06,
1502
+ "logits/chosen": -2.901369571685791,
1503
+ "logits/rejected": -2.8450589179992676,
1504
+ "logps/chosen": -270.55145263671875,
1505
+ "logps/rejected": -257.68060302734375,
1506
+ "loss": 0.3873,
1507
+ "rewards/accuracies": 0.8687499761581421,
1508
+ "rewards/chosen": -0.28947222232818604,
1509
+ "rewards/margins": 1.6885324716567993,
1510
+ "rewards/rejected": -1.9780044555664062,
1511
+ "step": 1040
1512
+ },
1513
+ {
1514
+ "epoch": 2.16,
1515
+ "learning_rate": 6.187929717341482e-06,
1516
+ "logits/chosen": -2.8084139823913574,
1517
+ "logits/rejected": -2.83860445022583,
1518
+ "logps/chosen": -272.42822265625,
1519
+ "logps/rejected": -243.85299682617188,
1520
+ "loss": 0.3468,
1521
+ "rewards/accuracies": 0.8125,
1522
+ "rewards/chosen": -0.4739387035369873,
1523
+ "rewards/margins": 1.55910325050354,
1524
+ "rewards/rejected": -2.0330419540405273,
1525
+ "step": 1050
1526
+ },
1527
+ {
1528
+ "epoch": 2.19,
1529
+ "learning_rate": 6.0351413292589764e-06,
1530
+ "logits/chosen": -2.8273186683654785,
1531
+ "logits/rejected": -2.8402295112609863,
1532
+ "logps/chosen": -282.76422119140625,
1533
+ "logps/rejected": -263.18548583984375,
1534
+ "loss": 0.3782,
1535
+ "rewards/accuracies": 0.875,
1536
+ "rewards/chosen": -0.4047788083553314,
1537
+ "rewards/margins": 1.8494300842285156,
1538
+ "rewards/rejected": -2.254209041595459,
1539
+ "step": 1060
1540
+ },
1541
+ {
1542
+ "epoch": 2.21,
1543
+ "learning_rate": 5.882352941176471e-06,
1544
+ "logits/chosen": -2.8585734367370605,
1545
+ "logits/rejected": -2.8870089054107666,
1546
+ "logps/chosen": -242.56851196289062,
1547
+ "logps/rejected": -221.9137725830078,
1548
+ "loss": 0.3691,
1549
+ "rewards/accuracies": 0.762499988079071,
1550
+ "rewards/chosen": -0.5514971017837524,
1551
+ "rewards/margins": 1.47260582447052,
1552
+ "rewards/rejected": -2.0241026878356934,
1553
+ "step": 1070
1554
+ },
1555
+ {
1556
+ "epoch": 2.23,
1557
+ "learning_rate": 5.729564553093966e-06,
1558
+ "logits/chosen": -2.8390653133392334,
1559
+ "logits/rejected": -2.827547788619995,
1560
+ "logps/chosen": -336.2854309082031,
1561
+ "logps/rejected": -258.2242126464844,
1562
+ "loss": 0.373,
1563
+ "rewards/accuracies": 0.831250011920929,
1564
+ "rewards/chosen": -0.47643327713012695,
1565
+ "rewards/margins": 1.622065544128418,
1566
+ "rewards/rejected": -2.098498821258545,
1567
+ "step": 1080
1568
+ },
1569
+ {
1570
+ "epoch": 2.25,
1571
+ "learning_rate": 5.576776165011459e-06,
1572
+ "logits/chosen": -2.864114284515381,
1573
+ "logits/rejected": -2.8293545246124268,
1574
+ "logps/chosen": -273.42315673828125,
1575
+ "logps/rejected": -267.7647399902344,
1576
+ "loss": 0.3652,
1577
+ "rewards/accuracies": 0.8187500238418579,
1578
+ "rewards/chosen": -0.6307514905929565,
1579
+ "rewards/margins": 1.7940418720245361,
1580
+ "rewards/rejected": -2.424793243408203,
1581
+ "step": 1090
1582
+ },
1583
+ {
1584
+ "epoch": 2.27,
1585
+ "learning_rate": 5.423987776928954e-06,
1586
+ "logits/chosen": -2.847562313079834,
1587
+ "logits/rejected": -2.8787343502044678,
1588
+ "logps/chosen": -271.5540771484375,
1589
+ "logps/rejected": -237.20877075195312,
1590
+ "loss": 0.3705,
1591
+ "rewards/accuracies": 0.8187500238418579,
1592
+ "rewards/chosen": -0.7378746271133423,
1593
+ "rewards/margins": 1.611467719078064,
1594
+ "rewards/rejected": -2.3493425846099854,
1595
+ "step": 1100
1596
+ },
1597
+ {
1598
+ "epoch": 2.29,
1599
+ "learning_rate": 5.271199388846449e-06,
1600
+ "logits/chosen": -2.844463348388672,
1601
+ "logits/rejected": -2.7723591327667236,
1602
+ "logps/chosen": -289.79632568359375,
1603
+ "logps/rejected": -266.92529296875,
1604
+ "loss": 0.3637,
1605
+ "rewards/accuracies": 0.856249988079071,
1606
+ "rewards/chosen": -0.6152908205986023,
1607
+ "rewards/margins": 1.741233229637146,
1608
+ "rewards/rejected": -2.3565244674682617,
1609
+ "step": 1110
1610
+ },
1611
+ {
1612
+ "epoch": 2.31,
1613
+ "learning_rate": 5.118411000763942e-06,
1614
+ "logits/chosen": -2.8476827144622803,
1615
+ "logits/rejected": -2.8521735668182373,
1616
+ "logps/chosen": -273.8808288574219,
1617
+ "logps/rejected": -241.986572265625,
1618
+ "loss": 0.3575,
1619
+ "rewards/accuracies": 0.831250011920929,
1620
+ "rewards/chosen": -0.7439392805099487,
1621
+ "rewards/margins": 1.6854461431503296,
1622
+ "rewards/rejected": -2.4293856620788574,
1623
+ "step": 1120
1624
+ },
1625
+ {
1626
+ "epoch": 2.33,
1627
+ "learning_rate": 4.965622612681437e-06,
1628
+ "logits/chosen": -2.8075308799743652,
1629
+ "logits/rejected": -2.7614858150482178,
1630
+ "logps/chosen": -285.82550048828125,
1631
+ "logps/rejected": -258.968994140625,
1632
+ "loss": 0.3651,
1633
+ "rewards/accuracies": 0.856249988079071,
1634
+ "rewards/chosen": -0.7141876220703125,
1635
+ "rewards/margins": 1.7358572483062744,
1636
+ "rewards/rejected": -2.450045108795166,
1637
+ "step": 1130
1638
+ },
1639
+ {
1640
+ "epoch": 2.35,
1641
+ "learning_rate": 4.812834224598931e-06,
1642
+ "logits/chosen": -2.8139889240264893,
1643
+ "logits/rejected": -2.8023717403411865,
1644
+ "logps/chosen": -304.9195251464844,
1645
+ "logps/rejected": -261.23919677734375,
1646
+ "loss": 0.3568,
1647
+ "rewards/accuracies": 0.8374999761581421,
1648
+ "rewards/chosen": -0.511367678642273,
1649
+ "rewards/margins": 1.4749786853790283,
1650
+ "rewards/rejected": -1.9863464832305908,
1651
+ "step": 1140
1652
+ },
1653
+ {
1654
+ "epoch": 2.37,
1655
+ "learning_rate": 4.660045836516425e-06,
1656
+ "logits/chosen": -2.7972934246063232,
1657
+ "logits/rejected": -2.7768425941467285,
1658
+ "logps/chosen": -277.6763916015625,
1659
+ "logps/rejected": -251.0581817626953,
1660
+ "loss": 0.3389,
1661
+ "rewards/accuracies": 0.800000011920929,
1662
+ "rewards/chosen": -0.7810333967208862,
1663
+ "rewards/margins": 1.623008370399475,
1664
+ "rewards/rejected": -2.4040417671203613,
1665
+ "step": 1150
1666
+ },
1667
+ {
1668
+ "epoch": 2.39,
1669
+ "learning_rate": 4.5072574484339196e-06,
1670
+ "logits/chosen": -2.842956781387329,
1671
+ "logits/rejected": -2.8352267742156982,
1672
+ "logps/chosen": -246.30056762695312,
1673
+ "logps/rejected": -273.0400390625,
1674
+ "loss": 0.3685,
1675
+ "rewards/accuracies": 0.800000011920929,
1676
+ "rewards/chosen": -0.6348632574081421,
1677
+ "rewards/margins": 1.7184776067733765,
1678
+ "rewards/rejected": -2.3533406257629395,
1679
+ "step": 1160
1680
+ },
1681
+ {
1682
+ "epoch": 2.41,
1683
+ "learning_rate": 4.354469060351414e-06,
1684
+ "logits/chosen": -2.801945447921753,
1685
+ "logits/rejected": -2.791245937347412,
1686
+ "logps/chosen": -279.83709716796875,
1687
+ "logps/rejected": -284.6191711425781,
1688
+ "loss": 0.3482,
1689
+ "rewards/accuracies": 0.831250011920929,
1690
+ "rewards/chosen": -0.7146093249320984,
1691
+ "rewards/margins": 1.7736566066741943,
1692
+ "rewards/rejected": -2.4882664680480957,
1693
+ "step": 1170
1694
+ },
1695
+ {
1696
+ "epoch": 2.43,
1697
+ "learning_rate": 4.201680672268908e-06,
1698
+ "logits/chosen": -2.752314329147339,
1699
+ "logits/rejected": -2.751218318939209,
1700
+ "logps/chosen": -285.19122314453125,
1701
+ "logps/rejected": -262.41162109375,
1702
+ "loss": 0.3726,
1703
+ "rewards/accuracies": 0.824999988079071,
1704
+ "rewards/chosen": -0.7081433534622192,
1705
+ "rewards/margins": 1.631823182106018,
1706
+ "rewards/rejected": -2.339966297149658,
1707
+ "step": 1180
1708
+ },
1709
+ {
1710
+ "epoch": 2.45,
1711
+ "learning_rate": 4.048892284186402e-06,
1712
+ "logits/chosen": -2.7520546913146973,
1713
+ "logits/rejected": -2.706789493560791,
1714
+ "logps/chosen": -274.19903564453125,
1715
+ "logps/rejected": -271.12530517578125,
1716
+ "loss": 0.37,
1717
+ "rewards/accuracies": 0.8374999761581421,
1718
+ "rewards/chosen": -0.5612797737121582,
1719
+ "rewards/margins": 1.6494731903076172,
1720
+ "rewards/rejected": -2.2107529640197754,
1721
+ "step": 1190
1722
+ },
1723
+ {
1724
+ "epoch": 2.47,
1725
+ "learning_rate": 3.896103896103897e-06,
1726
+ "logits/chosen": -2.8326988220214844,
1727
+ "logits/rejected": -2.7695236206054688,
1728
+ "logps/chosen": -272.9900817871094,
1729
+ "logps/rejected": -257.9608154296875,
1730
+ "loss": 0.3775,
1731
+ "rewards/accuracies": 0.8500000238418579,
1732
+ "rewards/chosen": -0.7353371381759644,
1733
+ "rewards/margins": 1.5964215993881226,
1734
+ "rewards/rejected": -2.331758737564087,
1735
+ "step": 1200
1736
+ },
1737
+ {
1738
+ "epoch": 2.49,
1739
+ "learning_rate": 3.7433155080213907e-06,
1740
+ "logits/chosen": -2.7996091842651367,
1741
+ "logits/rejected": -2.8057217597961426,
1742
+ "logps/chosen": -290.14324951171875,
1743
+ "logps/rejected": -257.6935119628906,
1744
+ "loss": 0.3824,
1745
+ "rewards/accuracies": 0.862500011920929,
1746
+ "rewards/chosen": -0.5703123807907104,
1747
+ "rewards/margins": 1.7026172876358032,
1748
+ "rewards/rejected": -2.2729296684265137,
1749
+ "step": 1210
1750
+ },
1751
+ {
1752
+ "epoch": 2.52,
1753
+ "learning_rate": 3.5905271199388848e-06,
1754
+ "logits/chosen": -2.834031343460083,
1755
+ "logits/rejected": -2.8236374855041504,
1756
+ "logps/chosen": -279.32598876953125,
1757
+ "logps/rejected": -250.78292846679688,
1758
+ "loss": 0.376,
1759
+ "rewards/accuracies": 0.824999988079071,
1760
+ "rewards/chosen": -0.5887583494186401,
1761
+ "rewards/margins": 1.6018693447113037,
1762
+ "rewards/rejected": -2.1906275749206543,
1763
+ "step": 1220
1764
+ },
1765
+ {
1766
+ "epoch": 2.54,
1767
+ "learning_rate": 3.4377387318563792e-06,
1768
+ "logits/chosen": -2.8245434761047363,
1769
+ "logits/rejected": -2.852121353149414,
1770
+ "logps/chosen": -323.82940673828125,
1771
+ "logps/rejected": -270.3510437011719,
1772
+ "loss": 0.3703,
1773
+ "rewards/accuracies": 0.8062499761581421,
1774
+ "rewards/chosen": -0.7247987985610962,
1775
+ "rewards/margins": 1.512142539024353,
1776
+ "rewards/rejected": -2.236941337585449,
1777
+ "step": 1230
1778
+ },
1779
+ {
1780
+ "epoch": 2.56,
1781
+ "learning_rate": 3.2849503437738733e-06,
1782
+ "logits/chosen": -2.842057466506958,
1783
+ "logits/rejected": -2.855686902999878,
1784
+ "logps/chosen": -275.85418701171875,
1785
+ "logps/rejected": -251.3174591064453,
1786
+ "loss": 0.3878,
1787
+ "rewards/accuracies": 0.84375,
1788
+ "rewards/chosen": -0.4698792099952698,
1789
+ "rewards/margins": 1.7577711343765259,
1790
+ "rewards/rejected": -2.2276504039764404,
1791
+ "step": 1240
1792
+ },
1793
+ {
1794
+ "epoch": 2.58,
1795
+ "learning_rate": 3.1321619556913678e-06,
1796
+ "logits/chosen": -2.842163562774658,
1797
+ "logits/rejected": -2.8198578357696533,
1798
+ "logps/chosen": -293.8838195800781,
1799
+ "logps/rejected": -260.05194091796875,
1800
+ "loss": 0.3441,
1801
+ "rewards/accuracies": 0.84375,
1802
+ "rewards/chosen": -0.48028483986854553,
1803
+ "rewards/margins": 1.8665335178375244,
1804
+ "rewards/rejected": -2.346818447113037,
1805
+ "step": 1250
1806
+ },
1807
+ {
1808
+ "epoch": 2.6,
1809
+ "learning_rate": 2.979373567608862e-06,
1810
+ "logits/chosen": -2.835662364959717,
1811
+ "logits/rejected": -2.8839526176452637,
1812
+ "logps/chosen": -257.24334716796875,
1813
+ "logps/rejected": -238.25588989257812,
1814
+ "loss": 0.368,
1815
+ "rewards/accuracies": 0.824999988079071,
1816
+ "rewards/chosen": -0.6586586236953735,
1817
+ "rewards/margins": 1.7249507904052734,
1818
+ "rewards/rejected": -2.3836092948913574,
1819
+ "step": 1260
1820
+ },
1821
+ {
1822
+ "epoch": 2.62,
1823
+ "learning_rate": 2.826585179526356e-06,
1824
+ "logits/chosen": -2.7938365936279297,
1825
+ "logits/rejected": -2.767805576324463,
1826
+ "logps/chosen": -249.7235107421875,
1827
+ "logps/rejected": -234.31283569335938,
1828
+ "loss": 0.369,
1829
+ "rewards/accuracies": 0.8125,
1830
+ "rewards/chosen": -0.7289739847183228,
1831
+ "rewards/margins": 1.7214456796646118,
1832
+ "rewards/rejected": -2.4504194259643555,
1833
+ "step": 1270
1834
+ },
1835
+ {
1836
+ "epoch": 2.64,
1837
+ "learning_rate": 2.673796791443851e-06,
1838
+ "logits/chosen": -2.7239506244659424,
1839
+ "logits/rejected": -2.751171588897705,
1840
+ "logps/chosen": -329.4594421386719,
1841
+ "logps/rejected": -260.408935546875,
1842
+ "loss": 0.3679,
1843
+ "rewards/accuracies": 0.90625,
1844
+ "rewards/chosen": -0.6046456694602966,
1845
+ "rewards/margins": 2.0207152366638184,
1846
+ "rewards/rejected": -2.6253609657287598,
1847
+ "step": 1280
1848
+ },
1849
+ {
1850
+ "epoch": 2.66,
1851
+ "learning_rate": 2.521008403361345e-06,
1852
+ "logits/chosen": -2.797870397567749,
1853
+ "logits/rejected": -2.8259756565093994,
1854
+ "logps/chosen": -305.6458435058594,
1855
+ "logps/rejected": -244.69601440429688,
1856
+ "loss": 0.3668,
1857
+ "rewards/accuracies": 0.8374999761581421,
1858
+ "rewards/chosen": -0.6542243957519531,
1859
+ "rewards/margins": 1.900058388710022,
1860
+ "rewards/rejected": -2.5542826652526855,
1861
+ "step": 1290
1862
+ },
1863
+ {
1864
+ "epoch": 2.68,
1865
+ "learning_rate": 2.368220015278839e-06,
1866
+ "logits/chosen": -2.7966253757476807,
1867
+ "logits/rejected": -2.8371598720550537,
1868
+ "logps/chosen": -281.93939208984375,
1869
+ "logps/rejected": -261.60150146484375,
1870
+ "loss": 0.3348,
1871
+ "rewards/accuracies": 0.793749988079071,
1872
+ "rewards/chosen": -0.5757613182067871,
1873
+ "rewards/margins": 1.6483405828475952,
1874
+ "rewards/rejected": -2.224102020263672,
1875
+ "step": 1300
1876
+ },
1877
+ {
1878
+ "epoch": 2.7,
1879
+ "learning_rate": 2.2154316271963334e-06,
1880
+ "logits/chosen": -2.8347060680389404,
1881
+ "logits/rejected": -2.818382501602173,
1882
+ "logps/chosen": -266.34979248046875,
1883
+ "logps/rejected": -248.8013153076172,
1884
+ "loss": 0.3448,
1885
+ "rewards/accuracies": 0.8687499761581421,
1886
+ "rewards/chosen": -0.5766314268112183,
1887
+ "rewards/margins": 1.5551669597625732,
1888
+ "rewards/rejected": -2.131798505783081,
1889
+ "step": 1310
1890
+ },
1891
+ {
1892
+ "epoch": 2.72,
1893
+ "learning_rate": 2.0626432391138275e-06,
1894
+ "logits/chosen": -2.780651569366455,
1895
+ "logits/rejected": -2.7655837535858154,
1896
+ "logps/chosen": -317.9383544921875,
1897
+ "logps/rejected": -282.8804931640625,
1898
+ "loss": 0.3784,
1899
+ "rewards/accuracies": 0.875,
1900
+ "rewards/chosen": -0.5049678683280945,
1901
+ "rewards/margins": 1.9375810623168945,
1902
+ "rewards/rejected": -2.442549228668213,
1903
+ "step": 1320
1904
+ },
1905
+ {
1906
+ "epoch": 2.74,
1907
+ "learning_rate": 1.9098548510313215e-06,
1908
+ "logits/chosen": -2.8032174110412598,
1909
+ "logits/rejected": -2.7949814796447754,
1910
+ "logps/chosen": -260.9715576171875,
1911
+ "logps/rejected": -260.29913330078125,
1912
+ "loss": 0.3628,
1913
+ "rewards/accuracies": 0.8500000238418579,
1914
+ "rewards/chosen": -0.6280083060264587,
1915
+ "rewards/margins": 1.837847113609314,
1916
+ "rewards/rejected": -2.465855360031128,
1917
+ "step": 1330
1918
+ },
1919
+ {
1920
+ "epoch": 2.76,
1921
+ "learning_rate": 1.757066462948816e-06,
1922
+ "logits/chosen": -2.8185904026031494,
1923
+ "logits/rejected": -2.7385153770446777,
1924
+ "logps/chosen": -278.1485290527344,
1925
+ "logps/rejected": -278.1205139160156,
1926
+ "loss": 0.3636,
1927
+ "rewards/accuracies": 0.862500011920929,
1928
+ "rewards/chosen": -0.6104816198348999,
1929
+ "rewards/margins": 2.0218350887298584,
1930
+ "rewards/rejected": -2.6323161125183105,
1931
+ "step": 1340
1932
+ },
1933
+ {
1934
+ "epoch": 2.78,
1935
+ "learning_rate": 1.6042780748663103e-06,
1936
+ "logits/chosen": -2.8218092918395996,
1937
+ "logits/rejected": -2.7967371940612793,
1938
+ "logps/chosen": -297.7966613769531,
1939
+ "logps/rejected": -240.7419891357422,
1940
+ "loss": 0.3763,
1941
+ "rewards/accuracies": 0.7875000238418579,
1942
+ "rewards/chosen": -0.5741168260574341,
1943
+ "rewards/margins": 1.761366605758667,
1944
+ "rewards/rejected": -2.3354835510253906,
1945
+ "step": 1350
1946
+ },
1947
+ {
1948
+ "epoch": 2.8,
1949
+ "learning_rate": 1.4514896867838045e-06,
1950
+ "logits/chosen": -2.8747620582580566,
1951
+ "logits/rejected": -2.818427562713623,
1952
+ "logps/chosen": -280.43756103515625,
1953
+ "logps/rejected": -234.6200714111328,
1954
+ "loss": 0.3859,
1955
+ "rewards/accuracies": 0.8687499761581421,
1956
+ "rewards/chosen": -0.5531826019287109,
1957
+ "rewards/margins": 1.6504430770874023,
1958
+ "rewards/rejected": -2.2036256790161133,
1959
+ "step": 1360
1960
+ },
1961
+ {
1962
+ "epoch": 2.82,
1963
+ "learning_rate": 1.2987012987012986e-06,
1964
+ "logits/chosen": -2.8544864654541016,
1965
+ "logits/rejected": -2.8276607990264893,
1966
+ "logps/chosen": -298.3465881347656,
1967
+ "logps/rejected": -288.8724365234375,
1968
+ "loss": 0.3595,
1969
+ "rewards/accuracies": 0.8187500238418579,
1970
+ "rewards/chosen": -0.5492029786109924,
1971
+ "rewards/margins": 1.664731740951538,
1972
+ "rewards/rejected": -2.2139344215393066,
1973
+ "step": 1370
1974
+ },
1975
+ {
1976
+ "epoch": 2.85,
1977
+ "learning_rate": 1.145912910618793e-06,
1978
+ "logits/chosen": -2.7500030994415283,
1979
+ "logits/rejected": -2.7478601932525635,
1980
+ "logps/chosen": -245.6803741455078,
1981
+ "logps/rejected": -244.2843475341797,
1982
+ "loss": 0.3457,
1983
+ "rewards/accuracies": 0.831250011920929,
1984
+ "rewards/chosen": -0.5498741269111633,
1985
+ "rewards/margins": 1.8164262771606445,
1986
+ "rewards/rejected": -2.366300344467163,
1987
+ "step": 1380
1988
+ },
1989
+ {
1990
+ "epoch": 2.87,
1991
+ "learning_rate": 9.931245225362874e-07,
1992
+ "logits/chosen": -2.78765869140625,
1993
+ "logits/rejected": -2.7899107933044434,
1994
+ "logps/chosen": -269.9845886230469,
1995
+ "logps/rejected": -260.02557373046875,
1996
+ "loss": 0.366,
1997
+ "rewards/accuracies": 0.8500000238418579,
1998
+ "rewards/chosen": -0.5584810376167297,
1999
+ "rewards/margins": 1.836942434310913,
2000
+ "rewards/rejected": -2.395423412322998,
2001
+ "step": 1390
2002
+ },
2003
+ {
2004
+ "epoch": 2.89,
2005
+ "learning_rate": 8.403361344537816e-07,
2006
+ "logits/chosen": -2.822734832763672,
2007
+ "logits/rejected": -2.7889413833618164,
2008
+ "logps/chosen": -295.0702819824219,
2009
+ "logps/rejected": -263.552978515625,
2010
+ "loss": 0.3664,
2011
+ "rewards/accuracies": 0.7875000238418579,
2012
+ "rewards/chosen": -0.6814947128295898,
2013
+ "rewards/margins": 1.600311040878296,
2014
+ "rewards/rejected": -2.2818057537078857,
2015
+ "step": 1400
2016
+ },
2017
+ {
2018
+ "epoch": 2.91,
2019
+ "learning_rate": 6.875477463712758e-07,
2020
+ "logits/chosen": -2.7872085571289062,
2021
+ "logits/rejected": -2.813737392425537,
2022
+ "logps/chosen": -271.7468566894531,
2023
+ "logps/rejected": -254.4673309326172,
2024
+ "loss": 0.3564,
2025
+ "rewards/accuracies": 0.8500000238418579,
2026
+ "rewards/chosen": -0.44644269347190857,
2027
+ "rewards/margins": 1.852097511291504,
2028
+ "rewards/rejected": -2.298539876937866,
2029
+ "step": 1410
2030
+ },
2031
+ {
2032
+ "epoch": 2.93,
2033
+ "learning_rate": 5.347593582887701e-07,
2034
+ "logits/chosen": -2.8677477836608887,
2035
+ "logits/rejected": -2.8729989528656006,
2036
+ "logps/chosen": -321.89447021484375,
2037
+ "logps/rejected": -266.3446350097656,
2038
+ "loss": 0.3638,
2039
+ "rewards/accuracies": 0.8125,
2040
+ "rewards/chosen": -0.6643561124801636,
2041
+ "rewards/margins": 1.5856568813323975,
2042
+ "rewards/rejected": -2.2500128746032715,
2043
+ "step": 1420
2044
+ },
2045
+ {
2046
+ "epoch": 2.95,
2047
+ "learning_rate": 3.819709702062643e-07,
2048
+ "logits/chosen": -2.8617660999298096,
2049
+ "logits/rejected": -2.8304145336151123,
2050
+ "logps/chosen": -289.00482177734375,
2051
+ "logps/rejected": -255.83004760742188,
2052
+ "loss": 0.3575,
2053
+ "rewards/accuracies": 0.8374999761581421,
2054
+ "rewards/chosen": -0.6355488300323486,
2055
+ "rewards/margins": 1.5412448644638062,
2056
+ "rewards/rejected": -2.1767935752868652,
2057
+ "step": 1430
2058
+ },
2059
+ {
2060
+ "epoch": 2.97,
2061
+ "learning_rate": 2.2918258212375862e-07,
2062
+ "logits/chosen": -2.813324451446533,
2063
+ "logits/rejected": -2.822589635848999,
2064
+ "logps/chosen": -270.77374267578125,
2065
+ "logps/rejected": -270.3390808105469,
2066
+ "loss": 0.3543,
2067
+ "rewards/accuracies": 0.8999999761581421,
2068
+ "rewards/chosen": -0.6036959886550903,
2069
+ "rewards/margins": 1.9311864376068115,
2070
+ "rewards/rejected": -2.5348825454711914,
2071
+ "step": 1440
+ },
+ {
+ "epoch": 2.99,
+ "learning_rate": 7.639419404125288e-08,
+ "logits/chosen": -2.8171768188476562,
+ "logits/rejected": -2.779967784881592,
+ "logps/chosen": -303.8172912597656,
+ "logps/rejected": -260.3236389160156,
+ "loss": 0.3549,
+ "rewards/accuracies": 0.831250011920929,
+ "rewards/chosen": -0.6331970691680908,
+ "rewards/margins": 1.8079423904418945,
+ "rewards/rejected": -2.4411392211914062,
+ "step": 1450
+ },
+ {
+ "epoch": 3.0,
+ "eval_logits/chosen": -2.842160701751709,
+ "eval_logits/rejected": -2.833141326904297,
+ "eval_logps/chosen": -279.5245056152344,
+ "eval_logps/rejected": -246.69915771484375,
+ "eval_loss": 0.45531293749809265,
+ "eval_rewards/accuracies": 0.8359375,
+ "eval_rewards/chosen": -0.5876308083534241,
+ "eval_rewards/margins": 1.6034575700759888,
+ "eval_rewards/rejected": -2.1910881996154785,
+ "eval_runtime": 254.2995,
+ "eval_samples_per_second": 7.865,
+ "eval_steps_per_second": 0.063,
+ "step": 1455
+ },
+ {
+ "epoch": 3.0,
+ "step": 1455,
  "total_flos": 0.0,
+ "train_loss": 0.43281792414557074,
+ "train_runtime": 46468.4841,
+ "train_samples_per_second": 4.001,
  "train_steps_per_second": 0.031
  }
  ],
  "logging_steps": 10,
+ "max_steps": 1455,
+ "num_train_epochs": 3,
  "save_steps": 500,
  "total_flos": 0.0,
  "trial_name": null,