jikaixuan commited on
Commit
44e3e84
1 Parent(s): 1b1d13d

Model save

Browse files
README.md CHANGED
@@ -2,13 +2,9 @@
2
  license: apache-2.0
3
  library_name: peft
4
  tags:
5
- - alignment-handbook
6
- - generated_from_trainer
7
  - trl
8
  - dpo
9
  - generated_from_trainer
10
- datasets:
11
- - HuggingFaceH4/ultrafeedback_binarized
12
  base_model: mistralai/Mistral-7B-v0.1
13
  model-index:
14
  - name: zephyr-7b-dpo-qlora
@@ -20,17 +16,17 @@ should probably proofread and complete it, then remove this comment. -->
20
 
21
  # zephyr-7b-dpo-qlora
22
 
23
- This model is a fine-tuned version of [alignment-handbook/zephyr-7b-sft-qlora](https://huggingface.co/alignment-handbook/zephyr-7b-sft-qlora) on the HuggingFaceH4/ultrafeedback_binarized dataset.
24
  It achieves the following results on the evaluation set:
25
- - Loss: 2121.0452
26
- - Rewards/chosen: 0.0578
27
- - Rewards/rejected: -0.0912
28
- - Rewards/accuracies: 0.7599
29
- - Rewards/margins: 0.1490
30
- - Logps/rejected: -253.8891
31
- - Logps/chosen: -259.2458
32
- - Logits/rejected: -2.2028
33
- - Logits/chosen: -2.2552
34
 
35
  ## Model description
36
 
@@ -67,10 +63,10 @@ The following hyperparameters were used during training:
67
 
68
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
69
  |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
70
- | 2149.4746 | 0.21 | 100 | 2190.7666 | 0.0445 | -0.0848 | 0.7460 | 0.1293 | -253.2523 | -260.5782 | -2.1770 | -2.2229 |
71
- | 2105.1256 | 0.42 | 200 | 2151.1555 | 0.0543 | -0.0961 | 0.7599 | 0.1504 | -254.3840 | -259.5941 | -2.2074 | -2.2603 |
72
- | 2135.4973 | 0.63 | 300 | 2129.0896 | 0.0626 | -0.0799 | 0.7560 | 0.1425 | -252.7585 | -258.7624 | -2.2232 | -2.2765 |
73
- | 2099.8018 | 0.84 | 400 | 2121.6672 | 0.0538 | -0.0959 | 0.7540 | 0.1497 | -254.3591 | -259.6440 | -2.2016 | -2.2541 |
74
 
75
 
76
  ### Framework versions
 
2
  license: apache-2.0
3
  library_name: peft
4
  tags:
 
 
5
  - trl
6
  - dpo
7
  - generated_from_trainer
 
 
8
  base_model: mistralai/Mistral-7B-v0.1
9
  model-index:
10
  - name: zephyr-7b-dpo-qlora
 
16
 
17
  # zephyr-7b-dpo-qlora
18
 
19
+ This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on the None dataset.
20
  It achieves the following results on the evaluation set:
21
+ - Loss: 1721.1201
22
+ - Rewards/chosen: -0.0627
23
+ - Rewards/rejected: -0.2250
24
+ - Rewards/accuracies: 0.7738
25
+ - Rewards/margins: 0.1623
26
+ - Logps/rejected: -267.2721
27
+ - Logps/chosen: -271.2979
28
+ - Logits/rejected: -2.0354
29
+ - Logits/chosen: -2.0918
30
 
31
  ## Model description
32
 
 
63
 
64
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
65
  |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
66
+ | 1797.9404 | 0.21 | 100 | 1887.4103 | 0.0131 | -0.1197 | 0.7520 | 0.1328 | -256.7424 | -263.7133 | -2.1486 | -2.1969 |
67
+ | 1700.9055 | 0.42 | 200 | 1784.6598 | -0.0464 | -0.2062 | 0.7619 | 0.1598 | -265.3905 | -269.6655 | -2.1081 | -2.1618 |
68
+ | 1767.2219 | 0.63 | 300 | 1735.5183 | -0.0467 | -0.2001 | 0.7698 | 0.1534 | -264.7795 | -269.6956 | -2.1057 | -2.1587 |
69
+ | 1717.4336 | 0.84 | 400 | 1721.6765 | -0.0691 | -0.2309 | 0.7718 | 0.1618 | -267.8569 | -271.9333 | -2.0322 | -2.0885 |
70
 
71
 
72
  ### Framework versions
adapter_config.json CHANGED
@@ -19,13 +19,13 @@
19
  "rank_pattern": {},
20
  "revision": null,
21
  "target_modules": [
22
- "up_proj",
23
- "v_proj",
24
  "q_proj",
25
  "gate_proj",
 
 
26
  "down_proj",
27
  "o_proj",
28
- "k_proj"
29
  ],
30
  "task_type": "CAUSAL_LM"
31
  }
 
19
  "rank_pattern": {},
20
  "revision": null,
21
  "target_modules": [
 
 
22
  "q_proj",
23
  "gate_proj",
24
+ "up_proj",
25
+ "k_proj",
26
  "down_proj",
27
  "o_proj",
28
+ "v_proj"
29
  ],
30
  "task_type": "CAUSAL_LM"
31
  }
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c6f7d1a95415a6c03799926b1b3b0647c3602207bcc4fb5c48fa957c5b2fea04
3
  size 671150064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58513bf1529e315eda3b88d4c9cacb2897ba3fd8a6c935b6b16975253aa6b856
3
  size 671150064
all_results.json CHANGED
@@ -1,21 +1,21 @@
1
  {
2
  "epoch": 1.0,
3
- "eval_logits/chosen": -2.255185127258301,
4
- "eval_logits/rejected": -2.2027812004089355,
5
- "eval_logps/chosen": -259.245849609375,
6
- "eval_logps/rejected": -253.8891143798828,
7
- "eval_loss": 2121.045166015625,
8
- "eval_rewards/accuracies": 0.7599206566810608,
9
- "eval_rewards/chosen": 0.057787273079156876,
10
- "eval_rewards/margins": 0.14896924793720245,
11
- "eval_rewards/rejected": -0.09118196368217468,
12
- "eval_runtime": 547.839,
13
  "eval_samples": 2000,
14
- "eval_samples_per_second": 3.651,
15
  "eval_steps_per_second": 0.115,
16
- "train_loss": 2164.5614415454666,
17
- "train_runtime": 32346.8016,
18
  "train_samples": 61135,
19
- "train_samples_per_second": 1.89,
20
  "train_steps_per_second": 0.015
21
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "eval_logits/chosen": -2.0918362140655518,
4
+ "eval_logits/rejected": -2.03544020652771,
5
+ "eval_logps/chosen": -271.2979431152344,
6
+ "eval_logps/rejected": -267.2720642089844,
7
+ "eval_loss": 1721.1201171875,
8
+ "eval_rewards/accuracies": 0.773809552192688,
9
+ "eval_rewards/chosen": -0.06273359060287476,
10
+ "eval_rewards/margins": 0.16227789223194122,
11
+ "eval_rewards/rejected": -0.22501146793365479,
12
+ "eval_runtime": 548.8776,
13
  "eval_samples": 2000,
14
+ "eval_samples_per_second": 3.644,
15
  "eval_steps_per_second": 0.115,
16
+ "train_loss": 1826.8015694608227,
17
+ "train_runtime": 32379.7062,
18
  "train_samples": 61135,
19
+ "train_samples_per_second": 1.888,
20
  "train_steps_per_second": 0.015
21
  }
eval_results.json CHANGED
@@ -1,16 +1,16 @@
1
  {
2
  "epoch": 1.0,
3
- "eval_logits/chosen": -2.255185127258301,
4
- "eval_logits/rejected": -2.2027812004089355,
5
- "eval_logps/chosen": -259.245849609375,
6
- "eval_logps/rejected": -253.8891143798828,
7
- "eval_loss": 2121.045166015625,
8
- "eval_rewards/accuracies": 0.7599206566810608,
9
- "eval_rewards/chosen": 0.057787273079156876,
10
- "eval_rewards/margins": 0.14896924793720245,
11
- "eval_rewards/rejected": -0.09118196368217468,
12
- "eval_runtime": 547.839,
13
  "eval_samples": 2000,
14
- "eval_samples_per_second": 3.651,
15
  "eval_steps_per_second": 0.115
16
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "eval_logits/chosen": -2.0918362140655518,
4
+ "eval_logits/rejected": -2.03544020652771,
5
+ "eval_logps/chosen": -271.2979431152344,
6
+ "eval_logps/rejected": -267.2720642089844,
7
+ "eval_loss": 1721.1201171875,
8
+ "eval_rewards/accuracies": 0.773809552192688,
9
+ "eval_rewards/chosen": -0.06273359060287476,
10
+ "eval_rewards/margins": 0.16227789223194122,
11
+ "eval_rewards/rejected": -0.22501146793365479,
12
+ "eval_runtime": 548.8776,
13
  "eval_samples": 2000,
14
+ "eval_samples_per_second": 3.644,
15
  "eval_steps_per_second": 0.115
16
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "train_loss": 2164.5614415454666,
4
- "train_runtime": 32346.8016,
5
  "train_samples": 61135,
6
- "train_samples_per_second": 1.89,
7
  "train_steps_per_second": 0.015
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "train_loss": 1826.8015694608227,
4
+ "train_runtime": 32379.7062,
5
  "train_samples": 61135,
6
+ "train_samples_per_second": 1.888,
7
  "train_steps_per_second": 0.015
8
  }
trainer_state.json CHANGED
@@ -25,732 +25,732 @@
25
  {
26
  "epoch": 0.02,
27
  "learning_rate": 1.0416666666666667e-06,
28
- "logits/chosen": -2.5851330757141113,
29
- "logits/rejected": -2.6188478469848633,
30
- "logps/chosen": -265.6952209472656,
31
- "logps/rejected": -261.4213562011719,
32
- "loss": 2495.385,
33
- "rewards/accuracies": 0.4375,
34
- "rewards/chosen": 0.005977082531899214,
35
- "rewards/margins": 0.0005994850071147084,
36
- "rewards/rejected": 0.005377596709877253,
37
  "step": 10
38
  },
39
  {
40
  "epoch": 0.04,
41
  "learning_rate": 2.0833333333333334e-06,
42
- "logits/chosen": -2.6101512908935547,
43
- "logits/rejected": -2.5939109325408936,
44
- "logps/chosen": -255.68185424804688,
45
- "logps/rejected": -248.1254119873047,
46
- "loss": 2457.86,
47
- "rewards/accuracies": 0.628125011920929,
48
- "rewards/chosen": 0.013690793886780739,
49
- "rewards/margins": 0.00916606467217207,
50
- "rewards/rejected": 0.004524729214608669,
51
  "step": 20
52
  },
53
  {
54
  "epoch": 0.06,
55
  "learning_rate": 3.125e-06,
56
- "logits/chosen": -2.604323148727417,
57
- "logits/rejected": -2.598053455352783,
58
- "logps/chosen": -254.423095703125,
59
- "logps/rejected": -226.73153686523438,
60
- "loss": 2402.3988,
61
- "rewards/accuracies": 0.703125,
62
- "rewards/chosen": 0.01266755722463131,
63
- "rewards/margins": 0.024019470438361168,
64
- "rewards/rejected": -0.01135191135108471,
65
  "step": 30
66
  },
67
  {
68
  "epoch": 0.08,
69
  "learning_rate": 4.166666666666667e-06,
70
- "logits/chosen": -2.6043972969055176,
71
- "logits/rejected": -2.582412004470825,
72
- "logps/chosen": -279.12042236328125,
73
- "logps/rejected": -241.2065887451172,
74
- "loss": 2290.4264,
75
  "rewards/accuracies": 0.6968749761581421,
76
- "rewards/chosen": 0.024520257487893105,
77
- "rewards/margins": 0.0557018406689167,
78
- "rewards/rejected": -0.031181585043668747,
79
  "step": 40
80
  },
81
  {
82
  "epoch": 0.1,
83
  "learning_rate": 4.999731868769027e-06,
84
- "logits/chosen": -2.531161308288574,
85
- "logits/rejected": -2.5264387130737305,
86
- "logps/chosen": -252.51846313476562,
87
- "logps/rejected": -247.7227325439453,
88
- "loss": 2291.9322,
89
- "rewards/accuracies": 0.6812499761581421,
90
- "rewards/chosen": 0.029673133045434952,
91
- "rewards/margins": 0.08245684206485748,
92
- "rewards/rejected": -0.05278371647000313,
93
  "step": 50
94
  },
95
  {
96
  "epoch": 0.13,
97
  "learning_rate": 4.9903533134293035e-06,
98
- "logits/chosen": -2.545037031173706,
99
- "logits/rejected": -2.5416412353515625,
100
- "logps/chosen": -260.83905029296875,
101
- "logps/rejected": -239.8417205810547,
102
- "loss": 2269.9371,
103
- "rewards/accuracies": 0.6937500238418579,
104
- "rewards/chosen": 0.03231300041079521,
105
- "rewards/margins": 0.09112317860126495,
106
- "rewards/rejected": -0.05881017446517944,
107
  "step": 60
108
  },
109
  {
110
  "epoch": 0.15,
111
  "learning_rate": 4.967625656594782e-06,
112
- "logits/chosen": -2.5832419395446777,
113
- "logits/rejected": -2.564356565475464,
114
- "logps/chosen": -275.95452880859375,
115
- "logps/rejected": -264.7611083984375,
116
- "loss": 2236.1113,
117
- "rewards/accuracies": 0.6968749761581421,
118
- "rewards/chosen": 0.036882974207401276,
119
- "rewards/margins": 0.08578891307115555,
120
- "rewards/rejected": -0.048905935138463974,
121
  "step": 70
122
  },
123
  {
124
  "epoch": 0.17,
125
  "learning_rate": 4.93167072587771e-06,
126
- "logits/chosen": -2.552919864654541,
127
- "logits/rejected": -2.524970293045044,
128
- "logps/chosen": -257.78448486328125,
129
- "logps/rejected": -262.3812561035156,
130
- "loss": 2220.0893,
131
- "rewards/accuracies": 0.737500011920929,
132
- "rewards/chosen": 0.037375591695308685,
133
- "rewards/margins": 0.11339374631643295,
134
- "rewards/rejected": -0.07601816952228546,
135
  "step": 80
136
  },
137
  {
138
  "epoch": 0.19,
139
  "learning_rate": 4.882681251368549e-06,
140
- "logits/chosen": -2.56257963180542,
141
- "logits/rejected": -2.5289363861083984,
142
- "logps/chosen": -239.4860382080078,
143
- "logps/rejected": -252.36196899414062,
144
- "loss": 2167.3848,
145
- "rewards/accuracies": 0.7250000238418579,
146
- "rewards/chosen": 0.04182355850934982,
147
- "rewards/margins": 0.10886694490909576,
148
- "rewards/rejected": -0.06704337894916534,
149
  "step": 90
150
  },
151
  {
152
  "epoch": 0.21,
153
  "learning_rate": 4.8209198325401815e-06,
154
- "logits/chosen": -2.5551962852478027,
155
- "logits/rejected": -2.562063455581665,
156
- "logps/chosen": -266.8739013671875,
157
- "logps/rejected": -269.649169921875,
158
- "loss": 2149.4746,
159
  "rewards/accuracies": 0.762499988079071,
160
- "rewards/chosen": 0.04759662598371506,
161
- "rewards/margins": 0.1307816356420517,
162
- "rewards/rejected": -0.08318501710891724,
163
  "step": 100
164
  },
165
  {
166
  "epoch": 0.21,
167
- "eval_logits/chosen": -2.222931385040283,
168
- "eval_logits/rejected": -2.1770126819610596,
169
- "eval_logps/chosen": -260.57818603515625,
170
- "eval_logps/rejected": -253.25228881835938,
171
- "eval_loss": 2190.7666015625,
172
- "eval_rewards/accuracies": 0.7460317611694336,
173
- "eval_rewards/chosen": 0.044464047998189926,
174
- "eval_rewards/margins": 0.12927772104740143,
175
- "eval_rewards/rejected": -0.0848136618733406,
176
- "eval_runtime": 549.355,
177
- "eval_samples_per_second": 3.641,
178
  "eval_steps_per_second": 0.115,
179
  "step": 100
180
  },
181
  {
182
  "epoch": 0.23,
183
  "learning_rate": 4.746717530629565e-06,
184
- "logits/chosen": -2.5229454040527344,
185
- "logits/rejected": -2.5105621814727783,
186
- "logps/chosen": -261.46649169921875,
187
- "logps/rejected": -256.37835693359375,
188
- "loss": 2174.1184,
189
- "rewards/accuracies": 0.746874988079071,
190
- "rewards/chosen": 0.03517655283212662,
191
- "rewards/margins": 0.11897265911102295,
192
- "rewards/rejected": -0.08379611372947693,
193
  "step": 110
194
  },
195
  {
196
  "epoch": 0.25,
197
  "learning_rate": 4.660472094042121e-06,
198
- "logits/chosen": -2.5114097595214844,
199
- "logits/rejected": -2.481840133666992,
200
- "logps/chosen": -246.70370483398438,
201
- "logps/rejected": -238.27621459960938,
202
- "loss": 2181.3053,
203
- "rewards/accuracies": 0.699999988079071,
204
- "rewards/chosen": 0.044524095952510834,
205
- "rewards/margins": 0.10293309390544891,
206
- "rewards/rejected": -0.05840899422764778,
207
  "step": 120
208
  },
209
  {
210
  "epoch": 0.27,
211
  "learning_rate": 4.5626458262912745e-06,
212
- "logits/chosen": -2.4726600646972656,
213
- "logits/rejected": -2.46514630317688,
214
- "logps/chosen": -271.7862548828125,
215
- "logps/rejected": -260.61676025390625,
216
- "loss": 2175.3252,
217
- "rewards/accuracies": 0.762499988079071,
218
- "rewards/chosen": 0.06200919300317764,
219
- "rewards/margins": 0.12613125145435333,
220
- "rewards/rejected": -0.06412206590175629,
221
  "step": 130
222
  },
223
  {
224
  "epoch": 0.29,
225
  "learning_rate": 4.453763107901676e-06,
226
- "logits/chosen": -2.506436586380005,
227
- "logits/rejected": -2.5005128383636475,
228
- "logps/chosen": -237.8655242919922,
229
- "logps/rejected": -249.9298553466797,
230
- "loss": 2167.2516,
231
  "rewards/accuracies": 0.734375,
232
- "rewards/chosen": 0.024008702486753464,
233
- "rewards/margins": 0.1495535969734192,
234
- "rewards/rejected": -0.12554487586021423,
235
  "step": 140
236
  },
237
  {
238
  "epoch": 0.31,
239
  "learning_rate": 4.33440758555951e-06,
240
- "logits/chosen": -2.5227842330932617,
241
- "logits/rejected": -2.536785364151001,
242
- "logps/chosen": -260.7518005371094,
243
- "logps/rejected": -235.9630889892578,
244
- "loss": 2119.4062,
245
- "rewards/accuracies": 0.715624988079071,
246
- "rewards/chosen": 0.04733316972851753,
247
- "rewards/margins": 0.12345732748508453,
248
- "rewards/rejected": -0.0761241465806961,
249
  "step": 150
250
  },
251
  {
252
  "epoch": 0.33,
253
  "learning_rate": 4.205219043576955e-06,
254
- "logits/chosen": -2.5534234046936035,
255
- "logits/rejected": -2.4914207458496094,
256
- "logps/chosen": -254.14065551757812,
257
- "logps/rejected": -250.95700073242188,
258
- "loss": 2114.7645,
259
- "rewards/accuracies": 0.778124988079071,
260
- "rewards/chosen": 0.06031092256307602,
261
- "rewards/margins": 0.15202957391738892,
262
- "rewards/rejected": -0.09171866625547409,
263
  "step": 160
264
  },
265
  {
266
  "epoch": 0.36,
267
  "learning_rate": 4.066889974440757e-06,
268
- "logits/chosen": -2.5092320442199707,
269
- "logits/rejected": -2.4965577125549316,
270
- "logps/chosen": -254.91439819335938,
271
- "logps/rejected": -242.8040008544922,
272
- "loss": 2229.8135,
273
- "rewards/accuracies": 0.675000011920929,
274
- "rewards/chosen": 0.034448813647031784,
275
- "rewards/margins": 0.12951095402240753,
276
- "rewards/rejected": -0.09506212174892426,
277
  "step": 170
278
  },
279
  {
280
  "epoch": 0.38,
281
  "learning_rate": 3.92016186682789e-06,
282
- "logits/chosen": -2.521221399307251,
283
- "logits/rejected": -2.533686399459839,
284
- "logps/chosen": -251.4235382080078,
285
- "logps/rejected": -259.76220703125,
286
- "loss": 2175.5213,
287
  "rewards/accuracies": 0.721875011920929,
288
- "rewards/chosen": 0.04062749817967415,
289
- "rewards/margins": 0.120635487139225,
290
- "rewards/rejected": -0.08000798523426056,
291
  "step": 180
292
  },
293
  {
294
  "epoch": 0.4,
295
  "learning_rate": 3.7658212309857576e-06,
296
- "logits/chosen": -2.5192363262176514,
297
- "logits/rejected": -2.4917151927948,
298
- "logps/chosen": -255.2060089111328,
299
- "logps/rejected": -250.82022094726562,
300
- "loss": 2099.443,
301
- "rewards/accuracies": 0.715624988079071,
302
- "rewards/chosen": 0.0492943711578846,
303
- "rewards/margins": 0.14053165912628174,
304
- "rewards/rejected": -0.09123729914426804,
305
  "step": 190
306
  },
307
  {
308
  "epoch": 0.42,
309
  "learning_rate": 3.604695382782159e-06,
310
- "logits/chosen": -2.5251801013946533,
311
- "logits/rejected": -2.5034642219543457,
312
- "logps/chosen": -269.3675537109375,
313
- "logps/rejected": -262.86376953125,
314
- "loss": 2105.1256,
315
- "rewards/accuracies": 0.7562500238418579,
316
- "rewards/chosen": 0.0575677752494812,
317
- "rewards/margins": 0.14340198040008545,
318
- "rewards/rejected": -0.08583419024944305,
319
  "step": 200
320
  },
321
  {
322
  "epoch": 0.42,
323
- "eval_logits/chosen": -2.260270833969116,
324
- "eval_logits/rejected": -2.2073864936828613,
325
- "eval_logps/chosen": -259.5941467285156,
326
- "eval_logps/rejected": -254.3839874267578,
327
- "eval_loss": 2151.155517578125,
328
- "eval_rewards/accuracies": 0.7599206566810608,
329
- "eval_rewards/chosen": 0.05430443957448006,
330
- "eval_rewards/margins": 0.15043501555919647,
331
- "eval_rewards/rejected": -0.09613056480884552,
332
- "eval_runtime": 548.195,
333
- "eval_samples_per_second": 3.648,
334
  "eval_steps_per_second": 0.115,
335
  "step": 200
336
  },
337
  {
338
  "epoch": 0.44,
339
  "learning_rate": 3.437648009023905e-06,
340
- "logits/chosen": -2.533383369445801,
341
- "logits/rejected": -2.4935860633850098,
342
- "logps/chosen": -243.6236114501953,
343
- "logps/rejected": -238.85140991210938,
344
- "loss": 2145.5416,
345
- "rewards/accuracies": 0.7437499761581421,
346
- "rewards/chosen": 0.06410142779350281,
347
- "rewards/margins": 0.14374245703220367,
348
- "rewards/rejected": -0.07964102178812027,
349
  "step": 210
350
  },
351
  {
352
  "epoch": 0.46,
353
  "learning_rate": 3.265574537815398e-06,
354
- "logits/chosen": -2.554565906524658,
355
- "logits/rejected": -2.56289005279541,
356
- "logps/chosen": -277.4061584472656,
357
- "logps/rejected": -253.40048217773438,
358
- "loss": 2196.8484,
359
- "rewards/accuracies": 0.675000011920929,
360
- "rewards/chosen": 0.052330613136291504,
361
- "rewards/margins": 0.11339585483074188,
362
- "rewards/rejected": -0.06106524541974068,
363
  "step": 220
364
  },
365
  {
366
  "epoch": 0.48,
367
  "learning_rate": 3.089397338773569e-06,
368
- "logits/chosen": -2.4857611656188965,
369
- "logits/rejected": -2.473193407058716,
370
- "logps/chosen": -247.3427276611328,
371
- "logps/rejected": -241.8627471923828,
372
- "loss": 2160.1729,
373
- "rewards/accuracies": 0.7124999761581421,
374
- "rewards/chosen": 0.03845102712512016,
375
- "rewards/margins": 0.11976752430200577,
376
- "rewards/rejected": -0.0813164934515953,
377
  "step": 230
378
  },
379
  {
380
  "epoch": 0.5,
381
  "learning_rate": 2.9100607788275547e-06,
382
- "logits/chosen": -2.5121560096740723,
383
- "logits/rejected": -2.516338586807251,
384
- "logps/chosen": -257.1769714355469,
385
- "logps/rejected": -247.3695068359375,
386
- "loss": 2185.7641,
387
- "rewards/accuracies": 0.684374988079071,
388
- "rewards/chosen": 0.0379050187766552,
389
- "rewards/margins": 0.11140499264001846,
390
- "rewards/rejected": -0.07349997013807297,
391
  "step": 240
392
  },
393
  {
394
  "epoch": 0.52,
395
  "learning_rate": 2.72852616010567e-06,
396
- "logits/chosen": -2.5092978477478027,
397
- "logits/rejected": -2.487090826034546,
398
- "logps/chosen": -264.5955505371094,
399
- "logps/rejected": -246.3382110595703,
400
- "loss": 2136.6197,
401
- "rewards/accuracies": 0.7406250238418579,
402
- "rewards/chosen": 0.039962492883205414,
403
- "rewards/margins": 0.1403963267803192,
404
- "rewards/rejected": -0.1004338413476944,
405
  "step": 250
406
  },
407
  {
408
  "epoch": 0.54,
409
  "learning_rate": 2.5457665670441937e-06,
410
- "logits/chosen": -2.5069711208343506,
411
- "logits/rejected": -2.5030505657196045,
412
- "logps/chosen": -257.4859619140625,
413
- "logps/rejected": -231.91958618164062,
414
- "loss": 2085.2795,
415
- "rewards/accuracies": 0.762499988079071,
416
- "rewards/chosen": 0.05723271518945694,
417
- "rewards/margins": 0.15024301409721375,
418
- "rewards/rejected": -0.0930103212594986,
419
  "step": 260
420
  },
421
  {
422
  "epoch": 0.57,
423
  "learning_rate": 2.3627616503391813e-06,
424
- "logits/chosen": -2.525665760040283,
425
- "logits/rejected": -2.5043163299560547,
426
- "logps/chosen": -280.7471618652344,
427
- "logps/rejected": -267.36712646484375,
428
- "loss": 2089.859,
429
- "rewards/accuracies": 0.737500011920929,
430
- "rewards/chosen": 0.05569761246442795,
431
- "rewards/margins": 0.179846853017807,
432
- "rewards/rejected": -0.12414924055337906,
433
  "step": 270
434
  },
435
  {
436
  "epoch": 0.59,
437
  "learning_rate": 2.1804923757009885e-06,
438
- "logits/chosen": -2.500837564468384,
439
- "logits/rejected": -2.501950740814209,
440
- "logps/chosen": -270.04193115234375,
441
- "logps/rejected": -248.61978149414062,
442
- "loss": 2111.6906,
443
- "rewards/accuracies": 0.7124999761581421,
444
- "rewards/chosen": 0.05320361256599426,
445
- "rewards/margins": 0.1410333216190338,
446
- "rewards/rejected": -0.08782971650362015,
447
  "step": 280
448
  },
449
  {
450
  "epoch": 0.61,
451
  "learning_rate": 1.9999357655598894e-06,
452
- "logits/chosen": -2.5122292041778564,
453
- "logits/rejected": -2.50368070602417,
454
- "logps/chosen": -258.72686767578125,
455
- "logps/rejected": -256.91387939453125,
456
- "loss": 2137.0592,
457
- "rewards/accuracies": 0.737500011920929,
458
- "rewards/chosen": 0.053160279989242554,
459
- "rewards/margins": 0.15454119443893433,
460
- "rewards/rejected": -0.10138092190027237,
461
  "step": 290
462
  },
463
  {
464
  "epoch": 0.63,
465
  "learning_rate": 1.8220596619089576e-06,
466
- "logits/chosen": -2.471623659133911,
467
- "logits/rejected": -2.4690403938293457,
468
- "logps/chosen": -246.51766967773438,
469
- "logps/rejected": -251.79257202148438,
470
- "loss": 2135.4973,
471
- "rewards/accuracies": 0.71875,
472
- "rewards/chosen": 0.0453377440571785,
473
- "rewards/margins": 0.12641170620918274,
474
- "rewards/rejected": -0.08107397705316544,
475
  "step": 300
476
  },
477
  {
478
  "epoch": 0.63,
479
- "eval_logits/chosen": -2.2764506340026855,
480
- "eval_logits/rejected": -2.2231767177581787,
481
- "eval_logps/chosen": -258.7624206542969,
482
- "eval_logps/rejected": -252.75852966308594,
483
- "eval_loss": 2129.089599609375,
484
- "eval_rewards/accuracies": 0.7559523582458496,
485
- "eval_rewards/chosen": 0.06262180209159851,
486
- "eval_rewards/margins": 0.14249789714813232,
487
- "eval_rewards/rejected": -0.07987607270479202,
488
- "eval_runtime": 547.9938,
489
- "eval_samples_per_second": 3.65,
490
  "eval_steps_per_second": 0.115,
491
  "step": 300
492
  },
493
  {
494
  "epoch": 0.65,
495
  "learning_rate": 1.647817538357072e-06,
496
- "logits/chosen": -2.5041086673736572,
497
- "logits/rejected": -2.495436191558838,
498
- "logps/chosen": -264.5109558105469,
499
- "logps/rejected": -248.3275604248047,
500
- "loss": 2107.123,
501
- "rewards/accuracies": 0.731249988079071,
502
- "rewards/chosen": 0.05480458214879036,
503
- "rewards/margins": 0.13964474201202393,
504
- "rewards/rejected": -0.08484016358852386,
505
  "step": 310
506
  },
507
  {
508
  "epoch": 0.67,
509
  "learning_rate": 1.4781433892011132e-06,
510
- "logits/chosen": -2.53191876411438,
511
- "logits/rejected": -2.4989166259765625,
512
- "logps/chosen": -242.36599731445312,
513
- "logps/rejected": -243.78067016601562,
514
- "loss": 2076.0621,
515
- "rewards/accuracies": 0.7718750238418579,
516
- "rewards/chosen": 0.05456935614347458,
517
- "rewards/margins": 0.14978976547718048,
518
- "rewards/rejected": -0.0952204093337059,
519
  "step": 320
520
  },
521
  {
522
  "epoch": 0.69,
523
  "learning_rate": 1.3139467229135999e-06,
524
- "logits/chosen": -2.4768006801605225,
525
- "logits/rejected": -2.4569873809814453,
526
- "logps/chosen": -263.0523681640625,
527
- "logps/rejected": -250.5469207763672,
528
- "loss": 2112.1141,
529
- "rewards/accuracies": 0.734375,
530
- "rewards/chosen": 0.044828929007053375,
531
- "rewards/margins": 0.13050048053264618,
532
- "rewards/rejected": -0.0856715738773346,
533
  "step": 330
534
  },
535
  {
536
  "epoch": 0.71,
537
  "learning_rate": 1.1561076868822756e-06,
538
- "logits/chosen": -2.5158028602600098,
539
- "logits/rejected": -2.5096983909606934,
540
- "logps/chosen": -275.6848449707031,
541
- "logps/rejected": -246.7259979248047,
542
- "loss": 2151.2445,
543
- "rewards/accuracies": 0.746874988079071,
544
- "rewards/chosen": 0.052164845168590546,
545
- "rewards/margins": 0.15314052999019623,
546
- "rewards/rejected": -0.10097566992044449,
547
  "step": 340
548
  },
549
  {
550
  "epoch": 0.73,
551
  "learning_rate": 1.0054723495346484e-06,
552
- "logits/chosen": -2.518799304962158,
553
- "logits/rejected": -2.4620516300201416,
554
- "logps/chosen": -249.27401733398438,
555
- "logps/rejected": -218.7183074951172,
556
- "loss": 2093.9803,
557
- "rewards/accuracies": 0.7593749761581421,
558
- "rewards/chosen": 0.0662151575088501,
559
- "rewards/margins": 0.14556364715099335,
560
- "rewards/rejected": -0.07934850454330444,
561
  "step": 350
562
  },
563
  {
564
  "epoch": 0.75,
565
  "learning_rate": 8.628481651367876e-07,
566
- "logits/chosen": -2.5340943336486816,
567
- "logits/rejected": -2.5006654262542725,
568
- "logps/chosen": -260.32464599609375,
569
- "logps/rejected": -237.3218536376953,
570
- "loss": 2094.1246,
571
- "rewards/accuracies": 0.765625,
572
- "rewards/chosen": 0.05396001785993576,
573
- "rewards/margins": 0.15317106246948242,
574
- "rewards/rejected": -0.09921105206012726,
575
  "step": 360
576
  },
577
  {
578
  "epoch": 0.77,
579
  "learning_rate": 7.289996455765749e-07,
580
- "logits/chosen": -2.529265880584717,
581
- "logits/rejected": -2.515712261199951,
582
- "logps/chosen": -266.943115234375,
583
- "logps/rejected": -246.0579376220703,
584
- "loss": 2115.357,
585
- "rewards/accuracies": 0.71875,
586
- "rewards/chosen": 0.052078358829021454,
587
- "rewards/margins": 0.1462351232767105,
588
- "rewards/rejected": -0.09415675699710846,
589
  "step": 370
590
  },
591
  {
592
  "epoch": 0.8,
593
  "learning_rate": 6.046442623320145e-07,
594
- "logits/chosen": -2.4891440868377686,
595
- "logits/rejected": -2.499753952026367,
596
- "logps/chosen": -253.51632690429688,
597
- "logps/rejected": -245.4505615234375,
598
- "loss": 2082.182,
599
- "rewards/accuracies": 0.7406250238418579,
600
- "rewards/chosen": 0.051686953753232956,
601
- "rewards/margins": 0.1390691101551056,
602
- "rewards/rejected": -0.08738215267658234,
603
  "step": 380
604
  },
605
  {
606
  "epoch": 0.82,
607
  "learning_rate": 4.904486005914027e-07,
608
- "logits/chosen": -2.532160997390747,
609
- "logits/rejected": -2.5001654624938965,
610
- "logps/chosen": -280.9754333496094,
611
- "logps/rejected": -279.0588684082031,
612
- "loss": 2114.3043,
613
- "rewards/accuracies": 0.7593749761581421,
614
- "rewards/chosen": 0.0547635443508625,
615
- "rewards/margins": 0.14076778292655945,
616
- "rewards/rejected": -0.08600424975156784,
617
  "step": 390
618
  },
619
  {
620
  "epoch": 0.84,
621
  "learning_rate": 3.8702478614051353e-07,
622
- "logits/chosen": -2.4791765213012695,
623
- "logits/rejected": -2.4799935817718506,
624
- "logps/chosen": -246.14102172851562,
625
- "logps/rejected": -251.533447265625,
626
- "loss": 2099.8018,
627
- "rewards/accuracies": 0.703125,
628
- "rewards/chosen": 0.0392024889588356,
629
- "rewards/margins": 0.13221651315689087,
630
- "rewards/rejected": -0.09301402419805527,
631
  "step": 400
632
  },
633
  {
634
  "epoch": 0.84,
635
- "eval_logits/chosen": -2.254145860671997,
636
- "eval_logits/rejected": -2.2016360759735107,
637
- "eval_logps/chosen": -259.64398193359375,
638
- "eval_logps/rejected": -254.3590850830078,
639
- "eval_loss": 2121.667236328125,
640
- "eval_rewards/accuracies": 0.7539682388305664,
641
- "eval_rewards/chosen": 0.05380600318312645,
642
- "eval_rewards/margins": 0.14968746900558472,
643
- "eval_rewards/rejected": -0.09588146954774857,
644
- "eval_runtime": 547.9727,
645
- "eval_samples_per_second": 3.65,
646
  "eval_steps_per_second": 0.115,
647
  "step": 400
648
  },
649
  {
650
  "epoch": 0.86,
651
  "learning_rate": 2.9492720416985004e-07,
652
- "logits/chosen": -2.4832329750061035,
653
- "logits/rejected": -2.463463306427002,
654
- "logps/chosen": -284.7741394042969,
655
- "logps/rejected": -252.4269561767578,
656
- "loss": 2145.448,
657
- "rewards/accuracies": 0.7250000238418579,
658
- "rewards/chosen": 0.05263269692659378,
659
- "rewards/margins": 0.15021036565303802,
660
- "rewards/rejected": -0.09757767617702484,
661
  "step": 410
662
  },
663
  {
664
  "epoch": 0.88,
665
  "learning_rate": 2.1464952759020857e-07,
666
- "logits/chosen": -2.4804348945617676,
667
- "logits/rejected": -2.457764148712158,
668
- "logps/chosen": -254.78604125976562,
669
- "logps/rejected": -278.61346435546875,
670
- "loss": 2123.6629,
671
- "rewards/accuracies": 0.6968749761581421,
672
- "rewards/chosen": 0.033899884670972824,
673
- "rewards/margins": 0.11116783320903778,
674
- "rewards/rejected": -0.07726795971393585,
675
  "step": 420
676
  },
677
  {
678
  "epoch": 0.9,
679
  "learning_rate": 1.4662207078575685e-07,
680
- "logits/chosen": -2.4848549365997314,
681
- "logits/rejected": -2.485640048980713,
682
- "logps/chosen": -268.3457336425781,
683
- "logps/rejected": -268.5885925292969,
684
- "loss": 2144.4309,
685
- "rewards/accuracies": 0.721875011920929,
686
- "rewards/chosen": 0.03841588646173477,
687
- "rewards/margins": 0.13024446368217468,
688
- "rewards/rejected": -0.09182857722043991,
689
  "step": 430
690
  },
691
  {
692
  "epoch": 0.92,
693
  "learning_rate": 9.120948298936422e-08,
694
- "logits/chosen": -2.457054615020752,
695
- "logits/rejected": -2.4329726696014404,
696
- "logps/chosen": -231.9584197998047,
697
- "logps/rejected": -234.6277313232422,
698
- "loss": 2118.3984,
699
- "rewards/accuracies": 0.737500011920929,
700
- "rewards/chosen": 0.038600482046604156,
701
- "rewards/margins": 0.13669805228710175,
702
- "rewards/rejected": -0.09809757024049759,
703
  "step": 440
704
  },
705
  {
706
  "epoch": 0.94,
707
  "learning_rate": 4.870879364444109e-08,
708
- "logits/chosen": -2.5156655311584473,
709
- "logits/rejected": -2.563300848007202,
710
- "logps/chosen": -263.9936218261719,
711
- "logps/rejected": -265.6227722167969,
712
- "loss": 2123.5402,
713
- "rewards/accuracies": 0.7124999761581421,
714
- "rewards/chosen": 0.04902677983045578,
715
- "rewards/margins": 0.1260160207748413,
716
- "rewards/rejected": -0.07698923349380493,
717
  "step": 450
718
  },
719
  {
720
  "epoch": 0.96,
721
  "learning_rate": 1.93478202307823e-08,
722
- "logits/chosen": -2.470996379852295,
723
- "logits/rejected": -2.4720451831817627,
724
- "logps/chosen": -258.21734619140625,
725
- "logps/rejected": -262.04925537109375,
726
- "loss": 2078.5094,
727
- "rewards/accuracies": 0.7281249761581421,
728
- "rewards/chosen": 0.04391016811132431,
729
- "rewards/margins": 0.14817874133586884,
730
- "rewards/rejected": -0.10426857322454453,
731
  "step": 460
732
  },
733
  {
734
  "epoch": 0.98,
735
  "learning_rate": 3.283947088983663e-09,
736
- "logits/chosen": -2.513140916824341,
737
- "logits/rejected": -2.535651206970215,
738
- "logps/chosen": -249.6727752685547,
739
- "logps/rejected": -248.2782745361328,
740
- "loss": 2093.2779,
741
- "rewards/accuracies": 0.75,
742
- "rewards/chosen": 0.04741714522242546,
743
- "rewards/margins": 0.143958181142807,
744
- "rewards/rejected": -0.09654103964567184,
745
  "step": 470
746
  },
747
  {
748
  "epoch": 1.0,
749
  "step": 477,
750
  "total_flos": 0.0,
751
- "train_loss": 2164.5614415454666,
752
- "train_runtime": 32346.8016,
753
- "train_samples_per_second": 1.89,
754
  "train_steps_per_second": 0.015
755
  }
756
  ],
 
25
  {
26
  "epoch": 0.02,
27
  "learning_rate": 1.0416666666666667e-06,
28
+ "logits/chosen": -2.585383176803589,
29
+ "logits/rejected": -2.6190898418426514,
30
+ "logps/chosen": -265.6199035644531,
31
+ "logps/rejected": -261.3590393066406,
32
+ "loss": 2489.4685,
33
+ "rewards/accuracies": 0.4548611044883728,
34
+ "rewards/chosen": 0.006730278953909874,
35
+ "rewards/margins": 0.0007296364055946469,
36
+ "rewards/rejected": 0.006000642664730549,
37
  "step": 10
38
  },
39
  {
40
  "epoch": 0.04,
41
  "learning_rate": 2.0833333333333334e-06,
42
+ "logits/chosen": -2.616151809692383,
43
+ "logits/rejected": -2.599904775619507,
44
+ "logps/chosen": -253.3858184814453,
45
+ "logps/rejected": -245.82345581054688,
46
+ "loss": 2411.3754,
47
+ "rewards/accuracies": 0.6000000238418579,
48
+ "rewards/chosen": 0.036651305854320526,
49
+ "rewards/margins": 0.009106594137847424,
50
+ "rewards/rejected": 0.02754470705986023,
51
  "step": 20
52
  },
53
  {
54
  "epoch": 0.06,
55
  "learning_rate": 3.125e-06,
56
+ "logits/chosen": -2.617845058441162,
57
+ "logits/rejected": -2.6118521690368652,
58
+ "logps/chosen": -250.7469482421875,
59
+ "logps/rejected": -223.05172729492188,
60
+ "loss": 2306.1311,
61
+ "rewards/accuracies": 0.671875,
62
+ "rewards/chosen": 0.04942930489778519,
63
+ "rewards/margins": 0.023983022198081017,
64
+ "rewards/rejected": 0.02544628083705902,
65
  "step": 30
66
  },
67
  {
68
  "epoch": 0.08,
69
  "learning_rate": 4.166666666666667e-06,
70
+ "logits/chosen": -2.6323208808898926,
71
+ "logits/rejected": -2.608524799346924,
72
+ "logps/chosen": -276.45947265625,
73
+ "logps/rejected": -238.35391235351562,
74
+ "loss": 2100.6182,
75
  "rewards/accuracies": 0.6968749761581421,
76
+ "rewards/chosen": 0.05112973973155022,
77
+ "rewards/margins": 0.05378426983952522,
78
+ "rewards/rejected": -0.002654529409483075,
79
  "step": 40
80
  },
81
  {
82
  "epoch": 0.1,
83
  "learning_rate": 4.999731868769027e-06,
84
+ "logits/chosen": -2.552873373031616,
85
+ "logits/rejected": -2.5477213859558105,
86
+ "logps/chosen": -253.2111358642578,
87
+ "logps/rejected": -248.1074676513672,
88
+ "loss": 2103.8223,
89
+ "rewards/accuracies": 0.6781250238418579,
90
+ "rewards/chosen": 0.022746428847312927,
91
+ "rewards/margins": 0.07937721163034439,
92
+ "rewards/rejected": -0.05663077160716057,
93
  "step": 50
94
  },
95
  {
96
  "epoch": 0.13,
97
  "learning_rate": 4.9903533134293035e-06,
98
+ "logits/chosen": -2.556926727294922,
99
+ "logits/rejected": -2.551504611968994,
100
+ "logps/chosen": -261.6982116699219,
101
+ "logps/rejected": -240.27059936523438,
102
+ "loss": 2054.3434,
103
+ "rewards/accuracies": 0.6781250238418579,
104
+ "rewards/chosen": 0.023721303790807724,
105
+ "rewards/margins": 0.08682042360305786,
106
+ "rewards/rejected": -0.06309913098812103,
107
  "step": 60
108
  },
109
  {
110
  "epoch": 0.15,
111
  "learning_rate": 4.967625656594782e-06,
112
+ "logits/chosen": -2.5740597248077393,
113
+ "logits/rejected": -2.553145408630371,
114
+ "logps/chosen": -278.0965270996094,
115
+ "logps/rejected": -267.19586181640625,
116
+ "loss": 1971.1375,
117
+ "rewards/accuracies": 0.6875,
118
+ "rewards/chosen": 0.015462947078049183,
119
+ "rewards/margins": 0.08871600031852722,
120
+ "rewards/rejected": -0.07325305044651031,
121
  "step": 70
122
  },
123
  {
124
  "epoch": 0.17,
125
  "learning_rate": 4.93167072587771e-06,
126
+ "logits/chosen": -2.5298993587493896,
127
+ "logits/rejected": -2.5009925365448,
128
+ "logps/chosen": -258.5903015136719,
129
+ "logps/rejected": -263.52850341796875,
130
+ "loss": 1933.0076,
131
+ "rewards/accuracies": 0.731249988079071,
132
+ "rewards/chosen": 0.029317494481801987,
133
+ "rewards/margins": 0.11680855602025986,
134
+ "rewards/rejected": -0.08749105781316757,
135
  "step": 80
136
  },
137
  {
138
  "epoch": 0.19,
139
  "learning_rate": 4.882681251368549e-06,
140
+ "logits/chosen": -2.5273003578186035,
141
+ "logits/rejected": -2.493241548538208,
142
+ "logps/chosen": -247.47506713867188,
143
+ "logps/rejected": -260.76678466796875,
144
+ "loss": 1845.5582,
145
+ "rewards/accuracies": 0.7281249761581421,
146
+ "rewards/chosen": -0.03806694597005844,
147
+ "rewards/margins": 0.11302463710308075,
148
+ "rewards/rejected": -0.15109160542488098,
149
  "step": 90
150
  },
151
  {
152
  "epoch": 0.21,
153
  "learning_rate": 4.8209198325401815e-06,
154
+ "logits/chosen": -2.5287628173828125,
155
+ "logits/rejected": -2.5358829498291016,
156
+ "logps/chosen": -272.0884704589844,
157
+ "logps/rejected": -275.2580871582031,
158
+ "loss": 1797.9404,
159
  "rewards/accuracies": 0.762499988079071,
160
+ "rewards/chosen": -0.0045492262579500675,
161
+ "rewards/margins": 0.13472509384155273,
162
+ "rewards/rejected": -0.13927432894706726,
163
  "step": 100
164
  },
165
  {
166
  "epoch": 0.21,
167
+ "eval_logits/chosen": -2.196876287460327,
168
+ "eval_logits/rejected": -2.1486356258392334,
169
+ "eval_logps/chosen": -263.71331787109375,
170
+ "eval_logps/rejected": -256.7424011230469,
171
+ "eval_loss": 1887.4102783203125,
172
+ "eval_rewards/accuracies": 0.7519841194152832,
173
+ "eval_rewards/chosen": 0.013112416490912437,
174
+ "eval_rewards/margins": 0.13282696902751923,
175
+ "eval_rewards/rejected": -0.11971456557512283,
176
+ "eval_runtime": 549.9966,
177
+ "eval_samples_per_second": 3.636,
178
  "eval_steps_per_second": 0.115,
179
  "step": 100
180
  },
181
  {
182
  "epoch": 0.23,
183
  "learning_rate": 4.746717530629565e-06,
184
+ "logits/chosen": -2.480510711669922,
185
+ "logits/rejected": -2.4668211936950684,
186
+ "logps/chosen": -267.04180908203125,
187
+ "logps/rejected": -262.5838317871094,
188
+ "loss": 1870.9051,
189
+ "rewards/accuracies": 0.734375,
190
+ "rewards/chosen": -0.020576762035489082,
191
+ "rewards/margins": 0.125274196267128,
192
+ "rewards/rejected": -0.14585095643997192,
193
  "step": 110
194
  },
195
  {
196
  "epoch": 0.25,
197
  "learning_rate": 4.660472094042121e-06,
198
+ "logits/chosen": -2.44077205657959,
199
+ "logits/rejected": -2.4053845405578613,
200
+ "logps/chosen": -256.12939453125,
201
+ "logps/rejected": -248.28060913085938,
202
+ "loss": 1855.318,
203
+ "rewards/accuracies": 0.684374988079071,
204
+ "rewards/chosen": -0.049732744693756104,
205
+ "rewards/margins": 0.10872016102075577,
206
+ "rewards/rejected": -0.15845291316509247,
207
  "step": 120
208
  },
209
  {
210
  "epoch": 0.27,
211
  "learning_rate": 4.5626458262912745e-06,
212
+ "logits/chosen": -2.395805597305298,
213
+ "logits/rejected": -2.383305311203003,
214
+ "logps/chosen": -280.74053955078125,
215
+ "logps/rejected": -270.37860107421875,
216
+ "loss": 1811.4148,
217
+ "rewards/accuracies": 0.7437499761581421,
218
+ "rewards/chosen": -0.027533594518899918,
219
+ "rewards/margins": 0.1342071145772934,
220
+ "rewards/rejected": -0.1617407202720642,
221
  "step": 130
222
  },
223
  {
224
  "epoch": 0.29,
225
  "learning_rate": 4.453763107901676e-06,
226
+ "logits/chosen": -2.4485344886779785,
227
+ "logits/rejected": -2.43884015083313,
228
+ "logps/chosen": -243.1454315185547,
229
+ "logps/rejected": -255.15432739257812,
230
+ "loss": 1803.225,
231
  "rewards/accuracies": 0.734375,
232
+ "rewards/chosen": -0.028790492564439774,
233
+ "rewards/margins": 0.1489991694688797,
234
+ "rewards/rejected": -0.17778967320919037,
235
  "step": 140
236
  },
237
  {
238
  "epoch": 0.31,
239
  "learning_rate": 4.33440758555951e-06,
240
+ "logits/chosen": -2.459658622741699,
241
+ "logits/rejected": -2.483065605163574,
242
+ "logps/chosen": -267.7740478515625,
243
+ "logps/rejected": -243.34609985351562,
244
+ "loss": 1781.1752,
245
+ "rewards/accuracies": 0.699999988079071,
246
+ "rewards/chosen": -0.02288922667503357,
247
+ "rewards/margins": 0.12706486880779266,
248
+ "rewards/rejected": -0.14995409548282623,
249
  "step": 150
250
  },
251
  {
252
  "epoch": 0.33,
253
  "learning_rate": 4.205219043576955e-06,
254
+ "logits/chosen": -2.483583688735962,
255
+ "logits/rejected": -2.4244942665100098,
256
+ "logps/chosen": -260.3743896484375,
257
+ "logps/rejected": -258.7478332519531,
258
+ "loss": 1754.5766,
259
+ "rewards/accuracies": 0.784375011920929,
260
+ "rewards/chosen": -0.0020265295170247555,
261
+ "rewards/margins": 0.16760031878948212,
262
+ "rewards/rejected": -0.169626846909523,
263
  "step": 160
264
  },
265
  {
266
  "epoch": 0.36,
267
  "learning_rate": 4.066889974440757e-06,
268
+ "logits/chosen": -2.4374189376831055,
269
+ "logits/rejected": -2.428433656692505,
270
+ "logps/chosen": -264.5699768066406,
271
+ "logps/rejected": -252.79421997070312,
272
+ "loss": 1953.8818,
273
+ "rewards/accuracies": 0.668749988079071,
274
+ "rewards/chosen": -0.06210694834589958,
275
+ "rewards/margins": 0.13285748660564423,
276
+ "rewards/rejected": -0.1949644386768341,
277
  "step": 170
278
  },
279
  {
280
  "epoch": 0.38,
281
  "learning_rate": 3.92016186682789e-06,
282
+ "logits/chosen": -2.467085361480713,
283
+ "logits/rejected": -2.487204074859619,
284
+ "logps/chosen": -262.995361328125,
285
+ "logps/rejected": -271.94183349609375,
286
+ "loss": 1848.9945,
287
  "rewards/accuracies": 0.721875011920929,
288
+ "rewards/chosen": -0.07509048283100128,
289
+ "rewards/margins": 0.12671387195587158,
290
+ "rewards/rejected": -0.20180435478687286,
291
  "step": 180
292
  },
293
  {
294
  "epoch": 0.4,
295
  "learning_rate": 3.7658212309857576e-06,
296
+ "logits/chosen": -2.450601816177368,
297
+ "logits/rejected": -2.4304168224334717,
298
+ "logps/chosen": -269.1886901855469,
299
+ "logps/rejected": -265.7490539550781,
300
+ "loss": 1698.6666,
301
+ "rewards/accuracies": 0.746874988079071,
302
+ "rewards/chosen": -0.09053254127502441,
303
+ "rewards/margins": 0.14999321103096008,
304
+ "rewards/rejected": -0.2405257225036621,
305
  "step": 190
306
  },
307
  {
308
  "epoch": 0.42,
309
  "learning_rate": 3.604695382782159e-06,
310
+ "logits/chosen": -2.447007179260254,
311
+ "logits/rejected": -2.419039726257324,
312
+ "logps/chosen": -282.8253479003906,
313
+ "logps/rejected": -278.14508056640625,
314
+ "loss": 1700.9055,
315
+ "rewards/accuracies": 0.784375011920929,
316
+ "rewards/chosen": -0.07701022177934647,
317
+ "rewards/margins": 0.16163742542266846,
318
+ "rewards/rejected": -0.23864765465259552,
319
  "step": 200
320
  },
321
  {
322
  "epoch": 0.42,
323
+ "eval_logits/chosen": -2.161839485168457,
324
+ "eval_logits/rejected": -2.1081268787384033,
325
+ "eval_logps/chosen": -269.66546630859375,
326
+ "eval_logps/rejected": -265.3905029296875,
327
+ "eval_loss": 1784.6597900390625,
328
+ "eval_rewards/accuracies": 0.761904776096344,
329
+ "eval_rewards/chosen": -0.0464087538421154,
330
+ "eval_rewards/margins": 0.15978708863258362,
331
+ "eval_rewards/rejected": -0.2061958611011505,
332
+ "eval_runtime": 549.0189,
333
+ "eval_samples_per_second": 3.643,
334
  "eval_steps_per_second": 0.115,
335
  "step": 200
336
  },
337
  {
338
  "epoch": 0.44,
339
  "learning_rate": 3.437648009023905e-06,
340
+ "logits/chosen": -2.458688259124756,
341
+ "logits/rejected": -2.4217796325683594,
342
+ "logps/chosen": -252.5647430419922,
343
+ "logps/rejected": -248.326416015625,
344
+ "loss": 1806.0594,
345
+ "rewards/accuracies": 0.7562500238418579,
346
+ "rewards/chosen": -0.02530970238149166,
347
+ "rewards/margins": 0.14908090233802795,
348
+ "rewards/rejected": -0.17439061403274536,
349
  "step": 210
350
  },
351
  {
352
  "epoch": 0.46,
353
  "learning_rate": 3.265574537815398e-06,
354
+ "logits/chosen": -2.4742610454559326,
355
+ "logits/rejected": -2.4789376258850098,
356
+ "logps/chosen": -286.0444030761719,
357
+ "logps/rejected": -261.9767150878906,
358
+ "loss": 1855.6273,
359
+ "rewards/accuracies": 0.706250011920929,
360
+ "rewards/chosen": -0.03405206650495529,
361
+ "rewards/margins": 0.11277566105127335,
362
+ "rewards/rejected": -0.14682772755622864,
363
  "step": 220
364
  },
365
  {
366
  "epoch": 0.48,
367
  "learning_rate": 3.089397338773569e-06,
368
+ "logits/chosen": -2.38773775100708,
369
+ "logits/rejected": -2.3718185424804688,
370
+ "logps/chosen": -257.7181701660156,
371
+ "logps/rejected": -253.3428955078125,
372
+ "loss": 1797.9486,
373
+ "rewards/accuracies": 0.721875011920929,
374
+ "rewards/chosen": -0.06530335545539856,
375
+ "rewards/margins": 0.1308148354291916,
376
+ "rewards/rejected": -0.19611820578575134,
377
  "step": 230
378
  },
379
  {
380
  "epoch": 0.5,
381
  "learning_rate": 2.9100607788275547e-06,
382
+ "logits/chosen": -2.4125852584838867,
383
+ "logits/rejected": -2.414628267288208,
384
+ "logps/chosen": -265.2156066894531,
385
+ "logps/rejected": -257.0289001464844,
386
+ "loss": 1850.0729,
387
+ "rewards/accuracies": 0.690625011920929,
388
+ "rewards/chosen": -0.04248107224702835,
389
+ "rewards/margins": 0.12761279940605164,
390
+ "rewards/rejected": -0.17009387910366058,
391
  "step": 240
392
  },
393
  {
394
  "epoch": 0.52,
395
  "learning_rate": 2.72852616010567e-06,
396
+ "logits/chosen": -2.4339253902435303,
397
+ "logits/rejected": -2.4054951667785645,
398
+ "logps/chosen": -271.8371276855469,
399
+ "logps/rejected": -255.33438110351562,
400
+ "loss": 1766.1885,
401
+ "rewards/accuracies": 0.753125011920929,
402
+ "rewards/chosen": -0.0324532687664032,
403
+ "rewards/margins": 0.1579422652721405,
404
+ "rewards/rejected": -0.1903955340385437,
405
  "step": 250
406
  },
407
  {
408
  "epoch": 0.54,
409
  "learning_rate": 2.5457665670441937e-06,
410
+ "logits/chosen": -2.4216437339782715,
411
+ "logits/rejected": -2.4156367778778076,
412
+ "logps/chosen": -266.7996520996094,
413
+ "logps/rejected": -243.180419921875,
414
+ "loss": 1710.8809,
415
+ "rewards/accuracies": 0.753125011920929,
416
+ "rewards/chosen": -0.035904210060834885,
417
+ "rewards/margins": 0.16971439123153687,
418
+ "rewards/rejected": -0.20561861991882324,
419
  "step": 260
420
  },
421
  {
422
  "epoch": 0.57,
423
  "learning_rate": 2.3627616503391813e-06,
424
+ "logits/chosen": -2.4438915252685547,
425
+ "logits/rejected": -2.416748285293579,
426
+ "logps/chosen": -290.58453369140625,
427
+ "logps/rejected": -277.0739440917969,
428
+ "loss": 1714.5062,
429
+ "rewards/accuracies": 0.765625,
430
+ "rewards/chosen": -0.042676471173763275,
431
+ "rewards/margins": 0.17854078114032745,
432
+ "rewards/rejected": -0.22121724486351013,
433
  "step": 270
434
  },
435
  {
436
  "epoch": 0.59,
437
  "learning_rate": 2.1804923757009885e-06,
438
+ "logits/chosen": -2.414602756500244,
439
+ "logits/rejected": -2.4200820922851562,
440
+ "logps/chosen": -282.95147705078125,
441
+ "logps/rejected": -261.1886291503906,
442
+ "loss": 1764.4607,
443
+ "rewards/accuracies": 0.7437499761581421,
444
+ "rewards/chosen": -0.07589195668697357,
445
+ "rewards/margins": 0.13762618601322174,
446
+ "rewards/rejected": -0.2135181427001953,
447
  "step": 280
448
  },
449
  {
450
  "epoch": 0.61,
451
  "learning_rate": 1.9999357655598894e-06,
452
+ "logits/chosen": -2.430169105529785,
453
+ "logits/rejected": -2.4057881832122803,
454
+ "logps/chosen": -265.06805419921875,
455
+ "logps/rejected": -263.2739562988281,
456
+ "loss": 1786.2846,
457
+ "rewards/accuracies": 0.746874988079071,
458
+ "rewards/chosen": -0.010251840576529503,
459
+ "rewards/margins": 0.15472975373268127,
460
+ "rewards/rejected": -0.16498157382011414,
461
  "step": 290
462
  },
463
  {
464
  "epoch": 0.63,
465
  "learning_rate": 1.8220596619089576e-06,
466
+ "logits/chosen": -2.392138957977295,
467
+ "logits/rejected": -2.3823294639587402,
468
+ "logps/chosen": -255.75393676757812,
469
+ "logps/rejected": -261.84271240234375,
470
+ "loss": 1767.2219,
471
+ "rewards/accuracies": 0.737500011920929,
472
+ "rewards/chosen": -0.04702477902173996,
473
+ "rewards/margins": 0.13455010950565338,
474
+ "rewards/rejected": -0.18157489597797394,
475
  "step": 300
476
  },
477
  {
478
  "epoch": 0.63,
479
+ "eval_logits/chosen": -2.158698797225952,
480
+ "eval_logits/rejected": -2.1057095527648926,
481
+ "eval_logps/chosen": -269.6955871582031,
482
+ "eval_logps/rejected": -264.77947998046875,
483
+ "eval_loss": 1735.518310546875,
484
+ "eval_rewards/accuracies": 0.7698412537574768,
485
+ "eval_rewards/chosen": -0.04671022295951843,
486
+ "eval_rewards/margins": 0.15337513387203217,
487
+ "eval_rewards/rejected": -0.2000853717327118,
488
+ "eval_runtime": 548.7136,
489
+ "eval_samples_per_second": 3.645,
490
  "eval_steps_per_second": 0.115,
491
  "step": 300
492
  },
493
  {
494
  "epoch": 0.65,
495
  "learning_rate": 1.647817538357072e-06,
496
+ "logits/chosen": -2.4140188694000244,
497
+ "logits/rejected": -2.4031002521514893,
498
+ "logps/chosen": -274.3767395019531,
499
+ "logps/rejected": -259.40155029296875,
500
+ "loss": 1673.9693,
501
+ "rewards/accuracies": 0.746874988079071,
502
+ "rewards/chosen": -0.04385297745466232,
503
+ "rewards/margins": 0.1517268717288971,
504
+ "rewards/rejected": -0.19557985663414001,
505
  "step": 310
506
  },
507
  {
508
  "epoch": 0.67,
509
  "learning_rate": 1.4781433892011132e-06,
510
+ "logits/chosen": -2.416640520095825,
511
+ "logits/rejected": -2.370535135269165,
512
+ "logps/chosen": -252.216064453125,
513
+ "logps/rejected": -255.1393280029297,
514
+ "loss": 1673.8594,
515
+ "rewards/accuracies": 0.7593749761581421,
516
+ "rewards/chosen": -0.043931327760219574,
517
+ "rewards/margins": 0.16487570106983185,
518
+ "rewards/rejected": -0.20880703628063202,
519
  "step": 320
520
  },
521
  {
522
  "epoch": 0.69,
523
  "learning_rate": 1.3139467229135999e-06,
524
+ "logits/chosen": -2.362358570098877,
525
+ "logits/rejected": -2.3449196815490723,
526
+ "logps/chosen": -270.876220703125,
527
+ "logps/rejected": -259.251953125,
528
+ "loss": 1731.3877,
529
+ "rewards/accuracies": 0.753125011920929,
530
+ "rewards/chosen": -0.03340950980782509,
531
+ "rewards/margins": 0.13931182026863098,
532
+ "rewards/rejected": -0.17272132635116577,
533
  "step": 330
534
  },
535
  {
536
  "epoch": 0.71,
537
  "learning_rate": 1.1561076868822756e-06,
538
+ "logits/chosen": -2.3923397064208984,
539
+ "logits/rejected": -2.384582281112671,
540
+ "logps/chosen": -284.8360290527344,
541
+ "logps/rejected": -257.0713806152344,
542
+ "loss": 1778.2957,
543
+ "rewards/accuracies": 0.7718750238418579,
544
+ "rewards/chosen": -0.039347052574157715,
545
+ "rewards/margins": 0.1650826632976532,
546
+ "rewards/rejected": -0.20442970097064972,
547
  "step": 340
548
  },
549
  {
550
  "epoch": 0.73,
551
  "learning_rate": 1.0054723495346484e-06,
552
+ "logits/chosen": -2.3869528770446777,
553
+ "logits/rejected": -2.3370375633239746,
554
+ "logps/chosen": -259.75189208984375,
555
+ "logps/rejected": -231.13577270507812,
556
+ "loss": 1665.3461,
557
+ "rewards/accuracies": 0.7437499761581421,
558
+ "rewards/chosen": -0.03856384754180908,
559
+ "rewards/margins": 0.1649591028690338,
560
+ "rewards/rejected": -0.2035229504108429,
561
  "step": 350
562
  },
563
  {
564
  "epoch": 0.75,
565
  "learning_rate": 8.628481651367876e-07,
566
+ "logits/chosen": -2.4105262756347656,
567
+ "logits/rejected": -2.352128744125366,
568
+ "logps/chosen": -269.03790283203125,
569
+ "logps/rejected": -247.90872192382812,
570
+ "loss": 1665.3982,
571
+ "rewards/accuracies": 0.7875000238418579,
572
+ "rewards/chosen": -0.03317265957593918,
573
+ "rewards/margins": 0.1719072014093399,
574
+ "rewards/rejected": -0.20507986843585968,
575
  "step": 360
576
  },
577
  {
578
  "epoch": 0.77,
579
  "learning_rate": 7.289996455765749e-07,
580
+ "logits/chosen": -2.4054064750671387,
581
+ "logits/rejected": -2.38871431350708,
582
+ "logps/chosen": -279.2740173339844,
583
+ "logps/rejected": -259.63690185546875,
584
+ "loss": 1704.7645,
585
+ "rewards/accuracies": 0.7124999761581421,
586
+ "rewards/chosen": -0.07123039662837982,
587
+ "rewards/margins": 0.15871620178222656,
588
+ "rewards/rejected": -0.22994661331176758,
589
  "step": 370
590
  },
591
  {
592
  "epoch": 0.8,
593
  "learning_rate": 6.046442623320145e-07,
594
+ "logits/chosen": -2.3605639934539795,
595
+ "logits/rejected": -2.368460178375244,
596
+ "logps/chosen": -267.2261657714844,
597
+ "logps/rejected": -260.45550537109375,
598
+ "loss": 1647.7326,
599
+ "rewards/accuracies": 0.734375,
600
+ "rewards/chosen": -0.0854114517569542,
601
+ "rewards/margins": 0.15202030539512634,
602
+ "rewards/rejected": -0.23743176460266113,
603
  "step": 380
604
  },
605
  {
606
  "epoch": 0.82,
607
  "learning_rate": 4.904486005914027e-07,
608
+ "logits/chosen": -2.4076011180877686,
609
+ "logits/rejected": -2.3770554065704346,
610
+ "logps/chosen": -292.8500061035156,
611
+ "logps/rejected": -292.0636291503906,
612
+ "loss": 1739.5414,
613
+ "rewards/accuracies": 0.746874988079071,
614
+ "rewards/chosen": -0.06398223340511322,
615
+ "rewards/margins": 0.15206970274448395,
616
+ "rewards/rejected": -0.21605193614959717,
617
  "step": 390
618
  },
619
  {
620
  "epoch": 0.84,
621
  "learning_rate": 3.8702478614051353e-07,
622
+ "logits/chosen": -2.3384757041931152,
623
+ "logits/rejected": -2.3366100788116455,
624
+ "logps/chosen": -259.2252502441406,
625
+ "logps/rejected": -265.5692138671875,
626
+ "loss": 1717.4336,
627
+ "rewards/accuracies": 0.7124999761581421,
628
+ "rewards/chosen": -0.09164019674062729,
629
+ "rewards/margins": 0.14173154532909393,
630
+ "rewards/rejected": -0.23337173461914062,
631
  "step": 400
632
  },
633
  {
634
  "epoch": 0.84,
635
+ "eval_logits/chosen": -2.088451385498047,
636
+ "eval_logits/rejected": -2.032222032546997,
637
+ "eval_logps/chosen": -271.9333190917969,
638
+ "eval_logps/rejected": -267.85687255859375,
639
+ "eval_loss": 1721.676513671875,
640
+ "eval_rewards/accuracies": 0.77182537317276,
641
+ "eval_rewards/chosen": -0.06908722221851349,
642
+ "eval_rewards/margins": 0.1617719829082489,
643
+ "eval_rewards/rejected": -0.23085922002792358,
644
+ "eval_runtime": 548.423,
645
+ "eval_samples_per_second": 3.647,
646
  "eval_steps_per_second": 0.115,
647
  "step": 400
648
  },
649
  {
650
  "epoch": 0.86,
651
  "learning_rate": 2.9492720416985004e-07,
652
+ "logits/chosen": -2.3725836277008057,
653
+ "logits/rejected": -2.3304688930511475,
654
+ "logps/chosen": -296.72991943359375,
655
+ "logps/rejected": -266.0842590332031,
656
+ "loss": 1755.1898,
657
+ "rewards/accuracies": 0.7406250238418579,
658
+ "rewards/chosen": -0.0669253021478653,
659
+ "rewards/margins": 0.16722533106803894,
660
+ "rewards/rejected": -0.23415064811706543,
661
  "step": 410
662
  },
663
  {
664
  "epoch": 0.88,
665
  "learning_rate": 2.1464952759020857e-07,
666
+ "logits/chosen": -2.3592472076416016,
667
+ "logits/rejected": -2.331540107727051,
668
+ "logps/chosen": -266.99005126953125,
669
+ "logps/rejected": -292.03680419921875,
670
+ "loss": 1730.9672,
671
+ "rewards/accuracies": 0.706250011920929,
672
+ "rewards/chosen": -0.08814045041799545,
673
+ "rewards/margins": 0.12336041778326035,
674
+ "rewards/rejected": -0.2115008533000946,
675
  "step": 420
676
  },
677
  {
678
  "epoch": 0.9,
679
  "learning_rate": 1.4662207078575685e-07,
680
+ "logits/chosen": -2.366381883621216,
681
+ "logits/rejected": -2.35951566696167,
682
+ "logps/chosen": -280.3116149902344,
683
+ "logps/rejected": -281.93939208984375,
684
+ "loss": 1760.3617,
685
+ "rewards/accuracies": 0.7281249761581421,
686
+ "rewards/chosen": -0.08124328404664993,
687
+ "rewards/margins": 0.14409320056438446,
688
+ "rewards/rejected": -0.2253364771604538,
689
  "step": 430
690
  },
691
  {
692
  "epoch": 0.92,
693
  "learning_rate": 9.120948298936422e-08,
694
+ "logits/chosen": -2.3266444206237793,
695
+ "logits/rejected": -2.2898497581481934,
696
+ "logps/chosen": -243.9964141845703,
697
+ "logps/rejected": -248.1795196533203,
698
+ "loss": 1711.7143,
699
+ "rewards/accuracies": 0.75,
700
+ "rewards/chosen": -0.08177933841943741,
701
+ "rewards/margins": 0.15183614194393158,
702
+ "rewards/rejected": -0.23361548781394958,
703
  "step": 440
704
  },
705
  {
706
  "epoch": 0.94,
707
  "learning_rate": 4.870879364444109e-08,
708
+ "logits/chosen": -2.3814821243286133,
709
+ "logits/rejected": -2.4406635761260986,
710
+ "logps/chosen": -275.4070129394531,
711
+ "logps/rejected": -278.91082763671875,
712
+ "loss": 1743.9877,
713
+ "rewards/accuracies": 0.7406250238418579,
714
+ "rewards/chosen": -0.06510698050260544,
715
+ "rewards/margins": 0.14476314187049866,
716
+ "rewards/rejected": -0.2098701000213623,
717
  "step": 450
718
  },
719
  {
720
  "epoch": 0.96,
721
  "learning_rate": 1.93478202307823e-08,
722
+ "logits/chosen": -2.34289288520813,
723
+ "logits/rejected": -2.346625804901123,
724
+ "logps/chosen": -270.3787841796875,
725
+ "logps/rejected": -275.61651611328125,
726
+ "loss": 1676.5176,
727
+ "rewards/accuracies": 0.737500011920929,
728
+ "rewards/chosen": -0.07770398259162903,
729
+ "rewards/margins": 0.16223737597465515,
730
+ "rewards/rejected": -0.23994135856628418,
731
  "step": 460
732
  },
733
  {
734
  "epoch": 0.98,
735
  "learning_rate": 3.283947088983663e-09,
736
+ "logits/chosen": -2.3888649940490723,
737
+ "logits/rejected": -2.4120144844055176,
738
+ "logps/chosen": -261.28997802734375,
739
+ "logps/rejected": -261.67755126953125,
740
+ "loss": 1663.4154,
741
+ "rewards/accuracies": 0.768750011920929,
742
+ "rewards/chosen": -0.0687546655535698,
743
+ "rewards/margins": 0.16177912056446075,
744
+ "rewards/rejected": -0.23053380846977234,
745
  "step": 470
746
  },
747
  {
748
  "epoch": 1.0,
749
  "step": 477,
750
  "total_flos": 0.0,
751
+ "train_loss": 1826.8015694608227,
752
+ "train_runtime": 32379.7062,
753
+ "train_samples_per_second": 1.888,
754
  "train_steps_per_second": 0.015
755
  }
756
  ],
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:89ae9269821a7a76bfccee733cea8c3af1d1b7b751ef31fb40915f4d080f4944
3
  size 4920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e9de8831bf203a26c117251200a242a486dd5bc4f1aae373c17a996f39be3288
3
  size 4920