GaetanMichelet commited on
Commit
3e8b986
1 Parent(s): 93f1e69

Model save

Browse files
README.md ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: meta-llama/Meta-Llama-3.1-8B-Instruct
3
+ library_name: peft
4
+ license: llama3.1
5
+ tags:
6
+ - trl
7
+ - sft
8
+ - generated_from_trainer
9
+ model-index:
10
+ - name: Llama-31-8B_task-1_120-samples_config-4
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # Llama-31-8B_task-1_120-samples_config-4
18
+
19
+ This model is a fine-tuned version of [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) on an unspecified dataset.
20
+ It achieves the following results on the evaluation set:
21
+ - Loss: 1.3447
22
+
23
+ ## Model description
24
+
25
+ More information needed
26
+
27
+ ## Intended uses & limitations
28
+
29
+ More information needed
30
+
31
+ ## Training and evaluation data
32
+
33
+ More information needed
34
+
35
+ ## Training procedure
36
+
37
+ ### Training hyperparameters
38
+
39
+ The following hyperparameters were used during training:
40
+ - learning_rate: 1e-05
41
+ - train_batch_size: 1
42
+ - eval_batch_size: 1
43
+ - seed: 42
44
+ - distributed_type: multi-GPU
45
+ - gradient_accumulation_steps: 16
46
+ - total_train_batch_size: 16
47
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
48
+ - lr_scheduler_type: cosine
49
+ - lr_scheduler_warmup_ratio: 0.1
50
+ - num_epochs: 150
51
+
52
+ ### Training results
53
+
54
+ | Training Loss | Epoch | Step | Validation Loss |
55
+ |:-------------:|:-------:|:----:|:---------------:|
56
+ | 2.121 | 0.9091 | 5 | 2.1020 |
57
+ | 2.0709 | 2.0 | 11 | 2.0931 |
58
+ | 2.0454 | 2.9091 | 16 | 2.0755 |
59
+ | 2.0502 | 4.0 | 22 | 2.0472 |
60
+ | 2.0511 | 4.9091 | 27 | 2.0100 |
61
+ | 1.9554 | 6.0 | 33 | 1.9472 |
62
+ | 1.8921 | 6.9091 | 38 | 1.8795 |
63
+ | 1.8104 | 8.0 | 44 | 1.7813 |
64
+ | 1.7636 | 8.9091 | 49 | 1.6937 |
65
+ | 1.6011 | 10.0 | 55 | 1.6142 |
66
+ | 1.5128 | 10.9091 | 60 | 1.5751 |
67
+ | 1.4277 | 12.0 | 66 | 1.5353 |
68
+ | 1.4998 | 12.9091 | 71 | 1.5001 |
69
+ | 1.4154 | 14.0 | 77 | 1.4583 |
70
+ | 1.4201 | 14.9091 | 82 | 1.4252 |
71
+ | 1.3364 | 16.0 | 88 | 1.3921 |
72
+ | 1.2762 | 16.9091 | 93 | 1.3691 |
73
+ | 1.2851 | 18.0 | 99 | 1.3437 |
74
+ | 1.2239 | 18.9091 | 104 | 1.3261 |
75
+ | 1.221 | 20.0 | 110 | 1.3084 |
76
+ | 1.2011 | 20.9091 | 115 | 1.2951 |
77
+ | 1.1433 | 22.0 | 121 | 1.2824 |
78
+ | 1.1579 | 22.9091 | 126 | 1.2746 |
79
+ | 1.0871 | 24.0 | 132 | 1.2680 |
80
+ | 1.0745 | 24.9091 | 137 | 1.2635 |
81
+ | 1.0006 | 26.0 | 143 | 1.2674 |
82
+ | 0.9628 | 26.9091 | 148 | 1.2689 |
83
+ | 0.9237 | 28.0 | 154 | 1.2717 |
84
+ | 0.8824 | 28.9091 | 159 | 1.2880 |
85
+ | 0.8706 | 30.0 | 165 | 1.2961 |
86
+ | 0.8328 | 30.9091 | 170 | 1.3266 |
87
+ | 0.7667 | 32.0 | 176 | 1.3447 |
88
+
89
+
90
+ ### Framework versions
91
+
92
+ - PEFT 0.12.0
93
+ - Transformers 4.44.0
94
+ - Pytorch 2.1.2+cu121
95
+ - Datasets 2.20.0
96
+ - Tokenizers 0.19.1
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d257f1a42bcd455d4ddf3b1b6b0471fe67eb5645d489754f1a573a7071ac814d
3
  size 167832240
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2813b9da52ce43b51ba2878f47ace0bdd4068f9f91b035ade30c14db8cde6430
3
  size 167832240
all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 32.0,
3
+ "total_flos": 4.322507713465549e+16,
4
+ "train_loss": 1.4077397015961735,
5
+ "train_runtime": 3298.1261,
6
+ "train_samples": 88,
7
+ "train_samples_per_second": 4.002,
8
+ "train_steps_per_second": 0.227
9
+ }
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 32.0,
3
+ "total_flos": 4.322507713465549e+16,
4
+ "train_loss": 1.4077397015961735,
5
+ "train_runtime": 3298.1261,
6
+ "train_samples": 88,
7
+ "train_samples_per_second": 4.002,
8
+ "train_steps_per_second": 0.227
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,930 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 1.2635489702224731,
3
+ "best_model_checkpoint": "data/Llama-31-8B_task-1_120-samples_config-4/checkpoint-137",
4
+ "epoch": 32.0,
5
+ "eval_steps": 500,
6
+ "global_step": 176,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.18181818181818182,
13
+ "grad_norm": 1.874562382698059,
14
+ "learning_rate": 1.3333333333333336e-07,
15
+ "loss": 2.2898,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.36363636363636365,
20
+ "grad_norm": 1.8042479753494263,
21
+ "learning_rate": 2.666666666666667e-07,
22
+ "loss": 2.0811,
23
+ "step": 2
24
+ },
25
+ {
26
+ "epoch": 0.7272727272727273,
27
+ "grad_norm": 1.760155200958252,
28
+ "learning_rate": 5.333333333333335e-07,
29
+ "loss": 2.121,
30
+ "step": 4
31
+ },
32
+ {
33
+ "epoch": 0.9090909090909091,
34
+ "eval_loss": 2.101999044418335,
35
+ "eval_runtime": 9.6381,
36
+ "eval_samples_per_second": 2.49,
37
+ "eval_steps_per_second": 2.49,
38
+ "step": 5
39
+ },
40
+ {
41
+ "epoch": 1.0909090909090908,
42
+ "grad_norm": 1.5958633422851562,
43
+ "learning_rate": 8.000000000000001e-07,
44
+ "loss": 1.9468,
45
+ "step": 6
46
+ },
47
+ {
48
+ "epoch": 1.4545454545454546,
49
+ "grad_norm": 1.8069952726364136,
50
+ "learning_rate": 1.066666666666667e-06,
51
+ "loss": 2.1471,
52
+ "step": 8
53
+ },
54
+ {
55
+ "epoch": 1.8181818181818183,
56
+ "grad_norm": 1.736191749572754,
57
+ "learning_rate": 1.3333333333333334e-06,
58
+ "loss": 2.0709,
59
+ "step": 10
60
+ },
61
+ {
62
+ "epoch": 2.0,
63
+ "eval_loss": 2.0930726528167725,
64
+ "eval_runtime": 9.6259,
65
+ "eval_samples_per_second": 2.493,
66
+ "eval_steps_per_second": 2.493,
67
+ "step": 11
68
+ },
69
+ {
70
+ "epoch": 2.1818181818181817,
71
+ "grad_norm": 1.6809439659118652,
72
+ "learning_rate": 1.6000000000000001e-06,
73
+ "loss": 2.1306,
74
+ "step": 12
75
+ },
76
+ {
77
+ "epoch": 2.5454545454545454,
78
+ "grad_norm": 1.747848391532898,
79
+ "learning_rate": 1.8666666666666669e-06,
80
+ "loss": 2.0776,
81
+ "step": 14
82
+ },
83
+ {
84
+ "epoch": 2.909090909090909,
85
+ "grad_norm": 1.8399686813354492,
86
+ "learning_rate": 2.133333333333334e-06,
87
+ "loss": 2.0454,
88
+ "step": 16
89
+ },
90
+ {
91
+ "epoch": 2.909090909090909,
92
+ "eval_loss": 2.075482130050659,
93
+ "eval_runtime": 9.6366,
94
+ "eval_samples_per_second": 2.491,
95
+ "eval_steps_per_second": 2.491,
96
+ "step": 16
97
+ },
98
+ {
99
+ "epoch": 3.2727272727272725,
100
+ "grad_norm": 1.8816026449203491,
101
+ "learning_rate": 2.4000000000000003e-06,
102
+ "loss": 2.1284,
103
+ "step": 18
104
+ },
105
+ {
106
+ "epoch": 3.6363636363636362,
107
+ "grad_norm": 1.5757534503936768,
108
+ "learning_rate": 2.666666666666667e-06,
109
+ "loss": 2.03,
110
+ "step": 20
111
+ },
112
+ {
113
+ "epoch": 4.0,
114
+ "grad_norm": 1.9466415643692017,
115
+ "learning_rate": 2.9333333333333338e-06,
116
+ "loss": 2.0502,
117
+ "step": 22
118
+ },
119
+ {
120
+ "epoch": 4.0,
121
+ "eval_loss": 2.0472075939178467,
122
+ "eval_runtime": 9.6277,
123
+ "eval_samples_per_second": 2.493,
124
+ "eval_steps_per_second": 2.493,
125
+ "step": 22
126
+ },
127
+ {
128
+ "epoch": 4.363636363636363,
129
+ "grad_norm": 1.6615264415740967,
130
+ "learning_rate": 3.2000000000000003e-06,
131
+ "loss": 2.0197,
132
+ "step": 24
133
+ },
134
+ {
135
+ "epoch": 4.7272727272727275,
136
+ "grad_norm": 1.7619231939315796,
137
+ "learning_rate": 3.4666666666666672e-06,
138
+ "loss": 2.0511,
139
+ "step": 26
140
+ },
141
+ {
142
+ "epoch": 4.909090909090909,
143
+ "eval_loss": 2.0100386142730713,
144
+ "eval_runtime": 9.6313,
145
+ "eval_samples_per_second": 2.492,
146
+ "eval_steps_per_second": 2.492,
147
+ "step": 27
148
+ },
149
+ {
150
+ "epoch": 5.090909090909091,
151
+ "grad_norm": 1.8135912418365479,
152
+ "learning_rate": 3.7333333333333337e-06,
153
+ "loss": 1.9759,
154
+ "step": 28
155
+ },
156
+ {
157
+ "epoch": 5.454545454545454,
158
+ "grad_norm": 1.8354995250701904,
159
+ "learning_rate": 4.000000000000001e-06,
160
+ "loss": 2.0128,
161
+ "step": 30
162
+ },
163
+ {
164
+ "epoch": 5.818181818181818,
165
+ "grad_norm": 1.8015000820159912,
166
+ "learning_rate": 4.266666666666668e-06,
167
+ "loss": 1.9554,
168
+ "step": 32
169
+ },
170
+ {
171
+ "epoch": 6.0,
172
+ "eval_loss": 1.9471648931503296,
173
+ "eval_runtime": 9.6325,
174
+ "eval_samples_per_second": 2.492,
175
+ "eval_steps_per_second": 2.492,
176
+ "step": 33
177
+ },
178
+ {
179
+ "epoch": 6.181818181818182,
180
+ "grad_norm": 1.7802411317825317,
181
+ "learning_rate": 4.533333333333334e-06,
182
+ "loss": 1.9607,
183
+ "step": 34
184
+ },
185
+ {
186
+ "epoch": 6.545454545454545,
187
+ "grad_norm": 1.5615451335906982,
188
+ "learning_rate": 4.800000000000001e-06,
189
+ "loss": 1.9032,
190
+ "step": 36
191
+ },
192
+ {
193
+ "epoch": 6.909090909090909,
194
+ "grad_norm": 1.8741137981414795,
195
+ "learning_rate": 5.0666666666666676e-06,
196
+ "loss": 1.8921,
197
+ "step": 38
198
+ },
199
+ {
200
+ "epoch": 6.909090909090909,
201
+ "eval_loss": 1.8795281648635864,
202
+ "eval_runtime": 9.6403,
203
+ "eval_samples_per_second": 2.49,
204
+ "eval_steps_per_second": 2.49,
205
+ "step": 38
206
+ },
207
+ {
208
+ "epoch": 7.2727272727272725,
209
+ "grad_norm": 1.7604111433029175,
210
+ "learning_rate": 5.333333333333334e-06,
211
+ "loss": 1.8681,
212
+ "step": 40
213
+ },
214
+ {
215
+ "epoch": 7.636363636363637,
216
+ "grad_norm": 1.6821084022521973,
217
+ "learning_rate": 5.600000000000001e-06,
218
+ "loss": 1.769,
219
+ "step": 42
220
+ },
221
+ {
222
+ "epoch": 8.0,
223
+ "grad_norm": 1.665964126586914,
224
+ "learning_rate": 5.8666666666666675e-06,
225
+ "loss": 1.8104,
226
+ "step": 44
227
+ },
228
+ {
229
+ "epoch": 8.0,
230
+ "eval_loss": 1.7812843322753906,
231
+ "eval_runtime": 9.6236,
232
+ "eval_samples_per_second": 2.494,
233
+ "eval_steps_per_second": 2.494,
234
+ "step": 44
235
+ },
236
+ {
237
+ "epoch": 8.363636363636363,
238
+ "grad_norm": 1.5216890573501587,
239
+ "learning_rate": 6.133333333333334e-06,
240
+ "loss": 1.7145,
241
+ "step": 46
242
+ },
243
+ {
244
+ "epoch": 8.727272727272727,
245
+ "grad_norm": 1.4722410440444946,
246
+ "learning_rate": 6.4000000000000006e-06,
247
+ "loss": 1.7636,
248
+ "step": 48
249
+ },
250
+ {
251
+ "epoch": 8.909090909090908,
252
+ "eval_loss": 1.6937414407730103,
253
+ "eval_runtime": 9.6259,
254
+ "eval_samples_per_second": 2.493,
255
+ "eval_steps_per_second": 2.493,
256
+ "step": 49
257
+ },
258
+ {
259
+ "epoch": 9.090909090909092,
260
+ "grad_norm": 1.2136281728744507,
261
+ "learning_rate": 6.666666666666667e-06,
262
+ "loss": 1.659,
263
+ "step": 50
264
+ },
265
+ {
266
+ "epoch": 9.454545454545455,
267
+ "grad_norm": 1.0023685693740845,
268
+ "learning_rate": 6.9333333333333344e-06,
269
+ "loss": 1.6509,
270
+ "step": 52
271
+ },
272
+ {
273
+ "epoch": 9.818181818181818,
274
+ "grad_norm": 1.0440162420272827,
275
+ "learning_rate": 7.2000000000000005e-06,
276
+ "loss": 1.6011,
277
+ "step": 54
278
+ },
279
+ {
280
+ "epoch": 10.0,
281
+ "eval_loss": 1.6141911745071411,
282
+ "eval_runtime": 9.6302,
283
+ "eval_samples_per_second": 2.492,
284
+ "eval_steps_per_second": 2.492,
285
+ "step": 55
286
+ },
287
+ {
288
+ "epoch": 10.181818181818182,
289
+ "grad_norm": 0.7877157926559448,
290
+ "learning_rate": 7.4666666666666675e-06,
291
+ "loss": 1.5814,
292
+ "step": 56
293
+ },
294
+ {
295
+ "epoch": 10.545454545454545,
296
+ "grad_norm": 0.6534942984580994,
297
+ "learning_rate": 7.733333333333334e-06,
298
+ "loss": 1.5824,
299
+ "step": 58
300
+ },
301
+ {
302
+ "epoch": 10.909090909090908,
303
+ "grad_norm": 0.6240991950035095,
304
+ "learning_rate": 8.000000000000001e-06,
305
+ "loss": 1.5128,
306
+ "step": 60
307
+ },
308
+ {
309
+ "epoch": 10.909090909090908,
310
+ "eval_loss": 1.5751093626022339,
311
+ "eval_runtime": 9.6475,
312
+ "eval_samples_per_second": 2.488,
313
+ "eval_steps_per_second": 2.488,
314
+ "step": 60
315
+ },
316
+ {
317
+ "epoch": 11.272727272727273,
318
+ "grad_norm": 0.6224139928817749,
319
+ "learning_rate": 8.266666666666667e-06,
320
+ "loss": 1.5444,
321
+ "step": 62
322
+ },
323
+ {
324
+ "epoch": 11.636363636363637,
325
+ "grad_norm": 0.6345284581184387,
326
+ "learning_rate": 8.533333333333335e-06,
327
+ "loss": 1.5682,
328
+ "step": 64
329
+ },
330
+ {
331
+ "epoch": 12.0,
332
+ "grad_norm": 0.5680299997329712,
333
+ "learning_rate": 8.8e-06,
334
+ "loss": 1.4277,
335
+ "step": 66
336
+ },
337
+ {
338
+ "epoch": 12.0,
339
+ "eval_loss": 1.5352739095687866,
340
+ "eval_runtime": 9.6312,
341
+ "eval_samples_per_second": 2.492,
342
+ "eval_steps_per_second": 2.492,
343
+ "step": 66
344
+ },
345
+ {
346
+ "epoch": 12.363636363636363,
347
+ "grad_norm": 0.5991209745407104,
348
+ "learning_rate": 9.066666666666667e-06,
349
+ "loss": 1.4703,
350
+ "step": 68
351
+ },
352
+ {
353
+ "epoch": 12.727272727272727,
354
+ "grad_norm": 0.5993205308914185,
355
+ "learning_rate": 9.333333333333334e-06,
356
+ "loss": 1.4998,
357
+ "step": 70
358
+ },
359
+ {
360
+ "epoch": 12.909090909090908,
361
+ "eval_loss": 1.5001062154769897,
362
+ "eval_runtime": 9.6248,
363
+ "eval_samples_per_second": 2.494,
364
+ "eval_steps_per_second": 2.494,
365
+ "step": 71
366
+ },
367
+ {
368
+ "epoch": 13.090909090909092,
369
+ "grad_norm": 0.5633314251899719,
370
+ "learning_rate": 9.600000000000001e-06,
371
+ "loss": 1.445,
372
+ "step": 72
373
+ },
374
+ {
375
+ "epoch": 13.454545454545455,
376
+ "grad_norm": 0.5419648885726929,
377
+ "learning_rate": 9.866666666666668e-06,
378
+ "loss": 1.4256,
379
+ "step": 74
380
+ },
381
+ {
382
+ "epoch": 13.818181818181818,
383
+ "grad_norm": 0.5384172201156616,
384
+ "learning_rate": 9.999945845889795e-06,
385
+ "loss": 1.4154,
386
+ "step": 76
387
+ },
388
+ {
389
+ "epoch": 14.0,
390
+ "eval_loss": 1.4582971334457397,
391
+ "eval_runtime": 9.6218,
392
+ "eval_samples_per_second": 2.494,
393
+ "eval_steps_per_second": 2.494,
394
+ "step": 77
395
+ },
396
+ {
397
+ "epoch": 14.181818181818182,
398
+ "grad_norm": 0.6161755323410034,
399
+ "learning_rate": 9.999512620046523e-06,
400
+ "loss": 1.4459,
401
+ "step": 78
402
+ },
403
+ {
404
+ "epoch": 14.545454545454545,
405
+ "grad_norm": 0.5570430159568787,
406
+ "learning_rate": 9.99864620589731e-06,
407
+ "loss": 1.3661,
408
+ "step": 80
409
+ },
410
+ {
411
+ "epoch": 14.909090909090908,
412
+ "grad_norm": 0.5637471675872803,
413
+ "learning_rate": 9.99734667851357e-06,
414
+ "loss": 1.4201,
415
+ "step": 82
416
+ },
417
+ {
418
+ "epoch": 14.909090909090908,
419
+ "eval_loss": 1.4252301454544067,
420
+ "eval_runtime": 9.6319,
421
+ "eval_samples_per_second": 2.492,
422
+ "eval_steps_per_second": 2.492,
423
+ "step": 82
424
+ },
425
+ {
426
+ "epoch": 15.272727272727273,
427
+ "grad_norm": 0.5539014935493469,
428
+ "learning_rate": 9.995614150494293e-06,
429
+ "loss": 1.3497,
430
+ "step": 84
431
+ },
432
+ {
433
+ "epoch": 15.636363636363637,
434
+ "grad_norm": 0.5583813786506653,
435
+ "learning_rate": 9.993448771956285e-06,
436
+ "loss": 1.3512,
437
+ "step": 86
438
+ },
439
+ {
440
+ "epoch": 16.0,
441
+ "grad_norm": 0.5377728939056396,
442
+ "learning_rate": 9.99085073052117e-06,
443
+ "loss": 1.3364,
444
+ "step": 88
445
+ },
446
+ {
447
+ "epoch": 16.0,
448
+ "eval_loss": 1.3921159505844116,
449
+ "eval_runtime": 9.635,
450
+ "eval_samples_per_second": 2.491,
451
+ "eval_steps_per_second": 2.491,
452
+ "step": 88
453
+ },
454
+ {
455
+ "epoch": 16.363636363636363,
456
+ "grad_norm": 0.5390649437904358,
457
+ "learning_rate": 9.987820251299121e-06,
458
+ "loss": 1.3614,
459
+ "step": 90
460
+ },
461
+ {
462
+ "epoch": 16.727272727272727,
463
+ "grad_norm": 0.5126790404319763,
464
+ "learning_rate": 9.984357596869369e-06,
465
+ "loss": 1.2762,
466
+ "step": 92
467
+ },
468
+ {
469
+ "epoch": 16.90909090909091,
470
+ "eval_loss": 1.3691315650939941,
471
+ "eval_runtime": 9.6319,
472
+ "eval_samples_per_second": 2.492,
473
+ "eval_steps_per_second": 2.492,
474
+ "step": 93
475
+ },
476
+ {
477
+ "epoch": 17.09090909090909,
478
+ "grad_norm": 0.5642189383506775,
479
+ "learning_rate": 9.980463067257437e-06,
480
+ "loss": 1.2961,
481
+ "step": 94
482
+ },
483
+ {
484
+ "epoch": 17.454545454545453,
485
+ "grad_norm": 0.5290245413780212,
486
+ "learning_rate": 9.976136999909156e-06,
487
+ "loss": 1.1987,
488
+ "step": 96
489
+ },
490
+ {
491
+ "epoch": 17.818181818181817,
492
+ "grad_norm": 0.5963008403778076,
493
+ "learning_rate": 9.971379769661422e-06,
494
+ "loss": 1.2851,
495
+ "step": 98
496
+ },
497
+ {
498
+ "epoch": 18.0,
499
+ "eval_loss": 1.3436861038208008,
500
+ "eval_runtime": 9.6214,
501
+ "eval_samples_per_second": 2.494,
502
+ "eval_steps_per_second": 2.494,
503
+ "step": 99
504
+ },
505
+ {
506
+ "epoch": 18.181818181818183,
507
+ "grad_norm": 0.5820615291595459,
508
+ "learning_rate": 9.966191788709716e-06,
509
+ "loss": 1.329,
510
+ "step": 100
511
+ },
512
+ {
513
+ "epoch": 18.545454545454547,
514
+ "grad_norm": 0.5619508624076843,
515
+ "learning_rate": 9.960573506572391e-06,
516
+ "loss": 1.2428,
517
+ "step": 102
518
+ },
519
+ {
520
+ "epoch": 18.90909090909091,
521
+ "grad_norm": 0.5272645950317383,
522
+ "learning_rate": 9.95452541005172e-06,
523
+ "loss": 1.2239,
524
+ "step": 104
525
+ },
526
+ {
527
+ "epoch": 18.90909090909091,
528
+ "eval_loss": 1.3261139392852783,
529
+ "eval_runtime": 9.6475,
530
+ "eval_samples_per_second": 2.488,
531
+ "eval_steps_per_second": 2.488,
532
+ "step": 104
533
+ },
534
+ {
535
+ "epoch": 19.272727272727273,
536
+ "grad_norm": 0.5720901489257812,
537
+ "learning_rate": 9.948048023191728e-06,
538
+ "loss": 1.1753,
539
+ "step": 106
540
+ },
541
+ {
542
+ "epoch": 19.636363636363637,
543
+ "grad_norm": 0.5877869725227356,
544
+ "learning_rate": 9.941141907232766e-06,
545
+ "loss": 1.2334,
546
+ "step": 108
547
+ },
548
+ {
549
+ "epoch": 20.0,
550
+ "grad_norm": 0.5674625039100647,
551
+ "learning_rate": 9.933807660562898e-06,
552
+ "loss": 1.221,
553
+ "step": 110
554
+ },
555
+ {
556
+ "epoch": 20.0,
557
+ "eval_loss": 1.308407187461853,
558
+ "eval_runtime": 9.6226,
559
+ "eval_samples_per_second": 2.494,
560
+ "eval_steps_per_second": 2.494,
561
+ "step": 110
562
+ },
563
+ {
564
+ "epoch": 20.363636363636363,
565
+ "grad_norm": 0.5934170484542847,
566
+ "learning_rate": 9.926045918666045e-06,
567
+ "loss": 1.1685,
568
+ "step": 112
569
+ },
570
+ {
571
+ "epoch": 20.727272727272727,
572
+ "grad_norm": 0.6199212670326233,
573
+ "learning_rate": 9.91785735406693e-06,
574
+ "loss": 1.2011,
575
+ "step": 114
576
+ },
577
+ {
578
+ "epoch": 20.90909090909091,
579
+ "eval_loss": 1.2950953245162964,
580
+ "eval_runtime": 9.6285,
581
+ "eval_samples_per_second": 2.493,
582
+ "eval_steps_per_second": 2.493,
583
+ "step": 115
584
+ },
585
+ {
586
+ "epoch": 21.09090909090909,
587
+ "grad_norm": 0.5995011329650879,
588
+ "learning_rate": 9.909242676272797e-06,
589
+ "loss": 1.1717,
590
+ "step": 116
591
+ },
592
+ {
593
+ "epoch": 21.454545454545453,
594
+ "grad_norm": 0.6024748682975769,
595
+ "learning_rate": 9.90020263171194e-06,
596
+ "loss": 1.1654,
597
+ "step": 118
598
+ },
599
+ {
600
+ "epoch": 21.818181818181817,
601
+ "grad_norm": 0.6147428750991821,
602
+ "learning_rate": 9.890738003669029e-06,
603
+ "loss": 1.1433,
604
+ "step": 120
605
+ },
606
+ {
607
+ "epoch": 22.0,
608
+ "eval_loss": 1.2823587656021118,
609
+ "eval_runtime": 9.6228,
610
+ "eval_samples_per_second": 2.494,
611
+ "eval_steps_per_second": 2.494,
612
+ "step": 121
613
+ },
614
+ {
615
+ "epoch": 22.181818181818183,
616
+ "grad_norm": 0.612140953540802,
617
+ "learning_rate": 9.880849612217238e-06,
618
+ "loss": 1.0765,
619
+ "step": 122
620
+ },
621
+ {
622
+ "epoch": 22.545454545454547,
623
+ "grad_norm": 0.647298276424408,
624
+ "learning_rate": 9.870538314147194e-06,
625
+ "loss": 1.1183,
626
+ "step": 124
627
+ },
628
+ {
629
+ "epoch": 22.90909090909091,
630
+ "grad_norm": 0.6705971360206604,
631
+ "learning_rate": 9.859805002892733e-06,
632
+ "loss": 1.1579,
633
+ "step": 126
634
+ },
635
+ {
636
+ "epoch": 22.90909090909091,
637
+ "eval_loss": 1.2746213674545288,
638
+ "eval_runtime": 9.6328,
639
+ "eval_samples_per_second": 2.491,
640
+ "eval_steps_per_second": 2.491,
641
+ "step": 126
642
+ },
643
+ {
644
+ "epoch": 23.272727272727273,
645
+ "grad_norm": 0.670023500919342,
646
+ "learning_rate": 9.84865060845349e-06,
647
+ "loss": 1.0965,
648
+ "step": 128
649
+ },
650
+ {
651
+ "epoch": 23.636363636363637,
652
+ "grad_norm": 0.6824691891670227,
653
+ "learning_rate": 9.83707609731432e-06,
654
+ "loss": 1.061,
655
+ "step": 130
656
+ },
657
+ {
658
+ "epoch": 24.0,
659
+ "grad_norm": 0.6598721146583557,
660
+ "learning_rate": 9.825082472361558e-06,
661
+ "loss": 1.0871,
662
+ "step": 132
663
+ },
664
+ {
665
+ "epoch": 24.0,
666
+ "eval_loss": 1.268039345741272,
667
+ "eval_runtime": 9.6219,
668
+ "eval_samples_per_second": 2.494,
669
+ "eval_steps_per_second": 2.494,
670
+ "step": 132
671
+ },
672
+ {
673
+ "epoch": 24.363636363636363,
674
+ "grad_norm": 0.6824683547019958,
675
+ "learning_rate": 9.812670772796113e-06,
676
+ "loss": 1.0733,
677
+ "step": 134
678
+ },
679
+ {
680
+ "epoch": 24.727272727272727,
681
+ "grad_norm": 0.7309969663619995,
682
+ "learning_rate": 9.799842074043438e-06,
683
+ "loss": 1.0745,
684
+ "step": 136
685
+ },
686
+ {
687
+ "epoch": 24.90909090909091,
688
+ "eval_loss": 1.2635489702224731,
689
+ "eval_runtime": 9.6334,
690
+ "eval_samples_per_second": 2.491,
691
+ "eval_steps_per_second": 2.491,
692
+ "step": 137
693
+ },
694
+ {
695
+ "epoch": 25.09090909090909,
696
+ "grad_norm": 0.8717047572135925,
697
+ "learning_rate": 9.786597487660336e-06,
698
+ "loss": 1.0049,
699
+ "step": 138
700
+ },
701
+ {
702
+ "epoch": 25.454545454545453,
703
+ "grad_norm": 0.7290262579917908,
704
+ "learning_rate": 9.77293816123866e-06,
705
+ "loss": 1.0355,
706
+ "step": 140
707
+ },
708
+ {
709
+ "epoch": 25.818181818181817,
710
+ "grad_norm": 0.8125291466712952,
711
+ "learning_rate": 9.75886527830587e-06,
712
+ "loss": 1.0006,
713
+ "step": 142
714
+ },
715
+ {
716
+ "epoch": 26.0,
717
+ "eval_loss": 1.2674241065979004,
718
+ "eval_runtime": 9.6242,
719
+ "eval_samples_per_second": 2.494,
720
+ "eval_steps_per_second": 2.494,
721
+ "step": 143
722
+ },
723
+ {
724
+ "epoch": 26.181818181818183,
725
+ "grad_norm": 0.777037501335144,
726
+ "learning_rate": 9.744380058222483e-06,
727
+ "loss": 1.0235,
728
+ "step": 144
729
+ },
730
+ {
731
+ "epoch": 26.545454545454547,
732
+ "grad_norm": 0.7910988330841064,
733
+ "learning_rate": 9.729483756076436e-06,
734
+ "loss": 1.0119,
735
+ "step": 146
736
+ },
737
+ {
738
+ "epoch": 26.90909090909091,
739
+ "grad_norm": 0.8250744342803955,
740
+ "learning_rate": 9.714177662574316e-06,
741
+ "loss": 0.9628,
742
+ "step": 148
743
+ },
744
+ {
745
+ "epoch": 26.90909090909091,
746
+ "eval_loss": 1.2688733339309692,
747
+ "eval_runtime": 9.6388,
748
+ "eval_samples_per_second": 2.49,
749
+ "eval_steps_per_second": 2.49,
750
+ "step": 148
751
+ },
752
+ {
753
+ "epoch": 27.272727272727273,
754
+ "grad_norm": 0.9542063474655151,
755
+ "learning_rate": 9.698463103929542e-06,
756
+ "loss": 0.9176,
757
+ "step": 150
758
+ },
759
+ {
760
+ "epoch": 27.636363636363637,
761
+ "grad_norm": 0.8577086925506592,
762
+ "learning_rate": 9.682341441747446e-06,
763
+ "loss": 0.9908,
764
+ "step": 152
765
+ },
766
+ {
767
+ "epoch": 28.0,
768
+ "grad_norm": 0.8569504022598267,
769
+ "learning_rate": 9.665814072907293e-06,
770
+ "loss": 0.9237,
771
+ "step": 154
772
+ },
773
+ {
774
+ "epoch": 28.0,
775
+ "eval_loss": 1.2716994285583496,
776
+ "eval_runtime": 9.6271,
777
+ "eval_samples_per_second": 2.493,
778
+ "eval_steps_per_second": 2.493,
779
+ "step": 154
780
+ },
781
+ {
782
+ "epoch": 28.363636363636363,
783
+ "grad_norm": 0.933702826499939,
784
+ "learning_rate": 9.648882429441258e-06,
785
+ "loss": 0.9053,
786
+ "step": 156
787
+ },
788
+ {
789
+ "epoch": 28.727272727272727,
790
+ "grad_norm": 1.002100944519043,
791
+ "learning_rate": 9.63154797841033e-06,
792
+ "loss": 0.8824,
793
+ "step": 158
794
+ },
795
+ {
796
+ "epoch": 28.90909090909091,
797
+ "eval_loss": 1.2879880666732788,
798
+ "eval_runtime": 9.6501,
799
+ "eval_samples_per_second": 2.487,
800
+ "eval_steps_per_second": 2.487,
801
+ "step": 159
802
+ },
803
+ {
804
+ "epoch": 29.09090909090909,
805
+ "grad_norm": 0.9883065819740295,
806
+ "learning_rate": 9.613812221777212e-06,
807
+ "loss": 0.9089,
808
+ "step": 160
809
+ },
810
+ {
811
+ "epoch": 29.454545454545453,
812
+ "grad_norm": 1.0561895370483398,
813
+ "learning_rate": 9.595676696276173e-06,
814
+ "loss": 0.9285,
815
+ "step": 162
816
+ },
817
+ {
818
+ "epoch": 29.818181818181817,
819
+ "grad_norm": 1.1776067018508911,
820
+ "learning_rate": 9.577142973279896e-06,
821
+ "loss": 0.8706,
822
+ "step": 164
823
+ },
824
+ {
825
+ "epoch": 30.0,
826
+ "eval_loss": 1.296054482460022,
827
+ "eval_runtime": 9.6279,
828
+ "eval_samples_per_second": 2.493,
829
+ "eval_steps_per_second": 2.493,
830
+ "step": 165
831
+ },
832
+ {
833
+ "epoch": 30.181818181818183,
834
+ "grad_norm": 1.0879513025283813,
835
+ "learning_rate": 9.55821265866333e-06,
836
+ "loss": 0.7961,
837
+ "step": 166
838
+ },
839
+ {
840
+ "epoch": 30.545454545454547,
841
+ "grad_norm": 1.1668307781219482,
842
+ "learning_rate": 9.538887392664544e-06,
843
+ "loss": 0.7865,
844
+ "step": 168
845
+ },
846
+ {
847
+ "epoch": 30.90909090909091,
848
+ "grad_norm": 1.065364956855774,
849
+ "learning_rate": 9.519168849742603e-06,
850
+ "loss": 0.8328,
851
+ "step": 170
852
+ },
853
+ {
854
+ "epoch": 30.90909090909091,
855
+ "eval_loss": 1.326621174812317,
856
+ "eval_runtime": 9.6286,
857
+ "eval_samples_per_second": 2.493,
858
+ "eval_steps_per_second": 2.493,
859
+ "step": 170
860
+ },
861
+ {
862
+ "epoch": 31.272727272727273,
863
+ "grad_norm": 1.143373727798462,
864
+ "learning_rate": 9.499058738432492e-06,
865
+ "loss": 0.8381,
866
+ "step": 172
867
+ },
868
+ {
869
+ "epoch": 31.636363636363637,
870
+ "grad_norm": 1.1452257633209229,
871
+ "learning_rate": 9.478558801197065e-06,
872
+ "loss": 0.7725,
873
+ "step": 174
874
+ },
875
+ {
876
+ "epoch": 32.0,
877
+ "grad_norm": 1.2163513898849487,
878
+ "learning_rate": 9.457670814276083e-06,
879
+ "loss": 0.7667,
880
+ "step": 176
881
+ },
882
+ {
883
+ "epoch": 32.0,
884
+ "eval_loss": 1.344734787940979,
885
+ "eval_runtime": 9.6252,
886
+ "eval_samples_per_second": 2.493,
887
+ "eval_steps_per_second": 2.493,
888
+ "step": 176
889
+ },
890
+ {
891
+ "epoch": 32.0,
892
+ "step": 176,
893
+ "total_flos": 4.322507713465549e+16,
894
+ "train_loss": 1.4077397015961735,
895
+ "train_runtime": 3298.1261,
896
+ "train_samples_per_second": 4.002,
897
+ "train_steps_per_second": 0.227
898
+ }
899
+ ],
900
+ "logging_steps": 2,
901
+ "max_steps": 750,
902
+ "num_input_tokens_seen": 0,
903
+ "num_train_epochs": 150,
904
+ "save_steps": 25,
905
+ "stateful_callbacks": {
906
+ "EarlyStoppingCallback": {
907
+ "args": {
908
+ "early_stopping_patience": 7,
909
+ "early_stopping_threshold": 0.0
910
+ },
911
+ "attributes": {
912
+ "early_stopping_patience_counter": 0
913
+ }
914
+ },
915
+ "TrainerControl": {
916
+ "args": {
917
+ "should_epoch_stop": false,
918
+ "should_evaluate": false,
919
+ "should_log": false,
920
+ "should_save": true,
921
+ "should_training_stop": true
922
+ },
923
+ "attributes": {}
924
+ }
925
+ },
926
+ "total_flos": 4.322507713465549e+16,
927
+ "train_batch_size": 1,
928
+ "trial_name": null,
929
+ "trial_params": null
930
+ }