beamaia committed on
Commit
ed3be56
1 Parent(s): 4a0179a

Training in progress, step 400, checkpoint

Browse files
.gitattributes CHANGED
@@ -48,3 +48,8 @@ checkpoint-300/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
48
  checkpoint-300/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
49
  checkpoint-300/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
50
  checkpoint-300/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
48
  checkpoint-300/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
49
  checkpoint-300/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
50
  checkpoint-300/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
51
+ checkpoint-400/optimizer_0/.metadata filter=lfs diff=lfs merge=lfs -text
52
+ checkpoint-400/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
53
+ checkpoint-400/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
54
+ checkpoint-400/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
55
+ checkpoint-400/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
checkpoint-400/optimizer_0/.metadata ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a74ba8fcf2d857e573ced9e8ccd472ece612ef1ca47c4379e8bbc05bf43f4fa8
3
+ size 2108254
checkpoint-400/optimizer_0/__0_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97526ddea90efa3d0ae7a7d8514c3026bb492dcb26c1a21808ad1abe933f1b3e
3
+ size 13256787644
checkpoint-400/optimizer_0/__1_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1dc15f1fbdfe70cae527d0d50c5cbc873ad34a6bfd62cc2ba9cdbfbb83b449c
3
+ size 13257964260
checkpoint-400/pytorch_model_fsdp_0/.metadata ADDED
Binary file (734 kB). View file
 
checkpoint-400/pytorch_model_fsdp_0/__0_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3269587842cb193ba57dc01027f127072026f88cb973af7a4b41959bb536545
3
+ size 6628321920
checkpoint-400/pytorch_model_fsdp_0/__1_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a99872f16f3c83f33eccb1cd9309dc05ab2df6d7468d08a343ee4befa4c9bde5
3
+ size 6628321920
checkpoint-400/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db5d99a6c1f613178aee8a88858e35c43888794b32c3c9357e466f03e1f5906e
3
+ size 14512
checkpoint-400/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29747cb7712d7e98c4ad5d236c3fc140ebea524534e69095c72c88ed3ebc8530
3
+ size 14512
checkpoint-400/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7febe3a2ca1c0d459643964286e56b56721c9bc05dbbeb61e176c534c63819d8
3
+ size 1000
checkpoint-400/trainer_state.json ADDED
@@ -0,0 +1,385 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.4464975595474243,
3
+ "best_model_checkpoint": "./llama3/30-08-24-Weni-Pipeline_test_Experiment with SFT and Llama3 70b-2_max_steps-1362_batch_8_2024-08-30/checkpoint-400",
4
+ "epoch": 1.76017601760176,
5
+ "eval_steps": 100,
6
+ "global_step": 400,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.04400440044004401,
13
+ "grad_norm": 0.5568628907203674,
14
+ "learning_rate": 7.5e-05,
15
+ "loss": 2.0875,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.08800880088008801,
20
+ "grad_norm": 0.2537558972835541,
21
+ "learning_rate": 0.00015,
22
+ "loss": 0.9378,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.132013201320132,
27
+ "grad_norm": 0.24558919668197632,
28
+ "learning_rate": 0.000225,
29
+ "loss": 0.7,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.17601760176017603,
34
+ "grad_norm": 0.13937097787857056,
35
+ "learning_rate": 0.0003,
36
+ "loss": 0.6298,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.22002200220022003,
41
+ "grad_norm": 0.1871194988489151,
42
+ "learning_rate": 0.00029995764763563235,
43
+ "loss": 0.6321,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.264026402640264,
48
+ "grad_norm": 0.14626263082027435,
49
+ "learning_rate": 0.00029983061445883305,
50
+ "loss": 0.6403,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.30803080308030806,
55
+ "grad_norm": 0.12049665302038193,
56
+ "learning_rate": 0.0002996189722050073,
57
+ "loss": 0.5998,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 0.35203520352035206,
62
+ "grad_norm": 0.13617923855781555,
63
+ "learning_rate": 0.0002993228403881531,
64
+ "loss": 0.5942,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.39603960396039606,
69
+ "grad_norm": 0.1271793246269226,
70
+ "learning_rate": 0.00029894238623337174,
71
+ "loss": 0.5647,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 0.44004400440044006,
76
+ "grad_norm": 0.18757876753807068,
77
+ "learning_rate": 0.00029847782458243663,
78
+ "loss": 0.5619,
79
+ "step": 100
80
+ },
81
+ {
82
+ "epoch": 0.44004400440044006,
83
+ "eval_accuracy": 1.0,
84
+ "eval_f1": 1.0,
85
+ "eval_f1_macro": 1.0,
86
+ "eval_f1_micro": 1.0,
87
+ "eval_loss": 0.5742923021316528,
88
+ "eval_precision": 1.0,
89
+ "eval_precision_macro": 1.0,
90
+ "eval_precision_micro": 1.0,
91
+ "eval_recall": 1.0,
92
+ "eval_recall_macro": 1.0,
93
+ "eval_recall_micro": 1.0,
94
+ "eval_runtime": 90.5857,
95
+ "eval_samples_per_second": 4.46,
96
+ "eval_steps_per_second": 1.115,
97
+ "step": 100
98
+ },
99
+ {
100
+ "epoch": 0.48404840484048406,
101
+ "grad_norm": 0.14132679998874664,
102
+ "learning_rate": 0.00029792941777247184,
103
+ "loss": 0.5584,
104
+ "step": 110
105
+ },
106
+ {
107
+ "epoch": 0.528052805280528,
108
+ "grad_norm": 0.15474887192249298,
109
+ "learning_rate": 0.0002972974754878111,
110
+ "loss": 0.5752,
111
+ "step": 120
112
+ },
113
+ {
114
+ "epoch": 0.5720572057205721,
115
+ "grad_norm": 0.13014496862888336,
116
+ "learning_rate": 0.0002965823545851199,
117
+ "loss": 0.5565,
118
+ "step": 130
119
+ },
120
+ {
121
+ "epoch": 0.6160616061606161,
122
+ "grad_norm": 0.12456662207841873,
123
+ "learning_rate": 0.00029578445889187865,
124
+ "loss": 0.5722,
125
+ "step": 140
126
+ },
127
+ {
128
+ "epoch": 0.6600660066006601,
129
+ "grad_norm": 0.12824317812919617,
130
+ "learning_rate": 0.00029490423897834234,
131
+ "loss": 0.523,
132
+ "step": 150
133
+ },
134
+ {
135
+ "epoch": 0.7040704070407041,
136
+ "grad_norm": 0.14279119670391083,
137
+ "learning_rate": 0.0002939421919031044,
138
+ "loss": 0.5523,
139
+ "step": 160
140
+ },
141
+ {
142
+ "epoch": 0.7480748074807481,
143
+ "grad_norm": 0.11781885474920273,
144
+ "learning_rate": 0.00029289886093240847,
145
+ "loss": 0.5291,
146
+ "step": 170
147
+ },
148
+ {
149
+ "epoch": 0.7920792079207921,
150
+ "grad_norm": 0.1608349233865738,
151
+ "learning_rate": 0.0002917748352333667,
152
+ "loss": 0.5417,
153
+ "step": 180
154
+ },
155
+ {
156
+ "epoch": 0.8360836083608361,
157
+ "grad_norm": 0.13777320086956024,
158
+ "learning_rate": 0.0002905707495412589,
159
+ "loss": 0.4967,
160
+ "step": 190
161
+ },
162
+ {
163
+ "epoch": 0.8800880088008801,
164
+ "grad_norm": 0.21577192842960358,
165
+ "learning_rate": 0.00028928728380109764,
166
+ "loss": 0.6545,
167
+ "step": 200
168
+ },
169
+ {
170
+ "epoch": 0.8800880088008801,
171
+ "eval_accuracy": 1.0,
172
+ "eval_f1": 1.0,
173
+ "eval_f1_macro": 1.0,
174
+ "eval_f1_micro": 1.0,
175
+ "eval_loss": 0.6772989630699158,
176
+ "eval_precision": 1.0,
177
+ "eval_precision_macro": 1.0,
178
+ "eval_precision_micro": 1.0,
179
+ "eval_recall": 1.0,
180
+ "eval_recall_macro": 1.0,
181
+ "eval_recall_micro": 1.0,
182
+ "eval_runtime": 90.2067,
183
+ "eval_samples_per_second": 4.479,
184
+ "eval_steps_per_second": 1.12,
185
+ "step": 200
186
+ },
187
+ {
188
+ "epoch": 0.9240924092409241,
189
+ "grad_norm": 3.4556286334991455,
190
+ "learning_rate": 0.00028792516278366547,
191
+ "loss": 2.5144,
192
+ "step": 210
193
+ },
194
+ {
195
+ "epoch": 0.9680968096809681,
196
+ "grad_norm": 0.13046959042549133,
197
+ "learning_rate": 0.00028648515567623764,
198
+ "loss": 0.6004,
199
+ "step": 220
200
+ },
201
+ {
202
+ "epoch": 1.012101210121012,
203
+ "grad_norm": 0.39528125524520874,
204
+ "learning_rate": 0.0002849680756482235,
205
+ "loss": 0.5174,
206
+ "step": 230
207
+ },
208
+ {
209
+ "epoch": 1.056105610561056,
210
+ "grad_norm": 0.23764920234680176,
211
+ "learning_rate": 0.00028337477939197135,
212
+ "loss": 0.4065,
213
+ "step": 240
214
+ },
215
+ {
216
+ "epoch": 1.1001100110011002,
217
+ "grad_norm": 0.14821326732635498,
218
+ "learning_rate": 0.0002817061666389958,
219
+ "loss": 0.4425,
220
+ "step": 250
221
+ },
222
+ {
223
+ "epoch": 1.1441144114411441,
224
+ "grad_norm": 0.15903742611408234,
225
+ "learning_rate": 0.0002799631796519007,
226
+ "loss": 0.4107,
227
+ "step": 260
228
+ },
229
+ {
230
+ "epoch": 1.188118811881188,
231
+ "grad_norm": 0.16189326345920563,
232
+ "learning_rate": 0.00027814680269228574,
233
+ "loss": 0.4215,
234
+ "step": 270
235
+ },
236
+ {
237
+ "epoch": 1.2321232123212322,
238
+ "grad_norm": 0.20622508227825165,
239
+ "learning_rate": 0.00027625806146493523,
240
+ "loss": 0.3968,
241
+ "step": 280
242
+ },
243
+ {
244
+ "epoch": 1.2761276127612762,
245
+ "grad_norm": 0.17622467875480652,
246
+ "learning_rate": 0.0002742980225386045,
247
+ "loss": 0.4419,
248
+ "step": 290
249
+ },
250
+ {
251
+ "epoch": 1.3201320132013201,
252
+ "grad_norm": 0.12472284585237503,
253
+ "learning_rate": 0.0002722677927437307,
254
+ "loss": 0.396,
255
+ "step": 300
256
+ },
257
+ {
258
+ "epoch": 1.3201320132013201,
259
+ "eval_accuracy": 1.0,
260
+ "eval_f1": 1.0,
261
+ "eval_f1_macro": 1.0,
262
+ "eval_f1_micro": 1.0,
263
+ "eval_loss": 0.5126128792762756,
264
+ "eval_precision": 1.0,
265
+ "eval_precision_macro": 1.0,
266
+ "eval_precision_micro": 1.0,
267
+ "eval_recall": 1.0,
268
+ "eval_recall_macro": 1.0,
269
+ "eval_recall_micro": 1.0,
270
+ "eval_runtime": 90.5135,
271
+ "eval_samples_per_second": 4.463,
272
+ "eval_steps_per_second": 1.116,
273
+ "step": 300
274
+ },
275
+ {
276
+ "epoch": 1.364136413641364,
277
+ "grad_norm": 0.17043285071849823,
278
+ "learning_rate": 0.0002701685185474076,
279
+ "loss": 0.4297,
280
+ "step": 310
281
+ },
282
+ {
283
+ "epoch": 1.408140814081408,
284
+ "grad_norm": 0.12771165370941162,
285
+ "learning_rate": 0.00026800138540597723,
286
+ "loss": 0.4174,
287
+ "step": 320
288
+ },
289
+ {
290
+ "epoch": 1.4521452145214522,
291
+ "grad_norm": 0.18149854242801666,
292
+ "learning_rate": 0.00026576761709560555,
293
+ "loss": 0.4313,
294
+ "step": 330
295
+ },
296
+ {
297
+ "epoch": 1.4961496149614961,
298
+ "grad_norm": 0.17737938463687897,
299
+ "learning_rate": 0.00026346847502121783,
300
+ "loss": 0.3693,
301
+ "step": 340
302
+ },
303
+ {
304
+ "epoch": 1.5401540154015403,
305
+ "grad_norm": 1.699652910232544,
306
+ "learning_rate": 0.0002611052575041856,
307
+ "loss": 0.4237,
308
+ "step": 350
309
+ },
310
+ {
311
+ "epoch": 1.5841584158415842,
312
+ "grad_norm": 0.18545816838741302,
313
+ "learning_rate": 0.00025867929904916704,
314
+ "loss": 0.3925,
315
+ "step": 360
316
+ },
317
+ {
318
+ "epoch": 1.6281628162816282,
319
+ "grad_norm": 0.16388830542564392,
320
+ "learning_rate": 0.0002561919695905145,
321
+ "loss": 0.3759,
322
+ "step": 370
323
+ },
324
+ {
325
+ "epoch": 1.6721672167216721,
326
+ "grad_norm": 0.1380474716424942,
327
+ "learning_rate": 0.0002536446737186751,
328
+ "loss": 0.3859,
329
+ "step": 380
330
+ },
331
+ {
332
+ "epoch": 1.716171617161716,
333
+ "grad_norm": 0.16071344912052155,
334
+ "learning_rate": 0.0002510388498870211,
335
+ "loss": 0.41,
336
+ "step": 390
337
+ },
338
+ {
339
+ "epoch": 1.76017601760176,
340
+ "grad_norm": 0.15108689665794373,
341
+ "learning_rate": 0.00024837596959955777,
342
+ "loss": 0.3645,
343
+ "step": 400
344
+ },
345
+ {
346
+ "epoch": 1.76017601760176,
347
+ "eval_accuracy": 1.0,
348
+ "eval_f1": 1.0,
349
+ "eval_f1_macro": 1.0,
350
+ "eval_f1_micro": 1.0,
351
+ "eval_loss": 0.4464975595474243,
352
+ "eval_precision": 1.0,
353
+ "eval_precision_macro": 1.0,
354
+ "eval_precision_micro": 1.0,
355
+ "eval_recall": 1.0,
356
+ "eval_recall_macro": 1.0,
357
+ "eval_recall_micro": 1.0,
358
+ "eval_runtime": 90.5484,
359
+ "eval_samples_per_second": 4.462,
360
+ "eval_steps_per_second": 1.115,
361
+ "step": 400
362
+ }
363
+ ],
364
+ "logging_steps": 10,
365
+ "max_steps": 1362,
366
+ "num_input_tokens_seen": 0,
367
+ "num_train_epochs": 6,
368
+ "save_steps": 100,
369
+ "stateful_callbacks": {
370
+ "TrainerControl": {
371
+ "args": {
372
+ "should_epoch_stop": false,
373
+ "should_evaluate": false,
374
+ "should_log": false,
375
+ "should_save": true,
376
+ "should_training_stop": false
377
+ },
378
+ "attributes": {}
379
+ }
380
+ },
381
+ "total_flos": 5.589901503407063e+17,
382
+ "train_batch_size": 2,
383
+ "trial_name": null,
384
+ "trial_params": null
385
+ }