beamaia committed on
Commit
4a0179a
1 Parent(s): 9756cfc

Training in progress, step 300, checkpoint

Browse files
.gitattributes CHANGED
@@ -43,3 +43,8 @@ checkpoint-200/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
43
  checkpoint-200/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
44
  checkpoint-200/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
45
  checkpoint-200/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
43
  checkpoint-200/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
44
  checkpoint-200/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
45
  checkpoint-200/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
46
+ checkpoint-300/optimizer_0/.metadata filter=lfs diff=lfs merge=lfs -text
47
+ checkpoint-300/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
48
+ checkpoint-300/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
49
+ checkpoint-300/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
50
+ checkpoint-300/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
checkpoint-300/optimizer_0/.metadata ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a74ba8fcf2d857e573ced9e8ccd472ece612ef1ca47c4379e8bbc05bf43f4fa8
3
+ size 2108254
checkpoint-300/optimizer_0/__0_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd83da4b3217eabac2b07a5b2394f52f53eb41ce08a2c0be6eb990aa0fb60a34
3
+ size 13256787644
checkpoint-300/optimizer_0/__1_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4cf2d9fcb2b371884c1b05c2a2b560255b21d739eb1ec167ff6a1e79289e856a
3
+ size 13257964260
checkpoint-300/pytorch_model_fsdp_0/.metadata ADDED
Binary file (734 kB). View file
 
checkpoint-300/pytorch_model_fsdp_0/__0_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff19db211147da7a07902d8ddfa2e2fdc4b39fde68ef13434824b735c816b833
3
+ size 6628321920
checkpoint-300/pytorch_model_fsdp_0/__1_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b39738a9edc1d7bbc8c5bb1b25001211e780b6870039f20096482f56805f7008
3
+ size 6628321920
checkpoint-300/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:beca2eb6acdd5a7b8641bec048f30fbea991ad0429f37793c30df5e17dab62ab
3
+ size 14512
checkpoint-300/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a3ac9c4fbcfa1a0af2c291acea2ff563bf7187bd6cacef0705850fe757f3994
3
+ size 14512
checkpoint-300/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce9a26f12dd7dd941eaff313822101cf4704fc86a85950ebbf3b355f828e2a5f
3
+ size 1000
checkpoint-300/trainer_state.json ADDED
@@ -0,0 +1,297 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.5126128792762756,
3
+ "best_model_checkpoint": "./llama3/30-08-24-Weni-Pipeline_test_Experiment with SFT and Llama3 70b-2_max_steps-1362_batch_8_2024-08-30/checkpoint-300",
4
+ "epoch": 1.3201320132013201,
5
+ "eval_steps": 100,
6
+ "global_step": 300,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.04400440044004401,
13
+ "grad_norm": 0.5568628907203674,
14
+ "learning_rate": 7.5e-05,
15
+ "loss": 2.0875,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.08800880088008801,
20
+ "grad_norm": 0.2537558972835541,
21
+ "learning_rate": 0.00015,
22
+ "loss": 0.9378,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.132013201320132,
27
+ "grad_norm": 0.24558919668197632,
28
+ "learning_rate": 0.000225,
29
+ "loss": 0.7,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.17601760176017603,
34
+ "grad_norm": 0.13937097787857056,
35
+ "learning_rate": 0.0003,
36
+ "loss": 0.6298,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.22002200220022003,
41
+ "grad_norm": 0.1871194988489151,
42
+ "learning_rate": 0.00029995764763563235,
43
+ "loss": 0.6321,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.264026402640264,
48
+ "grad_norm": 0.14626263082027435,
49
+ "learning_rate": 0.00029983061445883305,
50
+ "loss": 0.6403,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.30803080308030806,
55
+ "grad_norm": 0.12049665302038193,
56
+ "learning_rate": 0.0002996189722050073,
57
+ "loss": 0.5998,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 0.35203520352035206,
62
+ "grad_norm": 0.13617923855781555,
63
+ "learning_rate": 0.0002993228403881531,
64
+ "loss": 0.5942,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.39603960396039606,
69
+ "grad_norm": 0.1271793246269226,
70
+ "learning_rate": 0.00029894238623337174,
71
+ "loss": 0.5647,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 0.44004400440044006,
76
+ "grad_norm": 0.18757876753807068,
77
+ "learning_rate": 0.00029847782458243663,
78
+ "loss": 0.5619,
79
+ "step": 100
80
+ },
81
+ {
82
+ "epoch": 0.44004400440044006,
83
+ "eval_accuracy": 1.0,
84
+ "eval_f1": 1.0,
85
+ "eval_f1_macro": 1.0,
86
+ "eval_f1_micro": 1.0,
87
+ "eval_loss": 0.5742923021316528,
88
+ "eval_precision": 1.0,
89
+ "eval_precision_macro": 1.0,
90
+ "eval_precision_micro": 1.0,
91
+ "eval_recall": 1.0,
92
+ "eval_recall_macro": 1.0,
93
+ "eval_recall_micro": 1.0,
94
+ "eval_runtime": 90.5857,
95
+ "eval_samples_per_second": 4.46,
96
+ "eval_steps_per_second": 1.115,
97
+ "step": 100
98
+ },
99
+ {
100
+ "epoch": 0.48404840484048406,
101
+ "grad_norm": 0.14132679998874664,
102
+ "learning_rate": 0.00029792941777247184,
103
+ "loss": 0.5584,
104
+ "step": 110
105
+ },
106
+ {
107
+ "epoch": 0.528052805280528,
108
+ "grad_norm": 0.15474887192249298,
109
+ "learning_rate": 0.0002972974754878111,
110
+ "loss": 0.5752,
111
+ "step": 120
112
+ },
113
+ {
114
+ "epoch": 0.5720572057205721,
115
+ "grad_norm": 0.13014496862888336,
116
+ "learning_rate": 0.0002965823545851199,
117
+ "loss": 0.5565,
118
+ "step": 130
119
+ },
120
+ {
121
+ "epoch": 0.6160616061606161,
122
+ "grad_norm": 0.12456662207841873,
123
+ "learning_rate": 0.00029578445889187865,
124
+ "loss": 0.5722,
125
+ "step": 140
126
+ },
127
+ {
128
+ "epoch": 0.6600660066006601,
129
+ "grad_norm": 0.12824317812919617,
130
+ "learning_rate": 0.00029490423897834234,
131
+ "loss": 0.523,
132
+ "step": 150
133
+ },
134
+ {
135
+ "epoch": 0.7040704070407041,
136
+ "grad_norm": 0.14279119670391083,
137
+ "learning_rate": 0.0002939421919031044,
138
+ "loss": 0.5523,
139
+ "step": 160
140
+ },
141
+ {
142
+ "epoch": 0.7480748074807481,
143
+ "grad_norm": 0.11781885474920273,
144
+ "learning_rate": 0.00029289886093240847,
145
+ "loss": 0.5291,
146
+ "step": 170
147
+ },
148
+ {
149
+ "epoch": 0.7920792079207921,
150
+ "grad_norm": 0.1608349233865738,
151
+ "learning_rate": 0.0002917748352333667,
152
+ "loss": 0.5417,
153
+ "step": 180
154
+ },
155
+ {
156
+ "epoch": 0.8360836083608361,
157
+ "grad_norm": 0.13777320086956024,
158
+ "learning_rate": 0.0002905707495412589,
159
+ "loss": 0.4967,
160
+ "step": 190
161
+ },
162
+ {
163
+ "epoch": 0.8800880088008801,
164
+ "grad_norm": 0.21577192842960358,
165
+ "learning_rate": 0.00028928728380109764,
166
+ "loss": 0.6545,
167
+ "step": 200
168
+ },
169
+ {
170
+ "epoch": 0.8800880088008801,
171
+ "eval_accuracy": 1.0,
172
+ "eval_f1": 1.0,
173
+ "eval_f1_macro": 1.0,
174
+ "eval_f1_micro": 1.0,
175
+ "eval_loss": 0.6772989630699158,
176
+ "eval_precision": 1.0,
177
+ "eval_precision_macro": 1.0,
178
+ "eval_precision_micro": 1.0,
179
+ "eval_recall": 1.0,
180
+ "eval_recall_macro": 1.0,
181
+ "eval_recall_micro": 1.0,
182
+ "eval_runtime": 90.2067,
183
+ "eval_samples_per_second": 4.479,
184
+ "eval_steps_per_second": 1.12,
185
+ "step": 200
186
+ },
187
+ {
188
+ "epoch": 0.9240924092409241,
189
+ "grad_norm": 3.4556286334991455,
190
+ "learning_rate": 0.00028792516278366547,
191
+ "loss": 2.5144,
192
+ "step": 210
193
+ },
194
+ {
195
+ "epoch": 0.9680968096809681,
196
+ "grad_norm": 0.13046959042549133,
197
+ "learning_rate": 0.00028648515567623764,
198
+ "loss": 0.6004,
199
+ "step": 220
200
+ },
201
+ {
202
+ "epoch": 1.012101210121012,
203
+ "grad_norm": 0.39528125524520874,
204
+ "learning_rate": 0.0002849680756482235,
205
+ "loss": 0.5174,
206
+ "step": 230
207
+ },
208
+ {
209
+ "epoch": 1.056105610561056,
210
+ "grad_norm": 0.23764920234680176,
211
+ "learning_rate": 0.00028337477939197135,
212
+ "loss": 0.4065,
213
+ "step": 240
214
+ },
215
+ {
216
+ "epoch": 1.1001100110011002,
217
+ "grad_norm": 0.14821326732635498,
218
+ "learning_rate": 0.0002817061666389958,
219
+ "loss": 0.4425,
220
+ "step": 250
221
+ },
222
+ {
223
+ "epoch": 1.1441144114411441,
224
+ "grad_norm": 0.15903742611408234,
225
+ "learning_rate": 0.0002799631796519007,
226
+ "loss": 0.4107,
227
+ "step": 260
228
+ },
229
+ {
230
+ "epoch": 1.188118811881188,
231
+ "grad_norm": 0.16189326345920563,
232
+ "learning_rate": 0.00027814680269228574,
233
+ "loss": 0.4215,
234
+ "step": 270
235
+ },
236
+ {
237
+ "epoch": 1.2321232123212322,
238
+ "grad_norm": 0.20622508227825165,
239
+ "learning_rate": 0.00027625806146493523,
240
+ "loss": 0.3968,
241
+ "step": 280
242
+ },
243
+ {
244
+ "epoch": 1.2761276127612762,
245
+ "grad_norm": 0.17622467875480652,
246
+ "learning_rate": 0.0002742980225386045,
247
+ "loss": 0.4419,
248
+ "step": 290
249
+ },
250
+ {
251
+ "epoch": 1.3201320132013201,
252
+ "grad_norm": 0.12472284585237503,
253
+ "learning_rate": 0.0002722677927437307,
254
+ "loss": 0.396,
255
+ "step": 300
256
+ },
257
+ {
258
+ "epoch": 1.3201320132013201,
259
+ "eval_accuracy": 1.0,
260
+ "eval_f1": 1.0,
261
+ "eval_f1_macro": 1.0,
262
+ "eval_f1_micro": 1.0,
263
+ "eval_loss": 0.5126128792762756,
264
+ "eval_precision": 1.0,
265
+ "eval_precision_macro": 1.0,
266
+ "eval_precision_micro": 1.0,
267
+ "eval_recall": 1.0,
268
+ "eval_recall_macro": 1.0,
269
+ "eval_recall_micro": 1.0,
270
+ "eval_runtime": 90.5135,
271
+ "eval_samples_per_second": 4.463,
272
+ "eval_steps_per_second": 1.116,
273
+ "step": 300
274
+ }
275
+ ],
276
+ "logging_steps": 10,
277
+ "max_steps": 1362,
278
+ "num_input_tokens_seen": 0,
279
+ "num_train_epochs": 6,
280
+ "save_steps": 100,
281
+ "stateful_callbacks": {
282
+ "TrainerControl": {
283
+ "args": {
284
+ "should_epoch_stop": false,
285
+ "should_evaluate": false,
286
+ "should_log": false,
287
+ "should_save": true,
288
+ "should_training_stop": false
289
+ },
290
+ "attributes": {}
291
+ }
292
+ },
293
+ "total_flos": 4.182761586777129e+17,
294
+ "train_batch_size": 2,
295
+ "trial_name": null,
296
+ "trial_params": null
297
+ }