beamaia commited on
Commit
9756cfc
1 Parent(s): f98097a

Training in progress, step 200, checkpoint

Browse files
.gitattributes CHANGED
@@ -38,3 +38,8 @@ checkpoint-100/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
38
  checkpoint-100/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
39
  checkpoint-100/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
40
  checkpoint-100/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
38
  checkpoint-100/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
39
  checkpoint-100/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
40
  checkpoint-100/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
41
+ checkpoint-200/optimizer_0/.metadata filter=lfs diff=lfs merge=lfs -text
42
+ checkpoint-200/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
43
+ checkpoint-200/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
44
+ checkpoint-200/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
45
+ checkpoint-200/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
checkpoint-200/optimizer_0/.metadata ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a74ba8fcf2d857e573ced9e8ccd472ece612ef1ca47c4379e8bbc05bf43f4fa8
3
+ size 2108254
checkpoint-200/optimizer_0/__0_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:674032aaae689e20caaff7e152cbd81a796a39b1720c9d8499e2e8bfe020d17c
3
+ size 13256787644
checkpoint-200/optimizer_0/__1_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7f0533ae806729f12554e1303ee02f6956c3b534ee929f5f3a4bf1db4434793
3
+ size 13257964260
checkpoint-200/pytorch_model_fsdp_0/.metadata ADDED
Binary file (734 kB). View file
 
checkpoint-200/pytorch_model_fsdp_0/__0_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f99269a6b3feba7dcd3885f3d103cdec8bb7a514423ae942dfd7db8b418e8225
3
+ size 6628321920
checkpoint-200/pytorch_model_fsdp_0/__1_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5dd01741a4ccc7edbdfc39cfd832497ccf67fa73c32959b4325d940eb95fb46
3
+ size 6628321920
checkpoint-200/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:985be3c100966af62f00b8631c27deac90e52e423a7e4a2c3255b59f4da8eee1
3
  size 14512
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:41cb1fd0cf02312df7c9db483296344d5cea23a620661cf3977d37a50cab42f0
3
  size 14512
checkpoint-200/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f1e9b24a21ea7d3a361422caff7c5c58bb120577ea3bdc81d81a35c85fe0c4b8
3
  size 14512
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4db18a80bfe3ec5bfeabaed27eedc5b6daa844f4842bf328e9ce888aeb18adc5
3
  size 14512
checkpoint-200/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
- "best_metric": 0.6097185015678406,
3
- "best_model_checkpoint": "./llama3/28-08-24-Weni-Pipeline_test_Experiment with SFT and Llama3 70b-2_max_steps-1362_batch_16_2024-08-28/checkpoint-200",
4
- "epoch": 0.4404073768235618,
5
  "eval_steps": 100,
6
  "global_step": 200,
7
  "is_hyper_param_search": false,
@@ -9,186 +9,186 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.02202036884117809,
13
- "grad_norm": 0.6965782642364502,
14
  "learning_rate": 7.5e-05,
15
- "loss": 2.1201,
16
  "step": 10
17
  },
18
  {
19
- "epoch": 0.04404073768235618,
20
- "grad_norm": 0.7504029870033264,
21
  "learning_rate": 0.00015,
22
- "loss": 0.9392,
23
  "step": 20
24
  },
25
  {
26
- "epoch": 0.06606110652353427,
27
- "grad_norm": 0.7269854545593262,
28
  "learning_rate": 0.000225,
29
- "loss": 0.7958,
30
  "step": 30
31
  },
32
  {
33
- "epoch": 0.08808147536471236,
34
- "grad_norm": 0.15891791880130768,
35
  "learning_rate": 0.0003,
36
- "loss": 0.7251,
37
  "step": 40
38
  },
39
  {
40
- "epoch": 0.11010184420589045,
41
- "grad_norm": 0.14764881134033203,
42
  "learning_rate": 0.00029995764763563235,
43
- "loss": 0.6941,
44
  "step": 50
45
  },
46
  {
47
- "epoch": 0.13212221304706853,
48
- "grad_norm": 0.11882930248975754,
49
  "learning_rate": 0.00029983061445883305,
50
- "loss": 0.673,
51
  "step": 60
52
  },
53
  {
54
- "epoch": 0.15414258188824662,
55
- "grad_norm": 0.15152081847190857,
56
  "learning_rate": 0.0002996189722050073,
57
- "loss": 0.6428,
58
  "step": 70
59
  },
60
  {
61
- "epoch": 0.17616295072942473,
62
- "grad_norm": 0.1619480848312378,
63
  "learning_rate": 0.0002993228403881531,
64
- "loss": 0.6465,
65
  "step": 80
66
  },
67
  {
68
- "epoch": 0.1981833195706028,
69
- "grad_norm": 0.15019242465496063,
70
  "learning_rate": 0.00029894238623337174,
71
- "loss": 0.6308,
72
  "step": 90
73
  },
74
  {
75
- "epoch": 0.2202036884117809,
76
- "grad_norm": 0.14553773403167725,
77
  "learning_rate": 0.00029847782458243663,
78
- "loss": 0.6314,
79
  "step": 100
80
  },
81
  {
82
- "epoch": 0.2202036884117809,
83
  "eval_accuracy": 1.0,
84
  "eval_f1": 1.0,
85
  "eval_f1_macro": 1.0,
86
  "eval_f1_micro": 1.0,
87
- "eval_loss": 0.6374139189720154,
88
  "eval_precision": 1.0,
89
  "eval_precision_macro": 1.0,
90
  "eval_precision_micro": 1.0,
91
  "eval_recall": 1.0,
92
  "eval_recall_macro": 1.0,
93
  "eval_recall_micro": 1.0,
94
- "eval_runtime": 1957.1676,
95
- "eval_samples_per_second": 0.413,
96
- "eval_steps_per_second": 0.206,
97
  "step": 100
98
  },
99
  {
100
- "epoch": 0.24222405725295898,
101
- "grad_norm": 0.20665155351161957,
102
  "learning_rate": 0.00029792941777247184,
103
- "loss": 0.646,
104
  "step": 110
105
  },
106
  {
107
- "epoch": 0.26424442609413706,
108
- "grad_norm": 0.18165886402130127,
109
  "learning_rate": 0.0002972974754878111,
110
- "loss": 0.6326,
111
  "step": 120
112
  },
113
  {
114
- "epoch": 0.28626479493531515,
115
- "grad_norm": 0.13634872436523438,
116
  "learning_rate": 0.0002965823545851199,
117
- "loss": 0.5785,
118
  "step": 130
119
  },
120
  {
121
- "epoch": 0.30828516377649323,
122
- "grad_norm": 0.14753705263137817,
123
  "learning_rate": 0.00029578445889187865,
124
- "loss": 0.639,
125
  "step": 140
126
  },
127
  {
128
- "epoch": 0.33030553261767137,
129
- "grad_norm": 0.2674409747123718,
130
  "learning_rate": 0.00029490423897834234,
131
- "loss": 0.626,
132
  "step": 150
133
  },
134
  {
135
- "epoch": 0.35232590145884946,
136
- "grad_norm": 0.16381941735744476,
137
  "learning_rate": 0.0002939421919031044,
138
- "loss": 0.6009,
139
  "step": 160
140
  },
141
  {
142
- "epoch": 0.37434627030002754,
143
- "grad_norm": 0.1673993617296219,
144
  "learning_rate": 0.00029289886093240847,
145
- "loss": 0.6193,
146
  "step": 170
147
  },
148
  {
149
- "epoch": 0.3963666391412056,
150
- "grad_norm": 0.16463495790958405,
151
  "learning_rate": 0.0002917748352333667,
152
- "loss": 0.5894,
153
  "step": 180
154
  },
155
  {
156
- "epoch": 0.4183870079823837,
157
- "grad_norm": 0.17818249762058258,
158
  "learning_rate": 0.0002905707495412589,
159
- "loss": 0.6249,
160
  "step": 190
161
  },
162
  {
163
- "epoch": 0.4404073768235618,
164
- "grad_norm": 0.14480963349342346,
165
  "learning_rate": 0.00028928728380109764,
166
- "loss": 0.6223,
167
  "step": 200
168
  },
169
  {
170
- "epoch": 0.4404073768235618,
171
  "eval_accuracy": 1.0,
172
  "eval_f1": 1.0,
173
  "eval_f1_macro": 1.0,
174
  "eval_f1_micro": 1.0,
175
- "eval_loss": 0.6097185015678406,
176
  "eval_precision": 1.0,
177
  "eval_precision_macro": 1.0,
178
  "eval_precision_micro": 1.0,
179
  "eval_recall": 1.0,
180
  "eval_recall_macro": 1.0,
181
  "eval_recall_micro": 1.0,
182
- "eval_runtime": 1863.6968,
183
- "eval_samples_per_second": 0.434,
184
- "eval_steps_per_second": 0.217,
185
  "step": 200
186
  }
187
  ],
188
  "logging_steps": 10,
189
  "max_steps": 1362,
190
  "num_input_tokens_seen": 0,
191
- "num_train_epochs": 3,
192
  "save_steps": 100,
193
  "stateful_callbacks": {
194
  "TrainerControl": {
@@ -202,7 +202,7 @@
202
  "attributes": {}
203
  }
204
  },
205
- "total_flos": 9495153475584.0,
206
  "train_batch_size": 2,
207
  "trial_name": null,
208
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.5742923021316528,
3
+ "best_model_checkpoint": "./llama3/30-08-24-Weni-Pipeline_test_Experiment with SFT and Llama3 70b-2_max_steps-1362_batch_8_2024-08-30/checkpoint-100",
4
+ "epoch": 0.8800880088008801,
5
  "eval_steps": 100,
6
  "global_step": 200,
7
  "is_hyper_param_search": false,
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.04400440044004401,
13
+ "grad_norm": 0.5568628907203674,
14
  "learning_rate": 7.5e-05,
15
+ "loss": 2.0875,
16
  "step": 10
17
  },
18
  {
19
+ "epoch": 0.08800880088008801,
20
+ "grad_norm": 0.2537558972835541,
21
  "learning_rate": 0.00015,
22
+ "loss": 0.9378,
23
  "step": 20
24
  },
25
  {
26
+ "epoch": 0.132013201320132,
27
+ "grad_norm": 0.24558919668197632,
28
  "learning_rate": 0.000225,
29
+ "loss": 0.7,
30
  "step": 30
31
  },
32
  {
33
+ "epoch": 0.17601760176017603,
34
+ "grad_norm": 0.13937097787857056,
35
  "learning_rate": 0.0003,
36
+ "loss": 0.6298,
37
  "step": 40
38
  },
39
  {
40
+ "epoch": 0.22002200220022003,
41
+ "grad_norm": 0.1871194988489151,
42
  "learning_rate": 0.00029995764763563235,
43
+ "loss": 0.6321,
44
  "step": 50
45
  },
46
  {
47
+ "epoch": 0.264026402640264,
48
+ "grad_norm": 0.14626263082027435,
49
  "learning_rate": 0.00029983061445883305,
50
+ "loss": 0.6403,
51
  "step": 60
52
  },
53
  {
54
+ "epoch": 0.30803080308030806,
55
+ "grad_norm": 0.12049665302038193,
56
  "learning_rate": 0.0002996189722050073,
57
+ "loss": 0.5998,
58
  "step": 70
59
  },
60
  {
61
+ "epoch": 0.35203520352035206,
62
+ "grad_norm": 0.13617923855781555,
63
  "learning_rate": 0.0002993228403881531,
64
+ "loss": 0.5942,
65
  "step": 80
66
  },
67
  {
68
+ "epoch": 0.39603960396039606,
69
+ "grad_norm": 0.1271793246269226,
70
  "learning_rate": 0.00029894238623337174,
71
+ "loss": 0.5647,
72
  "step": 90
73
  },
74
  {
75
+ "epoch": 0.44004400440044006,
76
+ "grad_norm": 0.18757876753807068,
77
  "learning_rate": 0.00029847782458243663,
78
+ "loss": 0.5619,
79
  "step": 100
80
  },
81
  {
82
+ "epoch": 0.44004400440044006,
83
  "eval_accuracy": 1.0,
84
  "eval_f1": 1.0,
85
  "eval_f1_macro": 1.0,
86
  "eval_f1_micro": 1.0,
87
+ "eval_loss": 0.5742923021316528,
88
  "eval_precision": 1.0,
89
  "eval_precision_macro": 1.0,
90
  "eval_precision_micro": 1.0,
91
  "eval_recall": 1.0,
92
  "eval_recall_macro": 1.0,
93
  "eval_recall_micro": 1.0,
94
+ "eval_runtime": 90.5857,
95
+ "eval_samples_per_second": 4.46,
96
+ "eval_steps_per_second": 1.115,
97
  "step": 100
98
  },
99
  {
100
+ "epoch": 0.48404840484048406,
101
+ "grad_norm": 0.14132679998874664,
102
  "learning_rate": 0.00029792941777247184,
103
+ "loss": 0.5584,
104
  "step": 110
105
  },
106
  {
107
+ "epoch": 0.528052805280528,
108
+ "grad_norm": 0.15474887192249298,
109
  "learning_rate": 0.0002972974754878111,
110
+ "loss": 0.5752,
111
  "step": 120
112
  },
113
  {
114
+ "epoch": 0.5720572057205721,
115
+ "grad_norm": 0.13014496862888336,
116
  "learning_rate": 0.0002965823545851199,
117
+ "loss": 0.5565,
118
  "step": 130
119
  },
120
  {
121
+ "epoch": 0.6160616061606161,
122
+ "grad_norm": 0.12456662207841873,
123
  "learning_rate": 0.00029578445889187865,
124
+ "loss": 0.5722,
125
  "step": 140
126
  },
127
  {
128
+ "epoch": 0.6600660066006601,
129
+ "grad_norm": 0.12824317812919617,
130
  "learning_rate": 0.00029490423897834234,
131
+ "loss": 0.523,
132
  "step": 150
133
  },
134
  {
135
+ "epoch": 0.7040704070407041,
136
+ "grad_norm": 0.14279119670391083,
137
  "learning_rate": 0.0002939421919031044,
138
+ "loss": 0.5523,
139
  "step": 160
140
  },
141
  {
142
+ "epoch": 0.7480748074807481,
143
+ "grad_norm": 0.11781885474920273,
144
  "learning_rate": 0.00029289886093240847,
145
+ "loss": 0.5291,
146
  "step": 170
147
  },
148
  {
149
+ "epoch": 0.7920792079207921,
150
+ "grad_norm": 0.1608349233865738,
151
  "learning_rate": 0.0002917748352333667,
152
+ "loss": 0.5417,
153
  "step": 180
154
  },
155
  {
156
+ "epoch": 0.8360836083608361,
157
+ "grad_norm": 0.13777320086956024,
158
  "learning_rate": 0.0002905707495412589,
159
+ "loss": 0.4967,
160
  "step": 190
161
  },
162
  {
163
+ "epoch": 0.8800880088008801,
164
+ "grad_norm": 0.21577192842960358,
165
  "learning_rate": 0.00028928728380109764,
166
+ "loss": 0.6545,
167
  "step": 200
168
  },
169
  {
170
+ "epoch": 0.8800880088008801,
171
  "eval_accuracy": 1.0,
172
  "eval_f1": 1.0,
173
  "eval_f1_macro": 1.0,
174
  "eval_f1_micro": 1.0,
175
+ "eval_loss": 0.6772989630699158,
176
  "eval_precision": 1.0,
177
  "eval_precision_macro": 1.0,
178
  "eval_precision_micro": 1.0,
179
  "eval_recall": 1.0,
180
  "eval_recall_macro": 1.0,
181
  "eval_recall_micro": 1.0,
182
+ "eval_runtime": 90.2067,
183
+ "eval_samples_per_second": 4.479,
184
+ "eval_steps_per_second": 1.12,
185
  "step": 200
186
  }
187
  ],
188
  "logging_steps": 10,
189
  "max_steps": 1362,
190
  "num_input_tokens_seen": 0,
191
+ "num_train_epochs": 6,
192
  "save_steps": 100,
193
  "stateful_callbacks": {
194
  "TrainerControl": {
 
202
  "attributes": {}
203
  }
204
  },
205
+ "total_flos": 2.774286031954903e+17,
206
  "train_batch_size": 2,
207
  "trial_name": null,
208
  "trial_params": null