Update README.md
Browse files
README.md
CHANGED
@@ -88,15 +88,15 @@ alignment-handbook DPO with UNA on top of the SFT lora.
|
|
88 |
|
89 |
### Evaluation lm-evaluation-harness
|
90 |
|
91 |
-
#### GSM8K
|
92 |
```
|
93 |
-
hf (pretrained
|
94 |
```
|
95 |
|Tasks|Version| Filter | Metric |Value | |Stderr|
|
96 |
|-----|-------|----------|-----------|-----:|---|-----:|
|
97 |
-
|gsm8k|Yaml |get-answer|exact_match|0.
|
98 |
|
99 |
-
#### 0-Shot
|
100 |
```
|
101 |
hf (pretrained=fblgit/juanako-7b-v1,load_in_4bit=False,dtype=float16), limit: None, num_fewshot: 0, batch_size: 8
|
102 |
```
|
@@ -120,30 +120,22 @@ hf (pretrained=fblgit/juanako-7b-v1,load_in_4bit=False,dtype=float16), limit: No
|
|
120 |
| - truthfulqa_mc2 |Yaml |none |acc | 0.5847|± |0.0153|
|
121 |
|winogrande |Yaml |none |acc | 0.7609|± |0.0120|
|
122 |
|
123 |
-
#### 1-Shot
|
124 |
```
|
125 |
-
hf (pretrained=fblgit/juanako-7b-v1,load_in_4bit=False,dtype=float16), limit: None, num_fewshot:
|
126 |
```
|
127 |
-
|
|
128 |
-
|
129 |
-
|arc_challenge
|
130 |
-
|
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
|
137 |
-
|
138 |
-
|
|
139 |
-
|
|
140 |
-
|sciq |Yaml |none |acc | 0.9730|± |0.0051|
|
141 |
-
| | |none |acc_norm | 0.9740|± |0.0050|
|
142 |
-
|truthfulqa |N/A |none |bleu_max |18.9814|± |0.4805|
|
143 |
-
| | |none |acc | 0.4856|± |0.0521|
|
144 |
-
| - truthfulqa_mc1 |Yaml |none |acc | 0.4333|± |0.0173|
|
145 |
-
| - truthfulqa_mc2 |Yaml |none |acc | 0.5903|± |0.0153|
|
146 |
-
|winogrande |Yaml |none |acc | 0.7609|± |0.0120|
|
147 |
|
148 |
## Training procedure
|
149 |
|
@@ -185,82 +177,82 @@ The following hyperparameters were used during training:
|
|
185 |
|
186 |
## MMLU Results
|
187 |
|
188 |
-
####
|
189 |
```
|
190 |
-
hf (pretrained=fblgit/juanako-7b-v1,load_in_4bit=False,dtype=float16), limit: None, num_fewshot:
|
191 |
```
|
192 |
| Tasks |Version|Filter|Metric|Value | |Stderr|
|
193 |
|---------------------------------------|-------|------|------|-----:|---|-----:|
|
194 |
-
|mmlu |N/A |none |acc |0.
|
195 |
-
| - humanities |N/A |none |acc |0.
|
196 |
-
| - formal_logic |Yaml |none |acc |0.
|
197 |
-
| - high_school_european_history |Yaml |none |acc |0.
|
198 |
-
| - high_school_us_history |Yaml |none |acc |0.
|
199 |
-
| - high_school_world_history |Yaml |none |acc |0.
|
200 |
-
| - international_law |Yaml |none |acc |0.
|
201 |
-
| - jurisprudence |Yaml |none |acc |0.
|
202 |
-
| - logical_fallacies |Yaml |none |acc |0.
|
203 |
-
| - moral_disputes |Yaml |none |acc |0.
|
204 |
-
| - moral_scenarios |Yaml |none |acc |0.
|
205 |
-
| - philosophy |Yaml |none |acc |0.
|
206 |
-
| - prehistory |Yaml |none |acc |0.
|
207 |
-
| - professional_law |Yaml |none |acc |0.
|
208 |
-
| - world_religions |Yaml |none |acc |0.
|
209 |
-
| - other |N/A |none |acc |0.
|
210 |
-
| - business_ethics |Yaml |none |acc |0.
|
211 |
-
| - clinical_knowledge |Yaml |none |acc |0.
|
212 |
-
| - college_medicine |Yaml |none |acc |0.
|
213 |
-
| - global_facts |Yaml |none |acc |0.
|
214 |
-
| - human_aging |Yaml |none |acc |0.
|
215 |
-
| - management |Yaml |none |acc |0.
|
216 |
-
| - marketing |Yaml |none |acc |0.
|
217 |
-
| - medical_genetics |Yaml |none |acc |0.
|
218 |
-
| - miscellaneous |Yaml |none |acc |0.
|
219 |
-
| - nutrition |Yaml |none |acc |0.
|
220 |
| - professional_accounting |Yaml |none |acc |0.4929|± |0.0298|
|
221 |
-
| - professional_medicine |Yaml |none |acc |0.
|
222 |
-
| - virology |Yaml |none |acc |0.
|
223 |
-
| - social_sciences |N/A |none |acc |0.
|
224 |
| - econometrics |Yaml |none |acc |0.5000|± |0.0470|
|
225 |
-
| - high_school_geography |Yaml |none |acc |0.
|
226 |
-
| - high_school_government_and_politics|Yaml |none |acc |0.
|
227 |
-
| - high_school_macroeconomics |Yaml |none |acc |0.
|
228 |
-
| - high_school_microeconomics |Yaml |none |acc |0.
|
229 |
-
| - high_school_psychology |Yaml |none |acc |0.
|
230 |
-
| - human_sexuality |Yaml |none |acc |0.
|
231 |
-
| - professional_psychology |Yaml |none |acc |0.
|
232 |
| - public_relations |Yaml |none |acc |0.6636|± |0.0453|
|
233 |
-
| - security_studies |Yaml |none |acc |0.
|
234 |
-
| - sociology |Yaml |none |acc |0.
|
235 |
-
| - us_foreign_policy |Yaml |none |acc |0.
|
236 |
-
| - stem |N/A |none |acc |0.
|
237 |
-
| - abstract_algebra |Yaml |none |acc |0.
|
238 |
-
| - anatomy |Yaml |none |acc |0.
|
239 |
-
| - astronomy |Yaml |none |acc |0.
|
240 |
-
| - college_biology |Yaml |none |acc |0.
|
241 |
-
| - college_chemistry |Yaml |none |acc |0.
|
242 |
-
| - college_computer_science |Yaml |none |acc |0.
|
243 |
-
| - college_mathematics |Yaml |none |acc |0.
|
244 |
-
| - college_physics |Yaml |none |acc |0.
|
245 |
-
| - computer_security |Yaml |none |acc |0.
|
246 |
-
| - conceptual_physics |Yaml |none |acc |0.
|
247 |
-
| - electrical_engineering |Yaml |none |acc |0.
|
248 |
-
| - elementary_mathematics |Yaml |none |acc |0.
|
249 |
-
| - high_school_biology |Yaml |none |acc |0.
|
250 |
-
| - high_school_chemistry |Yaml |none |acc |0.
|
251 |
-
| - high_school_computer_science |Yaml |none |acc |0.
|
252 |
-
| - high_school_mathematics |Yaml |none |acc |0.
|
253 |
-
| - high_school_physics |Yaml |none |acc |0.
|
254 |
| - high_school_statistics |Yaml |none |acc |0.5139|± |0.0341|
|
255 |
-
| - machine_learning |Yaml |none |acc |0.
|
256 |
|
257 |
| Groups |Version|Filter|Metric|Value | |Stderr|
|
258 |
|------------------|-------|------|------|-----:|---|-----:|
|
259 |
-
|mmlu |N/A |none |acc |0.
|
260 |
-
| - humanities |N/A |none |acc |0.
|
261 |
-
| - other |N/A |none |acc |0.
|
262 |
-
| - social_sciences|N/A |none |acc |0.
|
263 |
-
| - stem |N/A |none |acc |0.
|
264 |
|
265 |
### Citations
|
266 |
Please feel free to raise a PR if there is any missing citation.
|
|
|
88 |
|
89 |
### Evaluation lm-evaluation-harness
|
90 |
|
91 |
+
#### GSM8K 5-Shot
|
92 |
```
|
93 |
+
hf (pretrained=fblgit/juanako-7b-v1,load_in_4bit=False,dtype=float16), limit: None, num_fewshot: 5, batch_size: 4
|
94 |
```
|
95 |
|Tasks|Version| Filter | Metric |Value | |Stderr|
|
96 |
|-----|-------|----------|-----------|-----:|---|-----:|
|
97 |
+
|gsm8k|Yaml |get-answer|exact_match|0.4761|± |0.0138|
|
98 |
|
99 |
+
#### 0-Shot Tests
|
100 |
```
|
101 |
hf (pretrained=fblgit/juanako-7b-v1,load_in_4bit=False,dtype=float16), limit: None, num_fewshot: 0, batch_size: 8
|
102 |
```
|
|
|
120 |
| - truthfulqa_mc2 |Yaml |none |acc | 0.5847|± |0.0153|
|
121 |
|winogrande |Yaml |none |acc | 0.7609|± |0.0120|
|
122 |
|
|
|
123 |
```
|
124 |
+
hf (pretrained=fblgit/juanako-7b-v1,load_in_4bit=False,dtype=float16), limit: None, num_fewshot: 25, batch_size: 1
|
125 |
```
|
126 |
+
| Tasks |Version|Filter| Metric |Value | |Stderr|
|
127 |
+
|-------------|-------|------|--------|-----:|---|-----:|
|
128 |
+
|arc_challenge|Yaml |none |acc |0.6058|± |0.0143|
|
129 |
+
| | |none |acc_norm|0.6485|± |0.0140|
|
130 |
+
|
131 |
+
#### HellaSwag 10-Shot
|
132 |
+
```
|
133 |
+
hf (pretrained=fblgit/juanako-7b-v1,load_in_4bit=False,dtype=float16), limit: None, num_fewshot: 10, batch_size: 16
|
134 |
+
```
|
135 |
+
| Tasks |Version|Filter| Metric |Value | |Stderr|
|
136 |
+
|---------|-------|------|--------|-----:|---|-----:|
|
137 |
+
|hellaswag|Yaml |none |acc |0.6582|± |0.0047|
|
138 |
+
| | |none |acc_norm|0.8513|± |0.0036|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
139 |
|
140 |
## Training procedure
|
141 |
|
|
|
177 |
|
178 |
## MMLU Results
|
179 |
|
180 |
+
#### 5-Shot
|
181 |
```
|
182 |
+
hf (pretrained=fblgit/juanako-7b-v1,load_in_4bit=False,dtype=float16), limit: None, num_fewshot: 5, batch_size: 1
|
183 |
```
|
184 |
| Tasks |Version|Filter|Metric|Value | |Stderr|
|
185 |
|---------------------------------------|-------|------|------|-----:|---|-----:|
|
186 |
+
|mmlu |N/A |none |acc |0.6236|± |0.1269|
|
187 |
+
| - humanities |N/A |none |acc |0.5651|± |0.1289|
|
188 |
+
| - formal_logic |Yaml |none |acc |0.4365|± |0.0444|
|
189 |
+
| - high_school_european_history |Yaml |none |acc |0.7636|± |0.0332|
|
190 |
+
| - high_school_us_history |Yaml |none |acc |0.8039|± |0.0279|
|
191 |
+
| - high_school_world_history |Yaml |none |acc |0.7848|± |0.0268|
|
192 |
+
| - international_law |Yaml |none |acc |0.7686|± |0.0385|
|
193 |
+
| - jurisprudence |Yaml |none |acc |0.7778|± |0.0402|
|
194 |
+
| - logical_fallacies |Yaml |none |acc |0.7853|± |0.0323|
|
195 |
+
| - moral_disputes |Yaml |none |acc |0.7168|± |0.0243|
|
196 |
+
| - moral_scenarios |Yaml |none |acc |0.3207|± |0.0156|
|
197 |
+
| - philosophy |Yaml |none |acc |0.7042|± |0.0259|
|
198 |
+
| - prehistory |Yaml |none |acc |0.7593|± |0.0238|
|
199 |
+
| - professional_law |Yaml |none |acc |0.4433|± |0.0127|
|
200 |
+
| - world_religions |Yaml |none |acc |0.8363|± |0.0284|
|
201 |
+
| - other |N/A |none |acc |0.6987|± |0.1048|
|
202 |
+
| - business_ethics |Yaml |none |acc |0.5800|± |0.0496|
|
203 |
+
| - clinical_knowledge |Yaml |none |acc |0.7019|± |0.0282|
|
204 |
+
| - college_medicine |Yaml |none |acc |0.6474|± |0.0364|
|
205 |
+
| - global_facts |Yaml |none |acc |0.3900|± |0.0490|
|
206 |
+
| - human_aging |Yaml |none |acc |0.6502|± |0.0320|
|
207 |
+
| - management |Yaml |none |acc |0.7864|± |0.0406|
|
208 |
+
| - marketing |Yaml |none |acc |0.8590|± |0.0228|
|
209 |
+
| - medical_genetics |Yaml |none |acc |0.7400|± |0.0441|
|
210 |
+
| - miscellaneous |Yaml |none |acc |0.8148|± |0.0139|
|
211 |
+
| - nutrition |Yaml |none |acc |0.7418|± |0.0251|
|
212 |
| - professional_accounting |Yaml |none |acc |0.4929|± |0.0298|
|
213 |
+
| - professional_medicine |Yaml |none |acc |0.6618|± |0.0287|
|
214 |
+
| - virology |Yaml |none |acc |0.5482|± |0.0387|
|
215 |
+
| - social_sciences |N/A |none |acc |0.7361|± |0.0640|
|
216 |
| - econometrics |Yaml |none |acc |0.5000|± |0.0470|
|
217 |
+
| - high_school_geography |Yaml |none |acc |0.7727|± |0.0299|
|
218 |
+
| - high_school_government_and_politics|Yaml |none |acc |0.8808|± |0.0234|
|
219 |
+
| - high_school_macroeconomics |Yaml |none |acc |0.6667|± |0.0239|
|
220 |
+
| - high_school_microeconomics |Yaml |none |acc |0.6597|± |0.0308|
|
221 |
+
| - high_school_psychology |Yaml |none |acc |0.8202|± |0.0165|
|
222 |
+
| - human_sexuality |Yaml |none |acc |0.7939|± |0.0355|
|
223 |
+
| - professional_psychology |Yaml |none |acc |0.6716|± |0.0190|
|
224 |
| - public_relations |Yaml |none |acc |0.6636|± |0.0453|
|
225 |
+
| - security_studies |Yaml |none |acc |0.7551|± |0.0275|
|
226 |
+
| - sociology |Yaml |none |acc |0.8209|± |0.0271|
|
227 |
+
| - us_foreign_policy |Yaml |none |acc |0.8300|± |0.0378|
|
228 |
+
| - stem |N/A |none |acc |0.5268|± |0.1263|
|
229 |
+
| - abstract_algebra |Yaml |none |acc |0.3200|± |0.0469|
|
230 |
+
| - anatomy |Yaml |none |acc |0.6296|± |0.0417|
|
231 |
+
| - astronomy |Yaml |none |acc |0.6645|± |0.0384|
|
232 |
+
| - college_biology |Yaml |none |acc |0.7431|± |0.0365|
|
233 |
+
| - college_chemistry |Yaml |none |acc |0.4800|± |0.0502|
|
234 |
+
| - college_computer_science |Yaml |none |acc |0.5200|± |0.0502|
|
235 |
+
| - college_mathematics |Yaml |none |acc |0.4200|± |0.0496|
|
236 |
+
| - college_physics |Yaml |none |acc |0.4510|± |0.0495|
|
237 |
+
| - computer_security |Yaml |none |acc |0.7800|± |0.0416|
|
238 |
+
| - conceptual_physics |Yaml |none |acc |0.5489|± |0.0325|
|
239 |
+
| - electrical_engineering |Yaml |none |acc |0.5655|± |0.0413|
|
240 |
+
| - elementary_mathematics |Yaml |none |acc |0.3915|± |0.0251|
|
241 |
+
| - high_school_biology |Yaml |none |acc |0.7548|± |0.0245|
|
242 |
+
| - high_school_chemistry |Yaml |none |acc |0.5222|± |0.0351|
|
243 |
+
| - high_school_computer_science |Yaml |none |acc |0.6900|± |0.0465|
|
244 |
+
| - high_school_mathematics |Yaml |none |acc |0.3222|± |0.0285|
|
245 |
+
| - high_school_physics |Yaml |none |acc |0.3444|± |0.0388|
|
246 |
| - high_school_statistics |Yaml |none |acc |0.5139|± |0.0341|
|
247 |
+
| - machine_learning |Yaml |none |acc |0.4643|± |0.0473|
|
248 |
|
249 |
| Groups |Version|Filter|Metric|Value | |Stderr|
|
250 |
|------------------|-------|------|------|-----:|---|-----:|
|
251 |
+
|mmlu |N/A |none |acc |0.6236|± |0.1269|
|
252 |
+
| - humanities |N/A |none |acc |0.5651|± |0.1289|
|
253 |
+
| - other |N/A |none |acc |0.6987|± |0.1048|
|
254 |
+
| - social_sciences|N/A |none |acc |0.7361|± |0.0640|
|
255 |
+
| - stem |N/A |none |acc |0.5268|± |0.1263|
|
256 |
|
257 |
### Citations
|
258 |
Please feel free to raise a PR if there is any missing citation.
|