eval
Browse files
README.md
CHANGED
@@ -101,19 +101,19 @@ litgpt evaluate --tasks 'leaderboard' --out_dir 'evaluate-0/' --batch_size 4 --d
|
|
101 |
litgpt evaluate --tasks 'hellaswag,gsm8k,truthfulqa_mc2,mmlu,winogrande,arc_challenge' --out_dir 'evaluate-1/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
|
102 |
```
|
103 |
|
104 |
-
| Tasks |Version| Filter |n-shot| Metric | |Value | |Stderr|
|
105 |
-
|---------------------------------------|------:|----------------|-----:|-----------|---|-----:|---|-----:|
|
106 |
-
|arc_challenge | 1|none | 0|acc |↑ |0.1911|± |0.0115|
|
107 |
-
| | |none | 0|acc_norm |↑ |0.2355|± |0.0124|
|
108 |
-
|gsm8k | 3|flexible-extract| 5|exact_match|↑ |0.0152|± |0.0034|
|
109 |
-
| | |strict-match | 5|exact_match|↑ |0.0000|± |0.0000|
|
110 |
-
|hellaswag | 1|none | 0|acc |↑ |0.2661|± |0.0044|
|
111 |
-
| | |none | 0|acc_norm |↑ |0.2708|± |0.0044|
|
112 |
-
|mmlu | 2|none | |acc |↑ |0.2315|± |0.0036|
|
113 |
-
| - humanities | 2|none | |acc |↑ |0.2372|± |0.0062|
|
114 |
-
| - formal_logic | 1|none | 0|acc |↑ |0.2937|± |0.0407|
|
115 |
-
| - high_school_european_history | 1|none | 0|acc |↑ |0.2424|± |0.0335|
|
116 |
-
| - high_school_us_history | 1|none | 0|acc
|
117 |
| - high_school_world_history | 1|none | 0|acc |↑ |0.2321|± |0.0275|
|
118 |
| - international_law | 1|none | 0|acc |↑ |0.1983|± |0.0364|
|
119 |
| - jurisprudence | 1|none | 0|acc |↑ |0.2315|± |0.0408|
|
|
|
101 |
litgpt evaluate --tasks 'hellaswag,gsm8k,truthfulqa_mc2,mmlu,winogrande,arc_challenge' --out_dir 'evaluate-1/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
|
102 |
```
|
103 |
|
104 |
+
| Tasks |Version| Filter |n-shot| Metric | |Value | |Stderr|
|
105 |
+
|---------------------------------------|------:|----------------|-----:|-----------|---|-----:|---|-----:|
|
106 |
+
|arc_challenge | 1|none | 0|acc |↑ |0.1911|± |0.0115|
|
107 |
+
| | |none | 0|acc_norm |↑ |0.2355|± |0.0124|
|
108 |
+
|gsm8k | 3|flexible-extract| 5|exact_match|↑ |0.0152|± |0.0034|
|
109 |
+
| | |strict-match | 5|exact_match|↑ |0.0000|± |0.0000|
|
110 |
+
|hellaswag | 1|none | 0|acc |↑ |0.2661|± |0.0044|
|
111 |
+
| | |none | 0|acc_norm |↑ |0.2708|± |0.0044|
|
112 |
+
|mmlu | 2|none | |acc |↑ |0.2315|± |0.0036|
|
113 |
+
| - humanities | 2|none | |acc |↑ |0.2372|± |0.0062|
|
114 |
+
| - formal_logic | 1|none | 0|acc |↑ |0.2937|± |0.0407|
|
115 |
+
| - high_school_european_history | 1|none | 0|acc |↑ |0.2424|± |0.0335|
|
116 |
+
| - high_school_us_history | 1|none | 0|acc |↑ |0.2451|± |0.0302|
|
117 |
| - high_school_world_history | 1|none | 0|acc |↑ |0.2321|± |0.0275|
|
118 |
| - international_law | 1|none | 0|acc |↑ |0.1983|± |0.0364|
|
119 |
| - jurisprudence | 1|none | 0|acc |↑ |0.2315|± |0.0408|
|