| Tasks |Version|Filter|n-shot|Metric|Value | |Stderr|
|---------------------------------------|-------|------|-----:|------|-----:|---|-----:|
|mmlu |N/A |none | 0|acc |0.4082|± |0.0041|
| - humanities |N/A |none | 0|acc |0.3783|± |0.0069|
| - formal_logic | 0|none | 0|acc |0.3333|± |0.0422|
| - high_school_european_history | 0|none | 0|acc |0.5818|± |0.0385|
| - high_school_us_history | 0|none | 0|acc |0.4804|± |0.0351|
| - high_school_world_history | 0|none | 0|acc |0.5781|± |0.0321|
| - international_law | 0|none | 0|acc |0.5372|± |0.0455|
| - jurisprudence | 0|none | 0|acc |0.4352|± |0.0479|
| - logical_fallacies | 0|none | 0|acc |0.4785|± |0.0392|
| - moral_disputes | 0|none | 0|acc |0.4017|± |0.0264|
| - moral_scenarios | 0|none | 0|acc |0.2413|± |0.0143|
| - philosophy | 0|none | 0|acc |0.4952|± |0.0284|
| - prehistory | 0|none | 0|acc |0.4321|± |0.0276|
| - professional_law | 0|none | 0|acc |0.3031|± |0.0117|
| - world_religions | 0|none | 0|acc |0.6023|± |0.0375|
| - other |N/A |none | 0|acc |0.4606|± |0.0089|
| - business_ethics | 0|none | 0|acc |0.4700|± |0.0502|
| - clinical_knowledge | 0|none | 0|acc |0.4717|± |0.0307|
| - college_medicine | 0|none | 0|acc |0.4509|± |0.0379|
| - global_facts | 0|none | 0|acc |0.3000|± |0.0461|
| - human_aging | 0|none | 0|acc |0.3946|± |0.0328|
| - management | 0|none | 0|acc |0.4369|± |0.0491|
| - marketing | 0|none | 0|acc |0.5940|± |0.0322|
| - medical_genetics | 0|none | 0|acc |0.5000|± |0.0503|
| - miscellaneous | 0|none | 0|acc |0.5223|± |0.0179|
| - nutrition | 0|none | 0|acc |0.4444|± |0.0285|
| - professional_accounting | 0|none | 0|acc |0.3794|± |0.0289|
| - professional_medicine | 0|none | 0|acc |0.4228|± |0.0300|
| - virology | 0|none | 0|acc |0.3735|± |0.0377|
| - social_sciences |N/A |none | 0|acc |0.4621|± |0.0089|
| - econometrics | 0|none | 0|acc |0.2105|± |0.0384|
| - high_school_geography | 0|none | 0|acc |0.4545|± |0.0355|
| - high_school_government_and_politics| 0|none | 0|acc |0.5440|± |0.0359|
| - high_school_macroeconomics | 0|none | 0|acc |0.3872|± |0.0247|
| - high_school_microeconomics | 0|none | 0|acc |0.3529|± |0.0310|
| - high_school_psychology | 0|none | 0|acc |0.5468|± |0.0213|
| - human_sexuality | 0|none | 0|acc |0.5115|± |0.0438|
| - professional_psychology | 0|none | 0|acc |0.4297|± |0.0200|
| - public_relations | 0|none | 0|acc |0.4273|± |0.0474|
| - security_studies | 0|none | 0|acc |0.4571|± |0.0319|
| - sociology | 0|none | 0|acc |0.5920|± |0.0348|
| - us_foreign_policy | 0|none | 0|acc |0.6200|± |0.0488|
| - stem |N/A |none | 0|acc |0.3486|± |0.0084|
| - abstract_algebra | 0|none | 0|acc |0.2800|± |0.0451|
| - anatomy | 0|none | 0|acc |0.4148|± |0.0426|
| - astronomy | 0|none | 0|acc |0.4079|± |0.0400|
| - college_biology | 0|none | 0|acc |0.4583|± |0.0417|
| - college_chemistry | 0|none | 0|acc |0.3500|± |0.0479|
| - college_computer_science | 0|none | 0|acc |0.3200|± |0.0469|
| - college_mathematics | 0|none | 0|acc |0.3100|± |0.0465|
| - college_physics | 0|none | 0|acc |0.3137|± |0.0462|
| - computer_security | 0|none | 0|acc |0.5200|± |0.0502|
| - conceptual_physics | 0|none | 0|acc |0.3489|± |0.0312|
| - electrical_engineering | 0|none | 0|acc |0.3655|± |0.0401|
| - elementary_mathematics | 0|none | 0|acc |0.2725|± |0.0229|
| - high_school_biology | 0|none | 0|acc |0.4581|± |0.0283|
| - high_school_chemistry | 0|none | 0|acc |0.3399|± |0.0333|
| - high_school_computer_science | 0|none | 0|acc |0.4100|± |0.0494|
| - high_school_mathematics | 0|none | 0|acc |0.2407|± |0.0261|
| - high_school_physics | 0|none | 0|acc |0.2914|± |0.0371|
| - high_school_statistics | 0|none | 0|acc |0.2870|± |0.0309|
| - machine_learning | 0|none | 0|acc |0.3929|± |0.0464|