{
  "results": {
    "hendrycksTest-abstract_algebra": {
      "acc": 0.27,
      "acc_stderr": 0.0446196043338474,
      "acc_norm": 0.27,
      "acc_norm_stderr": 0.0446196043338474
    },
    "hendrycksTest-anatomy": {
      "acc": 0.4444444444444444,
      "acc_stderr": 0.04292596718256981,
      "acc_norm": 0.4444444444444444,
      "acc_norm_stderr": 0.04292596718256981
    },
    "hendrycksTest-astronomy": {
      "acc": 0.4473684210526316,
      "acc_stderr": 0.04046336883978251,
      "acc_norm": 0.4473684210526316,
      "acc_norm_stderr": 0.04046336883978251
    },
    "hendrycksTest-business_ethics": {
      "acc": 0.43,
      "acc_stderr": 0.04975698519562428,
      "acc_norm": 0.43,
      "acc_norm_stderr": 0.04975698519562428
    },
    "hendrycksTest-clinical_knowledge": {
      "acc": 0.4528301886792453,
      "acc_stderr": 0.030635627957961823,
      "acc_norm": 0.4528301886792453,
      "acc_norm_stderr": 0.030635627957961823
    },
    "hendrycksTest-college_biology": {
      "acc": 0.4513888888888889,
      "acc_stderr": 0.04161402398403279,
      "acc_norm": 0.4513888888888889,
      "acc_norm_stderr": 0.04161402398403279
    },
    "hendrycksTest-college_chemistry": {
      "acc": 0.37,
      "acc_stderr": 0.04852365870939099,
      "acc_norm": 0.37,
      "acc_norm_stderr": 0.04852365870939099
    },
    "hendrycksTest-college_computer_science": {
      "acc": 0.35,
      "acc_stderr": 0.0479372485441102,
      "acc_norm": 0.35,
      "acc_norm_stderr": 0.0479372485441102
    },
    "hendrycksTest-college_mathematics": {
      "acc": 0.23,
      "acc_stderr": 0.04229525846816508,
      "acc_norm": 0.23,
      "acc_norm_stderr": 0.04229525846816508
    },
    "hendrycksTest-college_medicine": {
      "acc": 0.3699421965317919,
      "acc_stderr": 0.036812296333943194,
      "acc_norm": 0.3699421965317919,
      "acc_norm_stderr": 0.036812296333943194
    },
    "hendrycksTest-college_physics": {
      "acc": 0.20588235294117646,
      "acc_stderr": 0.04023382273617746,
      "acc_norm": 0.20588235294117646,
      "acc_norm_stderr": 0.04023382273617746
    },
    "hendrycksTest-computer_security": {
      "acc": 0.59,
      "acc_stderr": 0.049431107042371025,
      "acc_norm": 0.59,
      "acc_norm_stderr": 0.049431107042371025
    },
    "hendrycksTest-conceptual_physics": {
      "acc": 0.39148936170212767,
      "acc_stderr": 0.03190701242326812,
      "acc_norm": 0.39148936170212767,
      "acc_norm_stderr": 0.03190701242326812
    },
    "hendrycksTest-econometrics": {
      "acc": 0.2719298245614035,
      "acc_stderr": 0.04185774424022056,
      "acc_norm": 0.2719298245614035,
      "acc_norm_stderr": 0.04185774424022056
    },
    "hendrycksTest-electrical_engineering": {
      "acc": 0.42758620689655175,
      "acc_stderr": 0.04122737111370333,
      "acc_norm": 0.42758620689655175,
      "acc_norm_stderr": 0.04122737111370333
    },
    "hendrycksTest-elementary_mathematics": {
      "acc": 0.3148148148148148,
      "acc_stderr": 0.02391998416404773,
      "acc_norm": 0.3148148148148148,
      "acc_norm_stderr": 0.02391998416404773
    },
    "hendrycksTest-formal_logic": {
      "acc": 0.2777777777777778,
      "acc_stderr": 0.04006168083848878,
      "acc_norm": 0.2777777777777778,
      "acc_norm_stderr": 0.04006168083848878
    },
    "hendrycksTest-global_facts": {
      "acc": 0.29,
      "acc_stderr": 0.045604802157206845,
      "acc_norm": 0.29,
      "acc_norm_stderr": 0.045604802157206845
    },
    "hendrycksTest-high_school_biology": {
      "acc": 0.5,
      "acc_stderr": 0.028444006199428714,
      "acc_norm": 0.5,
      "acc_norm_stderr": 0.028444006199428714
    },
    "hendrycksTest-high_school_chemistry": {
      "acc": 0.31527093596059114,
      "acc_stderr": 0.03269080871970186,
      "acc_norm": 0.31527093596059114,
      "acc_norm_stderr": 0.03269080871970186
    },
    "hendrycksTest-high_school_computer_science": {
      "acc": 0.44,
      "acc_stderr": 0.04988876515698589,
      "acc_norm": 0.44,
      "acc_norm_stderr": 0.04988876515698589
    },
    "hendrycksTest-high_school_european_history": {
      "acc": 0.593939393939394,
      "acc_stderr": 0.03834816355401181,
      "acc_norm": 0.593939393939394,
      "acc_norm_stderr": 0.03834816355401181
    },
    "hendrycksTest-high_school_geography": {
      "acc": 0.48484848484848486,
      "acc_stderr": 0.03560716516531061,
      "acc_norm": 0.48484848484848486,
      "acc_norm_stderr": 0.03560716516531061
    },
    "hendrycksTest-high_school_government_and_politics": {
      "acc": 0.6269430051813472,
      "acc_stderr": 0.03490205592048573,
      "acc_norm": 0.6269430051813472,
      "acc_norm_stderr": 0.03490205592048573
    },
    "hendrycksTest-high_school_macroeconomics": {
      "acc": 0.3871794871794872,
      "acc_stderr": 0.02469721693087894,
      "acc_norm": 0.3871794871794872,
      "acc_norm_stderr": 0.02469721693087894
    },
    "hendrycksTest-high_school_mathematics": {
      "acc": 0.2851851851851852,
      "acc_stderr": 0.027528599210340496,
      "acc_norm": 0.2851851851851852,
      "acc_norm_stderr": 0.027528599210340496
    },
    "hendrycksTest-high_school_microeconomics": {
      "acc": 0.42436974789915966,
      "acc_stderr": 0.032104790510157764,
      "acc_norm": 0.42436974789915966,
      "acc_norm_stderr": 0.032104790510157764
    },
    "hendrycksTest-high_school_physics": {
      "acc": 0.31788079470198677,
      "acc_stderr": 0.03802039760107903,
      "acc_norm": 0.31788079470198677,
      "acc_norm_stderr": 0.03802039760107903
    },
    "hendrycksTest-high_school_psychology": {
      "acc": 0.5743119266055046,
      "acc_stderr": 0.0211992359724708,
      "acc_norm": 0.5743119266055046,
      "acc_norm_stderr": 0.0211992359724708
    },
    "hendrycksTest-high_school_statistics": {
      "acc": 0.35648148148148145,
      "acc_stderr": 0.03266478331527272,
      "acc_norm": 0.35648148148148145,
      "acc_norm_stderr": 0.03266478331527272
    },
    "hendrycksTest-high_school_us_history": {
      "acc": 0.6029411764705882,
      "acc_stderr": 0.0343413116471913,
      "acc_norm": 0.6029411764705882,
      "acc_norm_stderr": 0.0343413116471913
    },
    "hendrycksTest-high_school_world_history": {
      "acc": 0.6666666666666666,
      "acc_stderr": 0.0306858205966108,
      "acc_norm": 0.6666666666666666,
      "acc_norm_stderr": 0.0306858205966108
    },
    "hendrycksTest-human_aging": {
      "acc": 0.5426008968609866,
      "acc_stderr": 0.033435777055830646,
      "acc_norm": 0.5426008968609866,
      "acc_norm_stderr": 0.033435777055830646
    },
    "hendrycksTest-human_sexuality": {
      "acc": 0.4732824427480916,
      "acc_stderr": 0.04379024936553894,
      "acc_norm": 0.4732824427480916,
      "acc_norm_stderr": 0.04379024936553894
    },
    "hendrycksTest-international_law": {
      "acc": 0.6776859504132231,
      "acc_stderr": 0.04266416363352167,
      "acc_norm": 0.6776859504132231,
      "acc_norm_stderr": 0.04266416363352167
    },
    "hendrycksTest-jurisprudence": {
      "acc": 0.5370370370370371,
      "acc_stderr": 0.04820403072760627,
      "acc_norm": 0.5370370370370371,
      "acc_norm_stderr": 0.04820403072760627
    },
    "hendrycksTest-logical_fallacies": {
      "acc": 0.4601226993865031,
      "acc_stderr": 0.03915857291436971,
      "acc_norm": 0.4601226993865031,
      "acc_norm_stderr": 0.03915857291436971
    },
    "hendrycksTest-machine_learning": {
      "acc": 0.35714285714285715,
      "acc_stderr": 0.04547960999764376,
      "acc_norm": 0.35714285714285715,
      "acc_norm_stderr": 0.04547960999764376
    },
    "hendrycksTest-management": {
      "acc": 0.5825242718446602,
      "acc_stderr": 0.048828405482122375,
      "acc_norm": 0.5825242718446602,
      "acc_norm_stderr": 0.048828405482122375
    },
    "hendrycksTest-marketing": {
      "acc": 0.6367521367521367,
      "acc_stderr": 0.03150712523091264,
      "acc_norm": 0.6367521367521367,
      "acc_norm_stderr": 0.03150712523091264
    },
    "hendrycksTest-medical_genetics": {
      "acc": 0.43,
      "acc_stderr": 0.04975698519562428,
      "acc_norm": 0.43,
      "acc_norm_stderr": 0.04975698519562428
    },
    "hendrycksTest-miscellaneous": {
      "acc": 0.6245210727969349,
      "acc_stderr": 0.017316613197182786,
      "acc_norm": 0.6245210727969349,
      "acc_norm_stderr": 0.017316613197182786
    },
    "hendrycksTest-moral_disputes": {
      "acc": 0.5086705202312138,
      "acc_stderr": 0.026915047355369818,
      "acc_norm": 0.5086705202312138,
      "acc_norm_stderr": 0.026915047355369818
    },
    "hendrycksTest-moral_scenarios": {
      "acc": 0.24134078212290502,
      "acc_stderr": 0.014310999547961447,
      "acc_norm": 0.24134078212290502,
      "acc_norm_stderr": 0.014310999547961447
    },
    "hendrycksTest-nutrition": {
      "acc": 0.4869281045751634,
      "acc_stderr": 0.028620130800700246,
      "acc_norm": 0.4869281045751634,
      "acc_norm_stderr": 0.028620130800700246
    },
    "hendrycksTest-philosophy": {
      "acc": 0.5466237942122186,
      "acc_stderr": 0.028274359854894238,
      "acc_norm": 0.5466237942122186,
      "acc_norm_stderr": 0.028274359854894238
    },
    "hendrycksTest-prehistory": {
      "acc": 0.5061728395061729,
      "acc_stderr": 0.027818623962583295,
      "acc_norm": 0.5061728395061729,
      "acc_norm_stderr": 0.027818623962583295
    },
    "hendrycksTest-professional_accounting": {
      "acc": 0.3617021276595745,
      "acc_stderr": 0.0286638201471995,
      "acc_norm": 0.3617021276595745,
      "acc_norm_stderr": 0.0286638201471995
    },
    "hendrycksTest-professional_law": {
      "acc": 0.3741851368970013,
      "acc_stderr": 0.012359335618172061,
      "acc_norm": 0.3741851368970013,
      "acc_norm_stderr": 0.012359335618172061
    },
    "hendrycksTest-professional_medicine": {
      "acc": 0.45588235294117646,
      "acc_stderr": 0.030254372573976687,
      "acc_norm": 0.45588235294117646,
      "acc_norm_stderr": 0.030254372573976687
    },
    "hendrycksTest-professional_psychology": {
      "acc": 0.43300653594771243,
      "acc_stderr": 0.020045442473324227,
      "acc_norm": 0.43300653594771243,
      "acc_norm_stderr": 0.020045442473324227
    },
    "hendrycksTest-public_relations": {
      "acc": 0.4818181818181818,
      "acc_stderr": 0.04785964010794916,
      "acc_norm": 0.4818181818181818,
      "acc_norm_stderr": 0.04785964010794916
    },
    "hendrycksTest-security_studies": {
      "acc": 0.5020408163265306,
      "acc_stderr": 0.0320089533497105,
      "acc_norm": 0.5020408163265306,
      "acc_norm_stderr": 0.0320089533497105
    },
    "hendrycksTest-sociology": {
      "acc": 0.5671641791044776,
      "acc_stderr": 0.03503490923673281,
      "acc_norm": 0.5671641791044776,
      "acc_norm_stderr": 0.03503490923673281
    },
    "hendrycksTest-us_foreign_policy": {
      "acc": 0.7,
      "acc_stderr": 0.046056618647183814,
      "acc_norm": 0.7,
      "acc_norm_stderr": 0.046056618647183814
    },
    "hendrycksTest-virology": {
      "acc": 0.3795180722891566,
      "acc_stderr": 0.03777798822748018,
      "acc_norm": 0.3795180722891566,
      "acc_norm_stderr": 0.03777798822748018
    },
    "hendrycksTest-world_religions": {
      "acc": 0.6783625730994152,
      "acc_stderr": 0.03582529442573122,
      "acc_norm": 0.6783625730994152,
      "acc_norm_stderr": 0.03582529442573122
    }
  },
  "versions": {
    "hendrycksTest-abstract_algebra": 1,
    "hendrycksTest-anatomy": 1,
    "hendrycksTest-astronomy": 1,
    "hendrycksTest-business_ethics": 1,
    "hendrycksTest-clinical_knowledge": 1,
    "hendrycksTest-college_biology": 1,
    "hendrycksTest-college_chemistry": 1,
    "hendrycksTest-college_computer_science": 1,
    "hendrycksTest-college_mathematics": 1,
    "hendrycksTest-college_medicine": 1,
    "hendrycksTest-college_physics": 1,
    "hendrycksTest-computer_security": 1,
    "hendrycksTest-conceptual_physics": 1,
    "hendrycksTest-econometrics": 1,
    "hendrycksTest-electrical_engineering": 1,
    "hendrycksTest-elementary_mathematics": 1,
    "hendrycksTest-formal_logic": 1,
    "hendrycksTest-global_facts": 1,
    "hendrycksTest-high_school_biology": 1,
    "hendrycksTest-high_school_chemistry": 1,
    "hendrycksTest-high_school_computer_science": 1,
    "hendrycksTest-high_school_european_history": 1,
    "hendrycksTest-high_school_geography": 1,
    "hendrycksTest-high_school_government_and_politics": 1,
    "hendrycksTest-high_school_macroeconomics": 1,
    "hendrycksTest-high_school_mathematics": 1,
    "hendrycksTest-high_school_microeconomics": 1,
    "hendrycksTest-high_school_physics": 1,
    "hendrycksTest-high_school_psychology": 1,
    "hendrycksTest-high_school_statistics": 1,
    "hendrycksTest-high_school_us_history": 1,
    "hendrycksTest-high_school_world_history": 1,
    "hendrycksTest-human_aging": 1,
    "hendrycksTest-human_sexuality": 1,
    "hendrycksTest-international_law": 1,
    "hendrycksTest-jurisprudence": 1,
    "hendrycksTest-logical_fallacies": 1,
    "hendrycksTest-machine_learning": 1,
    "hendrycksTest-management": 1,
    "hendrycksTest-marketing": 1,
    "hendrycksTest-medical_genetics": 1,
    "hendrycksTest-miscellaneous": 1,
    "hendrycksTest-moral_disputes": 1,
    "hendrycksTest-moral_scenarios": 1,
    "hendrycksTest-nutrition": 1,
    "hendrycksTest-philosophy": 1,
    "hendrycksTest-prehistory": 1,
    "hendrycksTest-professional_accounting": 1,
    "hendrycksTest-professional_law": 1,
    "hendrycksTest-professional_medicine": 1,
    "hendrycksTest-professional_psychology": 1,
    "hendrycksTest-public_relations": 1,
    "hendrycksTest-security_studies": 1,
    "hendrycksTest-sociology": 1,
    "hendrycksTest-us_foreign_policy": 1,
    "hendrycksTest-virology": 1,
    "hendrycksTest-world_religions": 1
  },
  "config": {
    "model": "sparseml",
    "model_args": "pretrained=/nm/drive1/shubhra/cerebras/experiments/spft-retrained_sparse50_llama2_DATAUpdated_KDFalse_GCTrue_LR1e-4_E2-/combined/,trust_remote_code=True",
    "num_fewshot": 5,
    "batch_size": "6",
    "batch_sizes": [],
    "device": "cuda:0",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
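The file reports only per-task scores; a single MMLU number has to be computed from the `results` block. Below is a minimal sketch of that aggregation, assuming the JSON above is saved as `results.json` (a hypothetical filename, not part of the original output). Since every subtask here has `acc == acc_norm`, the same loop works for either metric; a sample-weighted average would additionally need per-task question counts, which this file does not record.

```python
# Minimal sketch: load the lm-evaluation-harness results JSON shown above
# and report the unweighted mean accuracy across its hendrycksTest subtasks.
# The filename "results.json" is an assumption for illustration.
import json

with open("results.json") as f:
    data = json.load(f)

# One "acc" entry per subtask (57 MMLU categories in this file).
accs = [task["acc"] for task in data["results"].values()]
print(f"Average acc over {len(accs)} tasks: {sum(accs) / len(accs):.4f}")
```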