{
  "results": {
    "hendrycksTest-abstract_algebra": {
      "acc": 0.31,
      "acc_stderr": 0.04648231987117316,
      "acc_norm": 0.31,
      "acc_norm_stderr": 0.04648231987117316
    },
    "hendrycksTest-anatomy": {
      "acc": 0.43703703703703706,
      "acc_stderr": 0.04284958639753399,
      "acc_norm": 0.43703703703703706,
      "acc_norm_stderr": 0.04284958639753399
    },
    "hendrycksTest-astronomy": {
      "acc": 0.40789473684210525,
      "acc_stderr": 0.03999309712777471,
      "acc_norm": 0.40789473684210525,
      "acc_norm_stderr": 0.03999309712777471
    },
    "hendrycksTest-business_ethics": {
      "acc": 0.46,
      "acc_stderr": 0.05009082659620332,
      "acc_norm": 0.46,
      "acc_norm_stderr": 0.05009082659620332
    },
    "hendrycksTest-clinical_knowledge": {
      "acc": 0.4830188679245283,
      "acc_stderr": 0.030755120364119905,
      "acc_norm": 0.4830188679245283,
      "acc_norm_stderr": 0.030755120364119905
    },
    "hendrycksTest-college_biology": {
      "acc": 0.4652777777777778,
      "acc_stderr": 0.04171115858181618,
      "acc_norm": 0.4652777777777778,
      "acc_norm_stderr": 0.04171115858181618
    },
    "hendrycksTest-college_chemistry": {
      "acc": 0.27,
      "acc_stderr": 0.044619604333847394,
      "acc_norm": 0.27,
      "acc_norm_stderr": 0.044619604333847394
    },
    "hendrycksTest-college_computer_science": {
      "acc": 0.39,
      "acc_stderr": 0.04902071300001975,
      "acc_norm": 0.39,
      "acc_norm_stderr": 0.04902071300001975
    },
    "hendrycksTest-college_mathematics": {
      "acc": 0.29,
      "acc_stderr": 0.045604802157206845,
      "acc_norm": 0.29,
      "acc_norm_stderr": 0.045604802157206845
    },
    "hendrycksTest-college_medicine": {
      "acc": 0.36416184971098264,
      "acc_stderr": 0.03669072477416907,
      "acc_norm": 0.36416184971098264,
      "acc_norm_stderr": 0.03669072477416907
    },
    "hendrycksTest-college_physics": {
      "acc": 0.2549019607843137,
      "acc_stderr": 0.043364327079931785,
      "acc_norm": 0.2549019607843137,
      "acc_norm_stderr": 0.043364327079931785
    },
    "hendrycksTest-computer_security": {
      "acc": 0.55,
      "acc_stderr": 0.05,
      "acc_norm": 0.55,
      "acc_norm_stderr": 0.05
    },
    "hendrycksTest-conceptual_physics": {
      "acc": 0.4085106382978723,
      "acc_stderr": 0.03213418026701576,
      "acc_norm": 0.4085106382978723,
      "acc_norm_stderr": 0.03213418026701576
    },
    "hendrycksTest-econometrics": {
      "acc": 0.2894736842105263,
      "acc_stderr": 0.04266339443159394,
      "acc_norm": 0.2894736842105263,
      "acc_norm_stderr": 0.04266339443159394
    },
    "hendrycksTest-electrical_engineering": {
      "acc": 0.4206896551724138,
      "acc_stderr": 0.0411391498118926,
      "acc_norm": 0.4206896551724138,
      "acc_norm_stderr": 0.0411391498118926
    },
    "hendrycksTest-elementary_mathematics": {
      "acc": 0.291005291005291,
      "acc_stderr": 0.02339382650048487,
      "acc_norm": 0.291005291005291,
      "acc_norm_stderr": 0.02339382650048487
    },
    "hendrycksTest-formal_logic": {
      "acc": 0.23015873015873015,
      "acc_stderr": 0.037649508797906045,
      "acc_norm": 0.23015873015873015,
      "acc_norm_stderr": 0.037649508797906045
    },
    "hendrycksTest-global_facts": {
      "acc": 0.34,
      "acc_stderr": 0.04760952285695236,
      "acc_norm": 0.34,
      "acc_norm_stderr": 0.04760952285695236
    },
    "hendrycksTest-high_school_biology": {
      "acc": 0.4870967741935484,
      "acc_stderr": 0.028434533152681855,
      "acc_norm": 0.4870967741935484,
      "acc_norm_stderr": 0.028434533152681855
    },
    "hendrycksTest-high_school_chemistry": {
      "acc": 0.33497536945812806,
      "acc_stderr": 0.033208527423483104,
      "acc_norm": 0.33497536945812806,
      "acc_norm_stderr": 0.033208527423483104
    },
    "hendrycksTest-high_school_computer_science": {
      "acc": 0.44,
      "acc_stderr": 0.04988876515698589,
      "acc_norm": 0.44,
      "acc_norm_stderr": 0.04988876515698589
    },
    "hendrycksTest-high_school_european_history": {
      "acc": 0.6363636363636364,
      "acc_stderr": 0.03756335775187897,
      "acc_norm": 0.6363636363636364,
      "acc_norm_stderr": 0.03756335775187897
    },
    "hendrycksTest-high_school_geography": {
      "acc": 0.5202020202020202,
      "acc_stderr": 0.03559443565563918,
      "acc_norm": 0.5202020202020202,
      "acc_norm_stderr": 0.03559443565563918
    },
    "hendrycksTest-high_school_government_and_politics": {
      "acc": 0.6424870466321243,
      "acc_stderr": 0.034588160421810114,
      "acc_norm": 0.6424870466321243,
      "acc_norm_stderr": 0.034588160421810114
    },
    "hendrycksTest-high_school_macroeconomics": {
      "acc": 0.4025641025641026,
      "acc_stderr": 0.02486499515976775,
      "acc_norm": 0.4025641025641026,
      "acc_norm_stderr": 0.02486499515976775
    },
    "hendrycksTest-high_school_mathematics": {
      "acc": 0.25925925925925924,
      "acc_stderr": 0.026719240783712177,
      "acc_norm": 0.25925925925925924,
      "acc_norm_stderr": 0.026719240783712177
    },
    "hendrycksTest-high_school_microeconomics": {
      "acc": 0.4411764705882353,
      "acc_stderr": 0.0322529423239964,
      "acc_norm": 0.4411764705882353,
      "acc_norm_stderr": 0.0322529423239964
    },
    "hendrycksTest-high_school_physics": {
      "acc": 0.23841059602649006,
      "acc_stderr": 0.03479185572599661,
      "acc_norm": 0.23841059602649006,
      "acc_norm_stderr": 0.03479185572599661
    },
    "hendrycksTest-high_school_psychology": {
      "acc": 0.6165137614678899,
      "acc_stderr": 0.020847156641915984,
      "acc_norm": 0.6165137614678899,
      "acc_norm_stderr": 0.020847156641915984
    },
    "hendrycksTest-high_school_statistics": {
      "acc": 0.27314814814814814,
      "acc_stderr": 0.03038805130167812,
      "acc_norm": 0.27314814814814814,
      "acc_norm_stderr": 0.03038805130167812
    },
    "hendrycksTest-high_school_us_history": {
      "acc": 0.6470588235294118,
      "acc_stderr": 0.03354092437591518,
      "acc_norm": 0.6470588235294118,
      "acc_norm_stderr": 0.03354092437591518
    },
    "hendrycksTest-high_school_world_history": {
      "acc": 0.6455696202531646,
      "acc_stderr": 0.031137304297185805,
      "acc_norm": 0.6455696202531646,
      "acc_norm_stderr": 0.031137304297185805
    },
    "hendrycksTest-human_aging": {
      "acc": 0.547085201793722,
      "acc_stderr": 0.03340867501923324,
      "acc_norm": 0.547085201793722,
      "acc_norm_stderr": 0.03340867501923324
    },
    "hendrycksTest-human_sexuality": {
      "acc": 0.549618320610687,
      "acc_stderr": 0.04363643698524779,
      "acc_norm": 0.549618320610687,
      "acc_norm_stderr": 0.04363643698524779
    },
    "hendrycksTest-international_law": {
      "acc": 0.628099173553719,
      "acc_stderr": 0.04412015806624504,
      "acc_norm": 0.628099173553719,
      "acc_norm_stderr": 0.04412015806624504
    },
    "hendrycksTest-jurisprudence": {
      "acc": 0.49074074074074076,
      "acc_stderr": 0.04832853553437055,
      "acc_norm": 0.49074074074074076,
      "acc_norm_stderr": 0.04832853553437055
    },
    "hendrycksTest-logical_fallacies": {
      "acc": 0.49079754601226994,
      "acc_stderr": 0.03927705600787443,
      "acc_norm": 0.49079754601226994,
      "acc_norm_stderr": 0.03927705600787443
    },
    "hendrycksTest-machine_learning": {
      "acc": 0.42857142857142855,
      "acc_stderr": 0.04697113923010212,
      "acc_norm": 0.42857142857142855,
      "acc_norm_stderr": 0.04697113923010212
    },
    "hendrycksTest-management": {
      "acc": 0.5339805825242718,
      "acc_stderr": 0.0493929144727348,
      "acc_norm": 0.5339805825242718,
      "acc_norm_stderr": 0.0493929144727348
    },
    "hendrycksTest-marketing": {
      "acc": 0.7136752136752137,
      "acc_stderr": 0.02961432369045665,
      "acc_norm": 0.7136752136752137,
      "acc_norm_stderr": 0.02961432369045665
    },
    "hendrycksTest-medical_genetics": {
      "acc": 0.55,
      "acc_stderr": 0.04999999999999999,
      "acc_norm": 0.55,
      "acc_norm_stderr": 0.04999999999999999
    },
    "hendrycksTest-miscellaneous": {
      "acc": 0.6219667943805874,
      "acc_stderr": 0.017339844462104594,
      "acc_norm": 0.6219667943805874,
      "acc_norm_stderr": 0.017339844462104594
    },
    "hendrycksTest-moral_disputes": {
      "acc": 0.5317919075144508,
      "acc_stderr": 0.02686462436675664,
      "acc_norm": 0.5317919075144508,
      "acc_norm_stderr": 0.02686462436675664
    },
    "hendrycksTest-moral_scenarios": {
      "acc": 0.2424581005586592,
      "acc_stderr": 0.014333522059217892,
      "acc_norm": 0.2424581005586592,
      "acc_norm_stderr": 0.014333522059217892
    },
    "hendrycksTest-nutrition": {
      "acc": 0.477124183006536,
      "acc_stderr": 0.028599936776089782,
      "acc_norm": 0.477124183006536,
      "acc_norm_stderr": 0.028599936776089782
    },
    "hendrycksTest-philosophy": {
      "acc": 0.5530546623794212,
      "acc_stderr": 0.028237769422085328,
      "acc_norm": 0.5530546623794212,
      "acc_norm_stderr": 0.028237769422085328
    },
    "hendrycksTest-prehistory": {
      "acc": 0.5308641975308642,
      "acc_stderr": 0.027767689606833925,
      "acc_norm": 0.5308641975308642,
      "acc_norm_stderr": 0.027767689606833925
    },
    "hendrycksTest-professional_accounting": {
      "acc": 0.375886524822695,
      "acc_stderr": 0.028893955412115882,
      "acc_norm": 0.375886524822695,
      "acc_norm_stderr": 0.028893955412115882
    },
    "hendrycksTest-professional_law": {
      "acc": 0.35723598435462844,
      "acc_stderr": 0.012238615750316505,
      "acc_norm": 0.35723598435462844,
      "acc_norm_stderr": 0.012238615750316505
    },
    "hendrycksTest-professional_medicine": {
      "acc": 0.4227941176470588,
      "acc_stderr": 0.03000856284500349,
      "acc_norm": 0.4227941176470588,
      "acc_norm_stderr": 0.03000856284500349
    },
    "hendrycksTest-professional_psychology": {
      "acc": 0.4591503267973856,
      "acc_stderr": 0.020160213617222516,
      "acc_norm": 0.4591503267973856,
      "acc_norm_stderr": 0.020160213617222516
    },
    "hendrycksTest-public_relations": {
      "acc": 0.5454545454545454,
      "acc_stderr": 0.04769300568972744,
      "acc_norm": 0.5454545454545454,
      "acc_norm_stderr": 0.04769300568972744
    },
    "hendrycksTest-security_studies": {
      "acc": 0.49795918367346936,
      "acc_stderr": 0.0320089533497105,
      "acc_norm": 0.49795918367346936,
      "acc_norm_stderr": 0.0320089533497105
    },
    "hendrycksTest-sociology": {
      "acc": 0.6318407960199005,
      "acc_stderr": 0.03410410565495301,
      "acc_norm": 0.6318407960199005,
      "acc_norm_stderr": 0.03410410565495301
    },
    "hendrycksTest-us_foreign_policy": {
      "acc": 0.67,
      "acc_stderr": 0.04725815626252609,
      "acc_norm": 0.67,
      "acc_norm_stderr": 0.04725815626252609
    },
    "hendrycksTest-virology": {
      "acc": 0.41566265060240964,
      "acc_stderr": 0.03836722176598053,
      "acc_norm": 0.41566265060240964,
      "acc_norm_stderr": 0.03836722176598053
    },
    "hendrycksTest-world_religions": {
      "acc": 0.6549707602339181,
      "acc_stderr": 0.036459813773888065,
      "acc_norm": 0.6549707602339181,
      "acc_norm_stderr": 0.036459813773888065
    }
  },
  "versions": {
    "hendrycksTest-abstract_algebra": 1,
    "hendrycksTest-anatomy": 1,
    "hendrycksTest-astronomy": 1,
    "hendrycksTest-business_ethics": 1,
    "hendrycksTest-clinical_knowledge": 1,
    "hendrycksTest-college_biology": 1,
    "hendrycksTest-college_chemistry": 1,
    "hendrycksTest-college_computer_science": 1,
    "hendrycksTest-college_mathematics": 1,
    "hendrycksTest-college_medicine": 1,
    "hendrycksTest-college_physics": 1,
    "hendrycksTest-computer_security": 1,
    "hendrycksTest-conceptual_physics": 1,
    "hendrycksTest-econometrics": 1,
    "hendrycksTest-electrical_engineering": 1,
    "hendrycksTest-elementary_mathematics": 1,
    "hendrycksTest-formal_logic": 1,
    "hendrycksTest-global_facts": 1,
    "hendrycksTest-high_school_biology": 1,
    "hendrycksTest-high_school_chemistry": 1,
    "hendrycksTest-high_school_computer_science": 1,
    "hendrycksTest-high_school_european_history": 1,
    "hendrycksTest-high_school_geography": 1,
    "hendrycksTest-high_school_government_and_politics": 1,
    "hendrycksTest-high_school_macroeconomics": 1,
    "hendrycksTest-high_school_mathematics": 1,
    "hendrycksTest-high_school_microeconomics": 1,
    "hendrycksTest-high_school_physics": 1,
    "hendrycksTest-high_school_psychology": 1,
    "hendrycksTest-high_school_statistics": 1,
    "hendrycksTest-high_school_us_history": 1,
    "hendrycksTest-high_school_world_history": 1,
    "hendrycksTest-human_aging": 1,
    "hendrycksTest-human_sexuality": 1,
    "hendrycksTest-international_law": 1,
    "hendrycksTest-jurisprudence": 1,
    "hendrycksTest-logical_fallacies": 1,
    "hendrycksTest-machine_learning": 1,
    "hendrycksTest-management": 1,
    "hendrycksTest-marketing": 1,
    "hendrycksTest-medical_genetics": 1,
    "hendrycksTest-miscellaneous": 1,
    "hendrycksTest-moral_disputes": 1,
    "hendrycksTest-moral_scenarios": 1,
    "hendrycksTest-nutrition": 1,
    "hendrycksTest-philosophy": 1,
    "hendrycksTest-prehistory": 1,
    "hendrycksTest-professional_accounting": 1,
    "hendrycksTest-professional_law": 1,
    "hendrycksTest-professional_medicine": 1,
    "hendrycksTest-professional_psychology": 1,
    "hendrycksTest-public_relations": 1,
    "hendrycksTest-security_studies": 1,
    "hendrycksTest-sociology": 1,
    "hendrycksTest-us_foreign_policy": 1,
    "hendrycksTest-virology": 1,
    "hendrycksTest-world_religions": 1
  },
  "config": {
    "model": "hf-causal-experimental",
    "model_args": {
      "pretrained": "/network/alexandre/research/llama2_7b_ultrachat/dense/dense_finetuning/dense_LR1e-4_E1/training",
      "trust_remote_code": true
    },
    "num_fewshot": 5,
    "batch_size": "4",
    "batch_sizes": [],
    "device": "cuda:5",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}