{ "results": { "arc_challenge": { "acc,none": 0.5614334470989761, "acc_stderr,none": 0.014500682618212865, "acc_norm,none": 0.613481228668942, "acc_norm_stderr,none": 0.014230084761910473, "alias": "arc_challenge" } }, "configs": { "arc_challenge": { "task": "arc_challenge", "group": [ "ai2_arc" ], "dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/ai2_arc", "dataset_name": "ARC-Challenge", "training_split": "train", "validation_split": "validation", "test_split": "test", "doc_to_text": "Question: {{question}}\nAnswer:", "doc_to_target": "{{choices.label.index(answerKey)}}", "doc_to_choice": "{{choices.text}}", "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "num_fewshot": 25, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true }, { "metric": "acc_norm", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": true, "doc_to_decontamination_query": "Question: {{question}}\nAnswer:", "metadata": { "version": 1.0 } } }, "versions": { "arc_challenge": 1.0 }, "n-shot": { "arc_challenge": 25 }, "config": { "model": "vllm", "model_args": "pretrained=/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/Oasis,tensor_parallel_size=1,dtype=auto,gpu_memory_utilization=0.9,data_parallel_size=1,max_model_len=4096", "batch_size": "auto:128", "batch_sizes": [], "device": "cuda", "use_cache": "/lustre07/scratch/gagan30/arocr/cache/", "limit": null, "bootstrap_iters": 100000, "gen_kwargs": null }, "git_hash": null }