pminervini committed
Commit d01d881 · 1 parent: e1f29ca
app.py CHANGED
@@ -28,8 +28,8 @@ from src.display.utils import (
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC, QUEUE_REPO, REPO_ID, RESULTS_REPO
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
-from src.submission.check_validity import already_submitted_models
-from src.tools.collections import update_collections
+# from src.submission.check_validity import already_submitted_models
+# from src.tools.collections import update_collections
 from src.tools.plots import (
     create_metric_plot_obj,
     create_plot_df,
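Note: the two imports above are only commented out, not deleted. A hedged alternative sketch (not part of this commit, and assuming the repo's own modules stay in place) would be to make them optional, so app.py still starts even if those modules are later removed:

```python
# Sketch only: optional imports instead of commented-out ones.
try:
    from src.submission.check_validity import already_submitted_models
    from src.tools.collections import update_collections
except ImportError:
    # Fall back gracefully if these helpers are removed from the repo.
    already_submitted_models = None
    update_collections = None
```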
beta-cli.py CHANGED
@@ -10,7 +10,7 @@ snapshot_download(repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="
 raw_data = get_raw_eval_results(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH)
 
 for entry in raw_data:
-    if '125' in entry.eval_name:
-        print(entry)
+    # if '125m' in entry.eval_name:
+    print(entry)
 
-# print(raw_data)
+# print(raw_data)
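The CLI now prints every loaded entry instead of only those whose eval_name contains '125'. A hypothetical helper (not part of this commit; `filter_results` is an illustrative name) shows how the old filtering behaviour could be kept available without hard-coding it in the loop:

```python
# Sketch only: filter loaded eval results by a substring of their name.
def filter_results(raw_data, needle: str):
    """Return only the entries whose eval_name contains `needle`."""
    return [entry for entry in raw_data if needle in entry.eval_name]

# e.g. inspect only the 125m runs instead of printing everything:
# for entry in filter_results(raw_data, "125m"):
#     print(entry)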
src/display/utils.py CHANGED
@@ -18,9 +18,10 @@ class Tasks(Enum):
     hellaswag = Task("hellaswag", "acc_norm", "HellaSwag")
     mmlu = Task("hendrycksTest", "acc", "MMLU")
     truthfulqa = Task("truthfulqa:mc", "mc2", "TruthfulQA")
-    winogrande = Task("winogrande", "acc", "Winogrande")
-    gsm8k = Task("gsm8k", "acc", "GSM8K")
+    # winogrande = Task("winogrande", "acc", "Winogrande")
+    # gsm8k = Task("gsm8k", "acc", "GSM8K")
     drop = Task("drop", "f1", "DROP")
+    nqopen = Task("nq_open", "em", "NQ Open")
 
     # These classes are for user facing column names,
     # to avoid having to change them all around the code
@@ -77,8 +78,8 @@ baseline_row = {
     AutoEvalColumn.hellaswag.name: 25.0,
     AutoEvalColumn.mmlu.name: 25.0,
     AutoEvalColumn.truthfulqa.name: 25.0,
-    AutoEvalColumn.winogrande.name: 50.0,
-    AutoEvalColumn.gsm8k.name: 0.21,
+    # AutoEvalColumn.winogrande.name: 50.0,
+    # AutoEvalColumn.gsm8k.name: 0.21,
     AutoEvalColumn.drop.name: 0.47,
     AutoEvalColumn.dummy.name: "baseline",
     AutoEvalColumn.model_type.name: "",
@@ -102,8 +103,8 @@ human_baseline_row = {
     AutoEvalColumn.hellaswag.name: 95.0,
     AutoEvalColumn.mmlu.name: 89.8,
     AutoEvalColumn.truthfulqa.name: 94.0,
-    AutoEvalColumn.winogrande.name: 94.0,
-    AutoEvalColumn.gsm8k.name: 100,
+    # AutoEvalColumn.winogrande.name: 94.0,
+    # AutoEvalColumn.gsm8k.name: 100,
     AutoEvalColumn.drop.name: 96.42,
     AutoEvalColumn.dummy.name: "human_baseline",
     AutoEvalColumn.model_type.name: "",
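The Tasks enum is what drives the leaderboard columns, so swapping Winogrande/GSM8K for NQ Open here changes both the displayed columns and any score aggregated over tasks. A minimal, self-contained sketch (the Task/Tasks definitions below are simplified stand-ins for the ones in src/display/utils.py, and average_score is an illustrative helper, not code from the repo):

```python
from enum import Enum
from typing import NamedTuple

class Task(NamedTuple):
    benchmark: str   # key used in the results files, e.g. "nq_open"
    metric: str      # metric read for that benchmark, e.g. "em"
    col_name: str    # column name shown in the leaderboard, e.g. "NQ Open"

class Tasks(Enum):
    drop = Task("drop", "f1", "DROP")
    nqopen = Task("nq_open", "em", "NQ Open")

def average_score(results: dict[str, float]) -> float:
    # Every enum member contributes to the average, so commenting a Task out
    # (as done for winogrande/gsm8k above) removes it from the table and the mean.
    return sum(results[t.value.benchmark] for t in Tasks) / len(Tasks)

print(average_score({"drop": 47.0, "nq_open": 20.0}))  # 33.5
```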
src/leaderboard/read_evals.py CHANGED
@@ -96,6 +96,10 @@ class EvalResult:
             mean_acc = np.mean(accs) * 100.0
             results[task.benchmark] = mean_acc
 
+        # XXX: backfill a default NQ Open score so result files written before
+        # the nq_open task was added still load without a missing-key error
+        if 'nq_open' not in results:
+            results['nq_open'] = 0.0
+
         return self(
             eval_name=result_key,
             full_model=full_model,
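The added guard simply defaults missing NQ Open entries to 0.0. An illustrative check (not from the commit) of the same behaviour, expressed with dict.setdefault:

```python
results = {"drop": 47.0}            # an older result file with no NQ Open entry
results.setdefault("nq_open", 0.0)  # equivalent to the `if 'nq_open' not in results` guard
assert results["nq_open"] == 0.0    # missing score backfilled with the default
assert results["drop"] == 47.0      # existing scores are left untouched
```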