hieunguyen1053 committed
Commit f8b127b • 1 Parent(s): 2e61d1f

Files changed (3):
  1. app.py +15 -1
  2. src/load_from_hub.py +10 -5
  3. src/tasks.py +23 -7
app.py CHANGED
@@ -17,7 +17,6 @@ fs = HfFileSystem(token=HIDDEN_TOKEN)
 def restart_space():
     api.restart_space(repo_id="vlsp-2023-vllm/VLLMs-Leaderboard", token=HIDDEN_TOKEN)
 
-leaderboard_df = load_from_hub(fs, RESULTS_REPO)
 
 demo = gr.Blocks()
 
@@ -27,6 +26,21 @@ with demo:
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("Public test", elem_id="llm-benchmark-tab-table", id=0):
+            leaderboard_df = load_from_hub(fs, RESULTS_REPO, is_private=False)
+            with gr.Column():
+                with gr.Row():
+                    search_bar = gr.Textbox(
+                        placeholder=" 🔍 Search for your model and press ENTER...",
+                        show_label=False,
+                        elem_id="search-bar",
+                    )
+                with gr.Row():
+                    leaderboard_table = gr.components.Dataframe(
+                        value=leaderboard_df,
+                    )
+
+        with gr.TabItem("Private test", elem_id="llm-benchmark-tab-table", id=1):
+            leaderboard_df = load_from_hub(fs, RESULTS_REPO, is_private=True)
             with gr.Column():
                 with gr.Row():
                     search_bar = gr.Textbox(
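
Taken together, the change moves the single module-level load_from_hub call into the tab blocks, so each tab loads its own dataframe with the new is_private flag. A simplified sketch of the resulting structure (illustrative only, not verbatim app.py; it assumes fs, RESULTS_REPO, and load_from_hub are already defined as in the file):

import gradio as gr

# Sketch: one leaderboard dataframe per tab, loaded with the new is_private flag.
with gr.Blocks() as demo:
    with gr.Tabs(elem_classes="tab-buttons"):
        for label, tab_id, is_private in [("Public test", 0, False), ("Private test", 1, True)]:
            with gr.TabItem(label, elem_id="llm-benchmark-tab-table", id=tab_id):
                df = load_from_hub(fs, RESULTS_REPO, is_private=is_private)
                search_bar = gr.Textbox(placeholder=" 🔍 Search for your model and press ENTER...",
                                        show_label=False, elem_id="search-bar")
                leaderboard_table = gr.components.Dataframe(value=df)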
src/load_from_hub.py CHANGED
@@ -5,10 +5,10 @@ import pandas as pd
 
 from src.assets.symbols import UP_ARROW, DOWN_ARROW
 
-from src.tasks import TASKS, TASK_CODES, TASK_TO_METRIC
+from src.tasks import TASKS
 
 
-def load_from_hub(fs, repo_path):
+def load_from_hub(fs, repo_path, is_private=False):
     files = fs.glob(f"{repo_path}/**/*.json")
 
     set_organization_models = {}
@@ -20,7 +20,9 @@ def load_from_hub(fs, repo_path):
         organization_model = f"{organization}/{model}"
         task_code = task.replace(".json", "")
 
-        if task_code not in TASK_CODES:
+        if task_code not in map(lambda task: task.code, TASKS):
+            continue
+        if is_private != list(filter(lambda task: task.code == task_code, TASKS))[0].private_test:
             continue
 
         set_organization_models[organization_model] = 1
@@ -38,12 +40,15 @@ def load_from_hub(fs, repo_path):
         organization_model = f"{organization}/{model}"
         task_code = task.replace(".json", "")
 
-        if task_code not in TASK_CODES:
+        if task_code not in map(lambda task: task.code, TASKS):
+            continue
+        if is_private != list(filter(lambda task: task.code == task_code, TASKS))[0].private_test:
            continue
 
         data = json.loads(fs.open(file, "r").read())
 
-        result = round(data["results"][task_code][TASK_TO_METRIC[task_code]], 4)
+        metric = list(filter(lambda task: task.code == task_code, TASKS))[0].metric
+        result = round(data["results"][task_code][metric], 4)
 
         table.loc[organization_model, task_code] = result
         table.loc[organization_model, "Organization"] = organization
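
Each of the new checks rescans TASKS with list(filter(...))[0] for every result file. A dict keyed by task code expresses the same lookup once; a minimal sketch of that alternative (not part of this commit; TASK_BY_CODE and score_for are hypothetical names):

from src.tasks import TASKS

# Hypothetical one-time lookup table replacing the repeated filter() scans.
TASK_BY_CODE = {task.code: task for task in TASKS}

def score_for(task_code: str, data: dict, is_private: bool):
    """Return the rounded metric for task_code, or None when the code is
    unknown or the task belongs to the other (public/private) split."""
    task = TASK_BY_CODE.get(task_code)
    if task is None or task.private_test != is_private:
        return None
    return round(data["results"][task_code][task.metric], 4)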
src/tasks.py CHANGED
@@ -7,6 +7,7 @@ class Task:
     metric: str
     higher_is_better: bool = True
     num_fewshot: int = 0
+    private_test: bool = False
 
 
 class Lambada(Task):
@@ -15,6 +16,7 @@ class Lambada(Task):
     metric = "ppl"
     higher_is_better = False
     num_fewshot = 0
+    private_test: bool = True
 
 
 class Arc(Task):
@@ -23,6 +25,7 @@ class Arc(Task):
     metric = "acc_norm"
     higher_is_better = True
     num_fewshot = 25
+    private_test: bool = False
 
 
 class HellaSwag(Task):
@@ -31,6 +34,7 @@ class HellaSwag(Task):
     metric = "acc_norm"
     higher_is_better = True
     num_fewshot = 10
+    private_test: bool = False
 
 
 class MMLU(Task):
@@ -39,6 +43,7 @@ class MMLU(Task):
     metric = "acc_norm"
     higher_is_better = True
     num_fewshot = 5
+    private_test: bool = False
 
 
 class TruthfulQA(Task):
@@ -47,6 +52,7 @@ class TruthfulQA(Task):
     metric = "mc2"
     higher_is_better = True
     num_fewshot = 0
+    private_test: bool = False
 
 
 class Grade12Exams(Task):
@@ -55,6 +61,7 @@ class Grade12Exams(Task):
     metric = "acc_norm"
     higher_is_better = True
     num_fewshot = 5
+    private_test: bool = False
 
 
 class IWSLT2023_en_vi(Task):
@@ -63,16 +70,25 @@ class IWSLT2023_en_vi(Task):
     metric = "bleu"
     higher_is_better = True
     num_fewshot = 0
+    private_test: bool = False
 
 
-class XNLI(Task):
-    code = "xnli_vi"
-    name = "XNLI"
-    metric = "acc"
+class WikipediaQA(Task):
+    code = "wikipediaqa_vi"
+    name = "Wikipedia QA"
+    metric = "acc_norm"
+    higher_is_better = True
+    num_fewshot = 5
+    private_test: bool = True
+
+
+class Comprehension(Task):
+    code = "comprehension_vi"
+    name = "Comprehension"
+    metric = "acc_norm"
     higher_is_better = True
     num_fewshot = 0
+    private_test: bool = True
 
 
-TASKS = [Lambada, Arc, HellaSwag, MMLU, TruthfulQA, Grade12Exams, IWSLT2023_en_vi, XNLI]
-TASK_CODES = [task.code for task in TASKS]
-TASK_TO_METRIC = {task.code: task.metric for task in TASKS}
+TASKS = [Arc, HellaSwag, MMLU, TruthfulQA, Grade12Exams] + [Lambada, WikipediaQA, Comprehension]
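
The new private_test flags, combined with the is_private check in load_from_hub, determine which columns each tab shows: with the values above, Arc, HellaSwag, MMLU, TruthfulQA, and Grade12Exams land on the public tab, while Lambada, WikipediaQA, and Comprehension land on the private tab (IWSLT2023_en_vi is no longer listed in TASKS, so it is skipped entirely). A small illustration of that split (the helper tasks_for_split is hypothetical, not part of the commit):

from src.tasks import TASKS

def tasks_for_split(is_private: bool):
    # Hypothetical helper: tasks whose private_test flag matches the requested split.
    return [task for task in TASKS if task.private_test == is_private]

# tasks_for_split(False) -> [Arc, HellaSwag, MMLU, TruthfulQA, Grade12Exams]
# tasks_for_split(True)  -> [Lambada, WikipediaQA, Comprehension]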