Spaces:
Runtime error
Runtime error
hieunguyen1053
committed on
Commit
•
f8b127b
1
Parent(s):
2e61d1f
update
Browse files
- app.py +15 -1
- src/load_from_hub.py +10 -5
- src/tasks.py +23 -7
app.py
CHANGED
@@ -17,7 +17,6 @@ fs = HfFileSystem(token=HIDDEN_TOKEN)
|
|
17 |
def restart_space():
|
18 |
api.restart_space(repo_id="vlsp-2023-vllm/VLLMs-Leaderboard", token=HIDDEN_TOKEN)
|
19 |
|
20 |
-
leaderboard_df = load_from_hub(fs, RESULTS_REPO)
|
21 |
|
22 |
demo = gr.Blocks()
|
23 |
|
@@ -27,6 +26,21 @@ with demo:
|
|
27 |
|
28 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
29 |
with gr.TabItem("Public test", elem_id="llm-benchmark-tab-table", id=0):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
with gr.Column():
|
31 |
with gr.Row():
|
32 |
search_bar = gr.Textbox(
|
|
|
17 |
def restart_space():
|
18 |
api.restart_space(repo_id="vlsp-2023-vllm/VLLMs-Leaderboard", token=HIDDEN_TOKEN)
|
19 |
|
|
|
20 |
|
21 |
demo = gr.Blocks()
|
22 |
|
|
|
26 |
|
27 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
28 |
with gr.TabItem("Public test", elem_id="llm-benchmark-tab-table", id=0):
|
29 |
+
leaderboard_df = load_from_hub(fs, RESULTS_REPO, is_private=False)
|
30 |
+
with gr.Column():
|
31 |
+
with gr.Row():
|
32 |
+
search_bar = gr.Textbox(
|
33 |
+
placeholder=" 🔍 Search for your model and press ENTER...",
|
34 |
+
show_label=False,
|
35 |
+
elem_id="search-bar",
|
36 |
+
)
|
37 |
+
with gr.Row():
|
38 |
+
leaderboard_table = gr.components.Dataframe(
|
39 |
+
value=leaderboard_df,
|
40 |
+
)
|
41 |
+
|
42 |
+
with gr.TabItem("Private test", elem_id="llm-benchmark-tab-table", id=1):
|
43 |
+
leaderboard_df = load_from_hub(fs, RESULTS_REPO, is_private=True)
|
44 |
with gr.Column():
|
45 |
with gr.Row():
|
46 |
search_bar = gr.Textbox(
|
src/load_from_hub.py
CHANGED
@@ -5,10 +5,10 @@ import pandas as pd
|
|
5 |
|
6 |
from src.assets.symbols import UP_ARROW, DOWN_ARROW
|
7 |
|
8 |
-
from src.tasks import TASKS
|
9 |
|
10 |
|
11 |
-
def load_from_hub(fs, repo_path):
|
12 |
files = fs.glob(f"{repo_path}/**/*.json")
|
13 |
|
14 |
set_organization_models = {}
|
@@ -20,7 +20,9 @@ def load_from_hub(fs, repo_path):
|
|
20 |
organization_model = f"{organization}/{model}"
|
21 |
task_code = task.replace(".json", "")
|
22 |
|
23 |
-
if task_code not in
|
|
|
|
|
24 |
continue
|
25 |
|
26 |
set_organization_models[organization_model] = 1
|
@@ -38,12 +40,15 @@ def load_from_hub(fs, repo_path):
|
|
38 |
organization_model = f"{organization}/{model}"
|
39 |
task_code = task.replace(".json", "")
|
40 |
|
41 |
-
if task_code not in
|
|
|
|
|
42 |
continue
|
43 |
|
44 |
data = json.loads(fs.open(file, "r").read())
|
45 |
|
46 |
-
|
|
|
47 |
|
48 |
table.loc[organization_model, task_code] = result
|
49 |
table.loc[organization_model, "Organization"] = organization
|
|
|
5 |
|
6 |
from src.assets.symbols import UP_ARROW, DOWN_ARROW
|
7 |
|
8 |
+
from src.tasks import TASKS
|
9 |
|
10 |
|
11 |
+
def load_from_hub(fs, repo_path, is_private=False):
|
12 |
files = fs.glob(f"{repo_path}/**/*.json")
|
13 |
|
14 |
set_organization_models = {}
|
|
|
20 |
organization_model = f"{organization}/{model}"
|
21 |
task_code = task.replace(".json", "")
|
22 |
|
23 |
+
if task_code not in map(lambda task: task.code, TASKS):
|
24 |
+
continue
|
25 |
+
if is_private != list(filter(lambda task: task.code == task_code, TASKS))[0].private_test:
|
26 |
continue
|
27 |
|
28 |
set_organization_models[organization_model] = 1
|
|
|
40 |
organization_model = f"{organization}/{model}"
|
41 |
task_code = task.replace(".json", "")
|
42 |
|
43 |
+
if task_code not in map(lambda task: task.code, TASKS):
|
44 |
+
continue
|
45 |
+
if is_private != list(filter(lambda task: task.code == task_code, TASKS))[0].private_test:
|
46 |
continue
|
47 |
|
48 |
data = json.loads(fs.open(file, "r").read())
|
49 |
|
50 |
+
metric = list(filter(lambda task: task.code == task_code, TASKS))[0].metric
|
51 |
+
result = round(data["results"][task_code][metric], 4)
|
52 |
|
53 |
table.loc[organization_model, task_code] = result
|
54 |
table.loc[organization_model, "Organization"] = organization
|
src/tasks.py
CHANGED
@@ -7,6 +7,7 @@ class Task:
|
|
7 |
metric: str
|
8 |
higher_is_better: bool = True
|
9 |
num_fewshot: int = 0
|
|
|
10 |
|
11 |
|
12 |
class Lambada(Task):
|
@@ -15,6 +16,7 @@ class Lambada(Task):
|
|
15 |
metric = "ppl"
|
16 |
higher_is_better = False
|
17 |
num_fewshot = 0
|
|
|
18 |
|
19 |
|
20 |
class Arc(Task):
|
@@ -23,6 +25,7 @@ class Arc(Task):
|
|
23 |
metric = "acc_norm"
|
24 |
higher_is_better = True
|
25 |
num_fewshot = 25
|
|
|
26 |
|
27 |
|
28 |
class HellaSwag(Task):
|
@@ -31,6 +34,7 @@ class HellaSwag(Task):
|
|
31 |
metric = "acc_norm"
|
32 |
higher_is_better = True
|
33 |
num_fewshot = 10
|
|
|
34 |
|
35 |
|
36 |
class MMLU(Task):
|
@@ -39,6 +43,7 @@ class MMLU(Task):
|
|
39 |
metric = "acc_norm"
|
40 |
higher_is_better = True
|
41 |
num_fewshot = 5
|
|
|
42 |
|
43 |
|
44 |
class TruthfulQA(Task):
|
@@ -47,6 +52,7 @@ class TruthfulQA(Task):
|
|
47 |
metric = "mc2"
|
48 |
higher_is_better = True
|
49 |
num_fewshot = 0
|
|
|
50 |
|
51 |
|
52 |
class Grade12Exams(Task):
|
@@ -55,6 +61,7 @@ class Grade12Exams(Task):
|
|
55 |
metric = "acc_norm"
|
56 |
higher_is_better = True
|
57 |
num_fewshot = 5
|
|
|
58 |
|
59 |
|
60 |
class IWSLT2023_en_vi(Task):
|
@@ -63,16 +70,25 @@ class IWSLT2023_en_vi(Task):
|
|
63 |
metric = "bleu"
|
64 |
higher_is_better = True
|
65 |
num_fewshot = 0
|
|
|
66 |
|
67 |
|
68 |
-
class
|
69 |
-
code = "
|
70 |
-
name = "
|
71 |
-
metric = "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
72 |
higher_is_better = True
|
73 |
num_fewshot = 0
|
|
|
74 |
|
75 |
|
76 |
-
TASKS = [
|
77 |
-
TASK_CODES = [task.code for task in TASKS]
|
78 |
-
TASK_TO_METRIC = {task.code: task.metric for task in TASKS}
|
|
|
7 |
metric: str
|
8 |
higher_is_better: bool = True
|
9 |
num_fewshot: int = 0
|
10 |
+
private_test: bool = False
|
11 |
|
12 |
|
13 |
class Lambada(Task):
|
|
|
16 |
metric = "ppl"
|
17 |
higher_is_better = False
|
18 |
num_fewshot = 0
|
19 |
+
private_test: bool = True
|
20 |
|
21 |
|
22 |
class Arc(Task):
|
|
|
25 |
metric = "acc_norm"
|
26 |
higher_is_better = True
|
27 |
num_fewshot = 25
|
28 |
+
private_test: bool = False
|
29 |
|
30 |
|
31 |
class HellaSwag(Task):
|
|
|
34 |
metric = "acc_norm"
|
35 |
higher_is_better = True
|
36 |
num_fewshot = 10
|
37 |
+
private_test: bool = False
|
38 |
|
39 |
|
40 |
class MMLU(Task):
|
|
|
43 |
metric = "acc_norm"
|
44 |
higher_is_better = True
|
45 |
num_fewshot = 5
|
46 |
+
private_test: bool = False
|
47 |
|
48 |
|
49 |
class TruthfulQA(Task):
|
|
|
52 |
metric = "mc2"
|
53 |
higher_is_better = True
|
54 |
num_fewshot = 0
|
55 |
+
private_test: bool = False
|
56 |
|
57 |
|
58 |
class Grade12Exams(Task):
|
|
|
61 |
metric = "acc_norm"
|
62 |
higher_is_better = True
|
63 |
num_fewshot = 5
|
64 |
+
private_test: bool = False
|
65 |
|
66 |
|
67 |
class IWSLT2023_en_vi(Task):
|
|
|
70 |
metric = "bleu"
|
71 |
higher_is_better = True
|
72 |
num_fewshot = 0
|
73 |
+
private_test: bool = False
|
74 |
|
75 |
|
76 |
+
class WikipediaQA(Task):
|
77 |
+
code = "wikipediaqa_vi"
|
78 |
+
name = "Wikipedia QA"
|
79 |
+
metric = "acc_norm"
|
80 |
+
higher_is_better = True
|
81 |
+
num_fewshot = 5
|
82 |
+
private_test: bool = True
|
83 |
+
|
84 |
+
|
85 |
+
class Comprehension(Task):
|
86 |
+
code = "comprehension_vi"
|
87 |
+
name = "Comprehension"
|
88 |
+
metric = "acc_norm"
|
89 |
higher_is_better = True
|
90 |
num_fewshot = 0
|
91 |
+
private_test: bool = True
|
92 |
|
93 |
|
94 |
+
TASKS = [Arc, HellaSwag, MMLU, TruthfulQA, Grade12Exams] + [Lambada, WikipediaQA, Comprehension]
|
|
|
|