Spaces:

vlsp-2023-vllm
/

VLLMs-Leaderboard

Runtime error

hieunguyen1053 committed on Nov 19, 2023

Commit

f8b127b

•

1 Parent(s): 2e61d1f

update

Files changed (3) hide show

app.py CHANGED Viewed

@@ -17,7 +17,6 @@ fs = HfFileSystem(token=HIDDEN_TOKEN)
 def restart_space():
     api.restart_space(repo_id="vlsp-2023-vllm/VLLMs-Leaderboard", token=HIDDEN_TOKEN)
-leaderboard_df = load_from_hub(fs, RESULTS_REPO)
 demo = gr.Blocks()
@@ -27,6 +26,21 @@ with demo:
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("Public test", elem_id="llm-benchmark-tab-table", id=0):
             with gr.Column():
                 with gr.Row():
                     search_bar = gr.Textbox(

 def restart_space():
     api.restart_space(repo_id="vlsp-2023-vllm/VLLMs-Leaderboard", token=HIDDEN_TOKEN)
 demo = gr.Blocks()
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("Public test", elem_id="llm-benchmark-tab-table", id=0):
+            leaderboard_df = load_from_hub(fs, RESULTS_REPO, is_private=False)
+            with gr.Column():
+                with gr.Row():
+                    search_bar = gr.Textbox(
+                        placeholder=" 🔍 Search for your model and press ENTER...",
+                        show_label=False,
+                        elem_id="search-bar",
+                    )
+                with gr.Row():
+                    leaderboard_table = gr.components.Dataframe(
+                        value=leaderboard_df,
+                    )
+        with gr.TabItem("Private test", elem_id="llm-benchmark-tab-table", id=1):
+            leaderboard_df = load_from_hub(fs, RESULTS_REPO, is_private=True)
             with gr.Column():
                 with gr.Row():
                     search_bar = gr.Textbox(

src/load_from_hub.py CHANGED Viewed

@@ -5,10 +5,10 @@ import pandas as pd
 from src.assets.symbols import UP_ARROW, DOWN_ARROW
-from src.tasks import TASKS, TASK_CODES, TASK_TO_METRIC
-def load_from_hub(fs, repo_path):
     files = fs.glob(f"{repo_path}/**/*.json")
     set_organization_models = {}
@@ -20,7 +20,9 @@ def load_from_hub(fs, repo_path):
         organization_model = f"{organization}/{model}"
         task_code = task.replace(".json", "")
-        if task_code not in TASK_CODES:
             continue
         set_organization_models[organization_model] = 1
@@ -38,12 +40,15 @@ def load_from_hub(fs, repo_path):
         organization_model = f"{organization}/{model}"
         task_code = task.replace(".json", "")
-        if task_code not in TASK_CODES:
             continue
         data = json.loads(fs.open(file, "r").read())
-        result = round(data["results"][task_code][TASK_TO_METRIC[task_code]], 4)
         table.loc[organization_model, task_code] = result
         table.loc[organization_model, "Organization"] = organization

 from src.assets.symbols import UP_ARROW, DOWN_ARROW
+from src.tasks import TASKS
+def load_from_hub(fs, repo_path, is_private=False):
     files = fs.glob(f"{repo_path}/**/*.json")
     set_organization_models = {}
         organization_model = f"{organization}/{model}"
         task_code = task.replace(".json", "")
+        if task_code not in map(lambda task: task.code, TASKS):
+            continue
+        if is_private != list(filter(lambda task: task.code == task_code, TASKS))[0].private_test:
             continue
         set_organization_models[organization_model] = 1
         organization_model = f"{organization}/{model}"
         task_code = task.replace(".json", "")
+        if task_code not in map(lambda task: task.code, TASKS):
+            continue
+        if is_private != list(filter(lambda task: task.code == task_code, TASKS))[0].private_test:
             continue
         data = json.loads(fs.open(file, "r").read())
+        metric = list(filter(lambda task: task.code == task_code, TASKS))[0].metric
+        result = round(data["results"][task_code][metric], 4)
         table.loc[organization_model, task_code] = result
         table.loc[organization_model, "Organization"] = organization

src/tasks.py CHANGED Viewed

@@ -7,6 +7,7 @@ class Task:
     metric: str
     higher_is_better: bool = True
     num_fewshot: int = 0
 class Lambada(Task):
@@ -15,6 +16,7 @@ class Lambada(Task):
     metric = "ppl"
     higher_is_better = False
     num_fewshot = 0
 class Arc(Task):
@@ -23,6 +25,7 @@ class Arc(Task):
     metric = "acc_norm"
     higher_is_better = True
     num_fewshot = 25
 class HellaSwag(Task):
@@ -31,6 +34,7 @@ class HellaSwag(Task):
     metric = "acc_norm"
     higher_is_better = True
     num_fewshot = 10
 class MMLU(Task):
@@ -39,6 +43,7 @@ class MMLU(Task):
     metric = "acc_norm"
     higher_is_better = True
     num_fewshot = 5
 class TruthfulQA(Task):
@@ -47,6 +52,7 @@ class TruthfulQA(Task):
     metric = "mc2"
     higher_is_better = True
     num_fewshot = 0
 class Grade12Exams(Task):
@@ -55,6 +61,7 @@ class Grade12Exams(Task):
     metric = "acc_norm"
     higher_is_better = True
     num_fewshot = 5
 class IWSLT2023_en_vi(Task):
@@ -63,16 +70,25 @@ class IWSLT2023_en_vi(Task):
     metric = "bleu"
     higher_is_better = True
     num_fewshot = 0
-class XNLI(Task):
-    code = "xnli_vi"
-    name = "XNLI"
-    metric = "acc"
     higher_is_better = True
     num_fewshot = 0
-TASKS = [Lambada, Arc, HellaSwag, MMLU, TruthfulQA, Grade12Exams, IWSLT2023_en_vi, XNLI]
-TASK_CODES = [task.code for task in TASKS]
-TASK_TO_METRIC = {task.code: task.metric for task in TASKS}

     metric: str
     higher_is_better: bool = True
     num_fewshot: int = 0
+    private_test: bool = False
 class Lambada(Task):
     metric = "ppl"
     higher_is_better = False
     num_fewshot = 0
+    private_test: bool = True
 class Arc(Task):
     metric = "acc_norm"
     higher_is_better = True
     num_fewshot = 25
+    private_test: bool = False
 class HellaSwag(Task):
     metric = "acc_norm"
     higher_is_better = True
     num_fewshot = 10
+    private_test: bool = False
 class MMLU(Task):
     metric = "acc_norm"
     higher_is_better = True
     num_fewshot = 5
+    private_test: bool = False
 class TruthfulQA(Task):
     metric = "mc2"
     higher_is_better = True
     num_fewshot = 0
+    private_test: bool = False
 class Grade12Exams(Task):
     metric = "acc_norm"
     higher_is_better = True
     num_fewshot = 5
+    private_test: bool = False
 class IWSLT2023_en_vi(Task):
     metric = "bleu"
     higher_is_better = True
     num_fewshot = 0
+    private_test: bool = False
+class WikipediaQA(Task):
+    code = "wikipediaqa_vi"
+    name = "Wikipedia QA"
+    metric = "acc_norm"
+    higher_is_better = True
+    num_fewshot = 5
+    private_test: bool = True
+class Comprehension(Task):
+    code = "comprehension_vi"
+    name = "Comprehension"
+    metric = "acc_norm"
     higher_is_better = True
     num_fewshot = 0
+    private_test: bool = True
+TASKS = [Arc, HellaSwag, MMLU, TruthfulQA, Grade12Exams] + [Lambada, WikipediaQA, Comprehension]