Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Clémentine
commited on
Commit
•
db7f350
1
Parent(s):
3bbac29
init
Browse files- README.md +7 -6
- app.py +179 -0
- content.py +32 -0
- requirements.txt +8 -0
README.md
CHANGED
@@ -1,12 +1,13 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: gradio
|
7 |
-
sdk_version: 3.
|
8 |
app_file: app.py
|
9 |
-
pinned:
|
|
|
10 |
---
|
11 |
|
12 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
---
|
2 |
+
title: BALM Leaderboard
|
3 |
+
emoji: 🦾
|
4 |
+
colorFrom: orange
|
5 |
+
colorTo: indigo
|
6 |
sdk: gradio
|
7 |
+
sdk_version: 3.27.0
|
8 |
app_file: app.py
|
9 |
+
pinned: true
|
10 |
+
license: apache-2.0
|
11 |
---
|
12 |
|
13 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from email.utils import parseaddr
|
3 |
+
|
4 |
+
import gradio as gr
|
5 |
+
import pandas as pd
|
6 |
+
|
7 |
+
from datasets import load_dataset
|
8 |
+
from apscheduler.schedulers.background import BackgroundScheduler
|
9 |
+
from huggingface_hub import HfApi
|
10 |
+
|
11 |
+
# InfoStrings
|
12 |
+
from content import *
|
13 |
+
|
14 |
+
# Hub access token read from the environment; None when the variable is unset.
BALM_TOKEN = os.environ.get("BALM_TOKEN")
owner = "clefourrier"  # change to balm once possible

api = HfApi()

# Dev-set results keyed by level number (1, 2, 3): the loaded dataset, and a
# DataFrame view of it with the "mail" column removed.
eval_results = {}
eval_dataframe = {}
for level in (1, 2, 3):
    eval_results[level] = load_dataset(
        f"{owner}/BALM_ResultsDev{level}",
        token=BALM_TOKEN,
        split="dev",
    )
    eval_dataframe[level] = pd.DataFrame(
        eval_results[level].remove_column("mail")
    )
|
24 |
+
|
25 |
+
def restart_space():
    """Request a restart of the leaderboard Space through the Hub API client."""
    space_id = f"{owner}/BALM_Leaderboard"
    api.restart_space(repo_id=space_id, token=BALM_TOKEN)
|
27 |
+
|
28 |
+
|
29 |
+
COLS = ["Model", "Organisation", "Reported accuracy ⬆️"]
|
30 |
+
TYPES = ["str", "str", "number",]
|
31 |
+
|
32 |
+
def add_new_eval(
    level_of_dev: str,
    model: str,
    score: float,
    organisation: str,
    mail: str,
):
    """Validate a submission and add it to the in-memory results of its level.

    Args:
        level_of_dev: Label whose last space-separated token is the level
            number, e.g. "Level 2".
        model: Name of the submitted model.
        score: Reported score. It may arrive as text from the form, so it is
            converted with ``float``.
        organisation: Name of the submitting organisation.
        mail: Contact email, stored with the entry exactly as typed.

    Returns:
        An HTML paragraph: orange when the submission is rejected, green when
        it has been added.
    """
    import html  # local import: only needed to escape user text below

    paragraph = "<p style='color: {}; font-size: 20px; text-align: center;'>{}</p>"

    level = int(level_of_dev.split(" ")[-1])

    # Very basic email parsing
    _, parsed_mail = parseaddr(mail)
    if "@" not in parsed_mail:
        return paragraph.format("orange", "Please provide a valid email address.")

    # The score can be free text typed by the user: reject anything that is
    # not a number instead of storing it.
    try:
        score = float(score)
    except (TypeError, ValueError):
        return paragraph.format("orange", "Please provide a numerical score.")

    print("Adding new eval")

    # Reject the submission only when this exact (model, organisation) pair
    # is already present; both sides are compared case-insensitively.
    results = eval_results[level]
    already_submitted = {
        (str(known_model).lower(), str(known_org).lower())
        for known_model, known_org in zip(results["model"], results["organisation"])
    }
    if (model.lower(), organisation.lower()) in already_submitted:
        return paragraph.format("orange", "This model has been already submitted.")

    # Actual submission
    eval_entry = {
        "model": model,
        "score": score,
        "organisation": organisation,
        "mail": mail,
    }
    # Dataset.add_item returns a new dataset instead of mutating in place, so
    # the result has to be stored back.
    # NOTE(review): the entry is only kept in memory here; nothing in this
    # function uploads it, so it looks like it is lost on restart - confirm.
    eval_results[level] = results.add_item(eval_entry)

    # Model and organisation are user-provided: escape them before putting
    # them into HTML.
    success_message = (
        f"Model {html.escape(model)} submitted by {html.escape(organisation)}."
    )
    return paragraph.format("green", success_message)
|
65 |
+
|
66 |
+
|
67 |
+
def refresh():
    """Reload the dev-set results of levels 1 to 3 and return one table each.

    Every dataset is loaded again and its "mail" column is removed before it
    is turned into a DataFrame. Only local variables are filled here.

    Returns:
        A tuple of three DataFrames, for levels 1, 2 and 3 in that order.
    """
    tables = []
    for level in (1, 2, 3):
        dev_results = load_dataset(
            f"{owner}/BALM_ResultsDev{level}", token=BALM_TOKEN, split="dev"
        )
        tables.append(pd.DataFrame(dev_results.remove_column("mail")))
    level_1, level_2, level_3 = tables
    return level_1, level_2, level_3
|
74 |
+
|
75 |
+
|
76 |
+
custom_css = """
|
77 |
+
#changelog-text {
|
78 |
+
font-size: 16px !important;
|
79 |
+
}
|
80 |
+
|
81 |
+
#changelog-text h2 {
|
82 |
+
font-size: 18px !important;
|
83 |
+
}
|
84 |
+
|
85 |
+
.markdown-text {
|
86 |
+
font-size: 16px !important;
|
87 |
+
}
|
88 |
+
|
89 |
+
#citation-button span {
|
90 |
+
font-size: 16px !important;
|
91 |
+
}
|
92 |
+
|
93 |
+
#citation-button textarea {
|
94 |
+
font-size: 16px !important;
|
95 |
+
}
|
96 |
+
|
97 |
+
#citation-button > label > button {
|
98 |
+
margin: 6px;
|
99 |
+
transform: scale(1.3);
|
100 |
+
}
|
101 |
+
"""
|
102 |
+
|
103 |
+
demo = gr.Blocks(css=custom_css)
|
104 |
+
with demo:
|
105 |
+
gr.HTML(TITLE)
|
106 |
+
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
107 |
+
|
108 |
+
with gr.Row():
|
109 |
+
with gr.Column():
|
110 |
+
with gr.Accordion("📙 Citation", open=False):
|
111 |
+
citation_button = gr.Textbox(
|
112 |
+
value=CITATION_BUTTON_TEXT,
|
113 |
+
label=CITATION_BUTTON_LABEL,
|
114 |
+
elem_id="citation-button",
|
115 |
+
).style(show_copy_button=True)
|
116 |
+
with gr.Column():
|
117 |
+
with gr.Accordion("✨ CHANGELOG", open=False):
|
118 |
+
changelog = gr.Markdown(CHANGELOG_TEXT, elem_id="changelog-text")
|
119 |
+
|
120 |
+
with gr.Tab("Results: Level 1"):
|
121 |
+
with gr.Tab("Results on Dev Set"):
|
122 |
+
leaderboard_table_1 = gr.components.Dataframe(
|
123 |
+
value=eval_dataframe[1], headers=COLS, datatype=TYPES, max_rows=20
|
124 |
+
)
|
125 |
+
with gr.Tab("Results on Test Set"):
|
126 |
+
gr.Textbox(value="The test set is currently private! Come back when performances on the dev set increased!")
|
127 |
+
with gr.Tab("Results: Level 2"):
|
128 |
+
with gr.Tab("Results on Dev Set"):
|
129 |
+
leaderboard_table_2 = gr.components.Dataframe(
|
130 |
+
value=eval_dataframe[2], headers=COLS, datatype=TYPES, max_rows=20
|
131 |
+
)
|
132 |
+
with gr.Tab("Results on Test Set"):
|
133 |
+
gr.Textbox(value="The test set is currently private! Come back when performances on the dev set increased!")
|
134 |
+
with gr.Tab("Results: Level 3"):
|
135 |
+
with gr.Tab("Results on Dev Set"):
|
136 |
+
leaderboard_table_3 = gr.components.Dataframe(
|
137 |
+
value=eval_dataframe[3], headers=COLS, datatype=TYPES, max_rows=20
|
138 |
+
)
|
139 |
+
with gr.Tab("Results on Test Set"):
|
140 |
+
gr.Textbox(value="The test set is currently private! Come back when performances on the dev set increased!")
|
141 |
+
|
142 |
+
# Reload the three leaderboards on demand. The outputs must be the Dataframe
# components shown on the page (not the pandas objects they were created
# from), so that the values returned by refresh() are displayed.
refresh_button = gr.Button("Refresh")
refresh_button.click(
    refresh,
    inputs=[],
    outputs=[
        leaderboard_table_1,
        leaderboard_table_2,
        leaderboard_table_3,
    ],
)
|
152 |
+
|
153 |
+
with gr.Accordion("Submit a new model for evaluation"):
|
154 |
+
#with gr.Row():
|
155 |
+
with gr.Column():
|
156 |
+
level_of_dev = gr.Radio(["Level 1", "Level 2", "Level 3"], value="Level 1", label="Dev set")
|
157 |
+
model_name_textbox = gr.Textbox(label="Model name")
|
158 |
+
score = gr.Textbox(label="Score")
|
159 |
+
organisation = gr.Textbox(label="Organisation")
|
160 |
+
mail = gr.Textbox(label="Contact email")
|
161 |
+
|
162 |
+
submit_button = gr.Button("Submit Eval")
|
163 |
+
submission_result = gr.Markdown()
|
164 |
+
submit_button.click(
|
165 |
+
add_new_eval,
|
166 |
+
[
|
167 |
+
level_of_dev,
|
168 |
+
model_name_textbox,
|
169 |
+
score,
|
170 |
+
organisation,
|
171 |
+
mail
|
172 |
+
],
|
173 |
+
submission_result,
|
174 |
+
)
|
175 |
+
|
176 |
+
scheduler = BackgroundScheduler()
|
177 |
+
scheduler.add_job(restart_space, "interval", seconds=3600)
|
178 |
+
scheduler.start()
|
179 |
+
demo.launch()
|
content.py
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
CHANGELOG_TEXT = f"""
|
2 |
+
## [2023-06-02]
|
3 |
+
- Beta internal version of the leaderboard
|
4 |
+
"""
|
5 |
+
|
6 |
+
TITLE = """<h1 align="center" id="space-title">BALM Leaderboard</h1>"""
|
7 |
+
|
8 |
+
CANARY_STRING = "" # TODO
|
9 |
+
|
10 |
+
INTRODUCTION_TEXT = f"""
|
11 |
+
Large language models have seen their potential capabilities increased by several orders of magnitude with the introduction of augmentations, from simple prompting adjustment to actual external tooling (calculators, vision models, ...) or online web retrieval.
|
12 |
+
|
13 |
+
To evaluate the next generation of LLMs, we argue for a new kind of benchmark, simple and yet effective to measure actual progress on augmented capabilities.
|
14 |
+
We therefore present BALM.
|
15 |
+
|
16 |
+
BALM is made of 3 evaluation levels, depending on the added level of tooling and autonomy the model needs.
|
17 |
+
We expect the level 1 to be breakable by very good LLMs, and the level 3 to indicate a strong jump in model capabilities.
|
18 |
+
|
19 |
+
Each of these levels is divided into two sets: a public dev set, on which people can self report their results, and a private test set, which will be unlocked once public performance passes a threshold on the dev set.
|
20 |
+
|
21 |
+
Please do not repost the public dev set, nor use it in training data for your models. Its canary string is """ + CANARY_STRING + """ and files containing this string should be removed from training data.
|
22 |
+
"""
|
23 |
+
|
24 |
+
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
25 |
+
CITATION_BUTTON_TEXT = r"""@misc{balm, # TODO
|
26 |
+
author = {tbd},
|
27 |
+
title = {Benchmark for Augmented Language Models},
|
28 |
+
year = {2023},
|
29 |
+
#publisher = {Hugging Face},
|
30 |
+
#howpublished = "\url{https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard}"
|
31 |
+
}"""
|
32 |
+
|
requirements.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
datasets
|
2 |
+
gradio==3.27.0
|
3 |
+
gradio_client==0.1.3
|
4 |
+
huggingface-hub==0.13.4
|
5 |
+
pandas==2.0.0
|
6 |
+
tqdm==4.65.0
|
7 |
+
transformers==4.28.1
|
8 |
+
typing_extensions==4.5.0
|