Spaces:
Runtime error
Upload 8 files
- LICENSE +21 -0
- app.py +215 -0
- requirements.txt +3 -0
- src/assets/text_content.py +53 -0
- src/leaderboard_utils.py +142 -0
- src/plot_utils.py +133 -0
- src/reload.py +78 -0
- src/reload_utils.py +82 -0
LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 clembench
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
app.py
ADDED
@@ -0,0 +1,215 @@
+import gradio as gr
+
+from src.assets.text_content import TITLE, INTRODUCTION_TEXT
+from src.leaderboard_utils import filter_search, get_github_data
+from src.plot_utils import split_models, compare_plots
+
+# from src.reload_utils import ReloadData
+from src.reload import get_primary_leaderboard, get_open_models, get_closed_models, get_plot_df, get_version_names, get_version_df, get_prev_df
+
+reload_time = 5
+
+# # For Leaderboards
+# # Get CSV data
+# global primary_leaderboard_df, version_dfs, version_names
+# primary_leaderboard_df, version_dfs, version_names = get_github_data()
+
+# global prev_df
+# prev_df = version_dfs[0]
+# def select_prev_df(name):
+#     ind = version_names.index(name)
+#     prev_df = version_dfs[ind]
+#     return prev_df
+
+# # For Plots
+# global plot_df, OPEN_MODELS, CLOSED_MODELS
+# plot_df = primary_leaderboard_df[0]
+# MODELS = list(plot_df[list(plot_df.columns)[0]].unique())
+# OPEN_MODELS, CLOSED_MODELS = split_models(MODELS)
+
+
+# MAIN APPLICATION
+main_app = gr.Blocks()
+with main_app:
+    gr.HTML(TITLE)
+    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+
+    with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.TabItem("🥇 CLEM Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
+            with gr.Row():
+                search_bar = gr.Textbox(
+                    placeholder=" 🔍 Search for models - separate multiple queries with `;` and press ENTER...",
+                    show_label=False,
+                    elem_id="search-bar",
+                )
+
+            leaderboard_table = gr.DataFrame(
+                value=get_primary_leaderboard,
+                elem_id="leaderboard-table",
+                interactive=False,
+                visible=True,
+                every=reload_time
+            )
+
+            # Dummy leaderboard to serve search queries from primary_leaderboard_df without updating it
+            dummy_leaderboard_table = gr.Dataframe(
+                value=get_primary_leaderboard,
+                elem_id="leaderboard-table",
+                interactive=False,
+                visible=False,
+                every=reload_time
+            )
+
+            search_bar.submit(
+                filter_search,
+                [dummy_leaderboard_table, search_bar],
+                leaderboard_table,
+                queue=True
+            )
+
+        with gr.TabItem("📈 Plot", id=3):
+            with gr.Row():
+                open_models_selection = gr.CheckboxGroup(
+                    choices=get_open_models(),
+                    label="Open-weight Models 🌐",
+                    value=[],
+                    elem_id="value-select",
+                    interactive=True,
+                    every=reload_time
+                )
+
+            with gr.Row():
+                closed_models_selection = gr.CheckboxGroup(
+                    choices=get_closed_models(),
+                    label="Closed-weight Models 💼",
+                    value=[],
+                    elem_id="value-select-2",
+                    interactive=True,
+                    every=reload_time
+                )
+
+            with gr.Row():
+                with gr.Column():
+                    show_all = gr.CheckboxGroup(
+                        ["Select All Models"],
+                        label="Show plot for all models 🤖",
+                        value=[],
+                        elem_id="value-select-3",
+                        interactive=True,
+                    )
+
+                with gr.Column():
+                    show_names = gr.CheckboxGroup(
+                        ["Show Names"],
+                        label="Show names of models on the plot 🏷️",
+                        value=[],
+                        elem_id="value-select-4",
+                        interactive=True,
+                    )
+
+                with gr.Column():
+                    show_legend = gr.CheckboxGroup(
+                        ["Show Legend"],
+                        label="Show legend on the plot 💡",
+                        value=[],
+                        elem_id="value-select-5",
+                        interactive=True,
+                    )
+
+            with gr.Row():
+                dummy_plot_df = gr.DataFrame(
+                    value=get_plot_df,
+                    visible=False,
+                    every=reload_time
+                )
+
+            with gr.Row():
+                with gr.Column():
+                    # Output block for the plot
+                    plot_output = gr.Plot()
+
+            open_models_selection.change(
+                compare_plots,
+                [dummy_plot_df, open_models_selection, closed_models_selection, show_all, show_names, show_legend],
+                plot_output,
+                queue=True
+            )
+
+            closed_models_selection.change(
+                compare_plots,
+                [dummy_plot_df, open_models_selection, closed_models_selection, show_all, show_names, show_legend],
+                plot_output,
+                queue=True
+            )
+
+            show_all.change(
+                compare_plots,
+                [dummy_plot_df, open_models_selection, closed_models_selection, show_all, show_names, show_legend],
+                plot_output,
+                queue=True
+            )
+
+            show_names.change(
+                compare_plots,
+                [dummy_plot_df, open_models_selection, closed_models_selection, show_all, show_names, show_legend],
+                plot_output,
+                queue=True
+            )
+
+            show_legend.change(
+                compare_plots,
+                [dummy_plot_df, open_models_selection, closed_models_selection, show_all, show_names, show_legend],
+                plot_output,
+                queue=True
+            )
+
+        with gr.TabItem("🔄 Versions and Details", elem_id="details", id=2):
+            with gr.Row():
+                version_select = gr.Dropdown(
+                    choices=get_version_names(),
+                    label="Select Version 🕹️",
+                    value=get_version_names()[0],
+                    every=reload_time
+                )
+            with gr.Row():
+                search_bar_prev = gr.Textbox(
+                    placeholder=" 🔍 Search for models - separate multiple queries with `;` and press ENTER...",
+                    show_label=False,
+                    elem_id="search-bar-2",
+                )
+
+            prev_table = gr.Dataframe(
+                value=get_prev_df,
+                elem_id="leaderboard-table",
+                interactive=False,
+                visible=True,
+                every=reload_time
+            )
+
+            dummy_prev_table = gr.Dataframe(
+                value=get_prev_df,
+                elem_id="leaderboard-table",
+                interactive=False,
+                visible=False,
+                every=reload_time
+            )
+
+            search_bar_prev.submit(
+                filter_search,
+                [dummy_prev_table, search_bar_prev],
+                prev_table,
+                queue=True
+            )
+
+            version_select.change(
+                get_prev_df,
+                [version_select],
+                prev_table,
+                queue=True,
+                every=reload_time
+            )
+main_app.load()
+
+main_app.queue()
+
+main_app.launch()
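
A note on the refresh pattern used throughout app.py: the data-bearing components pass a function object (e.g. value=get_primary_leaderboard, no parentheses) together with every=reload_time, and Gradio re-invokes that callable every reload_time seconds while a client is connected. A minimal sketch of the same mechanism, using a hypothetical current_time helper that is not part of this Space:

import datetime
import gradio as gr

def current_time() -> str:
    # Re-executed by Gradio on every refresh tick
    return datetime.datetime.now().isoformat(timespec="seconds")

with gr.Blocks() as demo:
    # The callable (no parentheses) is re-polled every 5 seconds
    gr.Textbox(value=current_time, label="Server time", every=5)

demo.queue()   # `every` relies on the queue being enabled
demo.launch()
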
requirements.txt
ADDED
@@ -0,0 +1,3 @@
+gradio==3.43.2
+pandas==2.0.0
+plotly==5.18.0
src/assets/text_content.py
ADDED
@@ -0,0 +1,53 @@
+TITLE = """<h1 align="center" id="space-title"> 🏆 CLEM Leaderboard</h1>"""
+
+INTRODUCTION_TEXT = """
+<h6 align="center">
+The CLEM Leaderboard aims to track, rank and evaluate current cLLMs (chat-optimized Large Language Models), with the suggested pronunciation “clems”.
+
+The benchmarking approach is described in [Clembench: Using Game Play to Evaluate Chat-Optimized Language Models as Conversational Agents](https://arxiv.org/abs/2305.13455).
+
+Source code for benchmarking "clems" is available here: [Clembench](https://github.com/clembench/clembench)
+
+All generated files and results from the benchmark runs are available here: [clembench-runs](https://github.com/clembench/clembench-runs) </h6>
+"""
+
+SHORT_NAMES = {
+    "t0.0": "",
+    "claude-v1.3": "cl-1.3",
+    "claude-2": "cl-2",
+    "claude-2.1": "cl-2.1",
+    "claude-instant-1.2": "cl-ins-1.2",
+    "gpt-3.5-turbo-0613": "3.5-0613",
+    "gpt-3.5-turbo-1106": "3.5-1106",
+    "gpt-4-0613": "4-0613",
+    "gpt-4-1106-preview": "4-1106",
+    "gpt-4-0314": "4-0314",
+    "gpt-4": "4",
+    "text-davinci-003": "3",
+    "luminous-supreme": "lm",
+    "koala-13b": "k-13b",
+    "falcon-40b": "fal-40b",
+    "falcon-7b-instruct": "fal-7b",
+    "falcon-40b-instruct": "flc-i-40b",
+    "oasst-12b": "oas-12b",
+    "oasst-sft-4-pythia-12b-epoch-3.5": "ost-12b",
+    "vicuna-13b": "vic-13b",
+    "vicuna-33b-v1.3": "vic-33b-v1.3",
+    "sheep-duck-llama-2-70b-v1.1": "sd-l2-70b-v1.1",
+    "sheep-duck-llama-2-13b": "sd-l2-13b",
+    "WizardLM-70b-v1.0": "w-70b-v1.0",
+    "CodeLlama-34b-Instruct-hf": "cl-34b",
+    "command": "com",
+    "Mistral-7B-Instruct-v0.1": "m-i-7b-v0.1",
+    "Wizard-Vicuna-13B-Uncensored-HF": "vcn-13b",
+    "llama-2-13b-chat-hf": "l2-13b",
+    "llama-2-70b-chat-hf": "l2-70b",
+    "llama-2-7b-chat-hf": "l2-7b",
+    "koala-13B-HF": "k-13b",
+    "WizardLM-13b-v1.2": "w-13b-v1.2",
+    "vicuna-7b-v1.5": "vic-7b-v1.5",
+    "vicuna-13b-v1.5": "vic-13b-v1.5",
+    "gpt4all-13b-snoozy": "g4a-13b-s",
+    "zephyr-7b-alpha": "z-7b-a",
+    "zephyr-7b-beta": "z-7b-b"
+}
src/leaderboard_utils.py
ADDED
@@ -0,0 +1,142 @@
+import os
+import pandas as pd
+import requests, json
+from io import StringIO
+
+from datetime import datetime
+
+
+def get_github_data():
+    '''
+    Get data from csv files on GitHub
+    Args:
+        None
+    Returns:
+        latest_df: singular list containing dataframe of the latest version of the leaderboard with only 4 columns
+        all_dfs: list of dataframes for previous versions + latest version including columns for all games
+        all_vnames: list of the names for the previous versions + latest version (for the Versions and Details tab dropdown)
+    '''
+    current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    print(f"Loading GitHub data.... at time = {current_time}")
+
+    uname = "kushal-10"
+    repo = "clembench-runs"
+    json_url = f"https://raw.githubusercontent.com/{uname}/{repo}/main/benchmark_runs.json"
+    resp = requests.get(json_url)
+    if resp.status_code == 200:
+        json_data = json.loads(resp.text)
+        versions = json_data['versions']
+        version_names = []
+        csv_url = f"https://raw.githubusercontent.com/{uname}/{repo}/main/"
+        for ver in versions:
+            version_names.append(ver['version'])
+            csv_path = ver['result_file'].split('/')[1:]
+            csv_path = '/'.join(csv_path)
+
+        # Sort by latest version
+        float_content = [float(s[1:]) for s in version_names]
+        float_content.sort(reverse=True)
+        version_names = ['v' + str(s) for s in float_content]
+
+        DFS = []
+        for version in version_names:
+            result_url = csv_url + version + '/' + csv_path
+            csv_response = requests.get(result_url)
+            if csv_response.status_code == 200:
+                df = pd.read_csv(StringIO(csv_response.text))
+                df = process_df(df)
+                df = df.sort_values(by=list(df.columns)[1], ascending=False)  # Sort by clemscore
+                DFS.append(df)
+            else:
+                print(f"Failed to read CSV file for version: {version}. Status Code: {csv_response.status_code}")
+
+        # Only keep relevant columns for the main leaderboard
+        latest_df_dummy = DFS[0]
+        all_columns = list(latest_df_dummy.columns)
+        keep_columns = all_columns[0:4]
+        latest_df_dummy = latest_df_dummy.drop(columns=[c for c in all_columns if c not in keep_columns])
+
+        latest_df = [latest_df_dummy]
+        all_dfs = []
+        all_vnames = []
+        for df, name in zip(DFS, version_names):
+            all_dfs.append(df)
+            all_vnames.append(name)
+        return latest_df, all_dfs, all_vnames
+
+    else:
+        print(f"Failed to read JSON file: Status Code: {resp.status_code}")
+
+def process_df(df: pd.DataFrame) -> pd.DataFrame:
+    '''
+    Process dataframe
+    - Remove repetition in model names
+    - Convert datatypes to sort by "float" instead of "str"
+    - Update column names
+    Args:
+        df: Unprocessed Dataframe (after using update_cols)
+    Returns:
+        df: Processed Dataframe
+    '''
+
+    # Change column type to float from str
+    list_column_names = list(df.columns)
+    model_col_name = list_column_names[0]
+    for col in list_column_names:
+        if col != model_col_name:
+            df[col] = df[col].astype(float)
+
+    # Remove repetition in model names, if any
+    models_list = []
+    for i in range(len(df)):
+        model_name = df.iloc[i][model_col_name]
+        splits = model_name.split('--')
+        splits = [split.replace('-t0.0', '') for split in splits]  # Comment out to keep -t0.0
+        if splits[0] == splits[1]:
+            models_list.append(splits[0])
+        else:
+            models_list.append(splits[0] + "--" + splits[1])
+    df[model_col_name] = models_list
+
+    # Update column names
+    update = ['Model', 'Clemscore', '% Played', 'Quality Score']
+    game_metrics = list_column_names[4:]
+
+    for col in game_metrics:
+        splits = col.split(',')
+        update.append(splits[0].capitalize() + splits[1])
+
+    map_cols = {}
+    for i in range(len(update)):
+        map_cols[list_column_names[i]] = str(update[i])
+
+    df = df.rename(columns=map_cols)
+    return df
+
+def filter_search(df: pd.DataFrame, query: str) -> pd.DataFrame:
+    '''
+    Filter the dataframe based on the search query
+    Args:
+        df: Unfiltered dataframe
+        query: a string of queries separated by ";"
+    Return:
+        filtered_df: Dataframe containing searched queries in the 'Model' column
+    '''
+    if query == "":
+        return df
+
+    queries = query.split(';')
+    list_cols = list(df.columns)
+    df_len = len(df)
+    filtered_models = []
+    models_list = list(df[list_cols[0]])
+    for q in queries:
+        q = q.lower()
+        q = q.strip()
+        for i in range(df_len):
+            model_name = models_list[i]
+            if q in model_name.lower():
+                filtered_models.append(model_name)  # Append model names containing query q
+
+    filtered_df = df[df[list_cols[0]].isin(filtered_models)]
+
+    return filtered_df
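
As a quick illustration of filter_search, which treats the query as a `;`-separated list of case-insensitive substring matches on the first (Model) column; the frame and scores below are made up for the example:

import pandas as pd
from src.leaderboard_utils import filter_search

# Toy frame with illustrative values; the real one comes from get_github_data()
df = pd.DataFrame({
    "Model": ["gpt-4-0613", "claude-2", "vicuna-13b"],
    "Clemscore": [60.0, 40.0, 12.0],
})

print(filter_search(df, "gpt; claude"))  # union of matches: gpt-4-0613, claude-2
print(filter_search(df, ""))             # empty query returns the frame unchanged
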
src/plot_utils.py
ADDED
@@ -0,0 +1,133 @@
+import pandas as pd
+import plotly.express as px
+
+from src.assets.text_content import SHORT_NAMES
+
+def plotly_plot(df: pd.DataFrame, LIST: list, ALL: list, NAMES: list, LEGEND: list):
+    '''
+    Takes in a list of models for a plotly plot
+    Args:
+        df: A dummy dataframe of the latest version
+        LIST: List of models to plot
+        ALL: Either [] or ["Select All Models"] - toggle view to plot all models
+        NAMES: Either [] or ["Show Names"] - toggle view to show model names on plot
+        LEGEND: Either [] or ["Show Legend"] - toggle view to show legend on plot
+    Returns:
+        fig: plotly figure
+    '''
+
+    # Get list of all models and append short names column to df
+    list_columns = list(df.columns)
+    ALL_LIST = list(df[list_columns[0]].unique())
+    short_names = label_map(ALL_LIST)
+    list_short_names = list(short_names.values())
+    df["Short"] = list_short_names
+
+    if ALL:
+        LIST = ALL_LIST
+    # Filter dataframe based on the provided list of models
+    df = df[df[list_columns[0]].isin(LIST)]
+
+    if NAMES:
+        fig = px.scatter(df, x=list_columns[2], y=list_columns[3], color=list_columns[0], symbol=list_columns[0],
+                         color_discrete_map={"category1": "blue", "category2": "red"},
+                         hover_name=list_columns[0], template="plotly_white", text="Short")
+        fig.update_traces(textposition='top center')
+    else:
+        fig = px.scatter(df, x=list_columns[2], y=list_columns[3], color=list_columns[0], symbol=list_columns[0],
+                         color_discrete_map={"category1": "blue", "category2": "red"},
+                         hover_name=list_columns[0], template="plotly_white")
+
+    if not LEGEND:
+        fig.update_layout(showlegend=False)
+
+    fig.update_layout(
+        xaxis_title='% Played',
+        yaxis_title='Quality Score',
+        title='Overview of benchmark results',
+        height=1000
+    )
+
+    fig.update_xaxes(range=[-5, 105])
+    fig.update_yaxes(range=[-5, 105])
+
+    return fig
+
+
+# ['Model', 'Clemscore', 'All(Played)', 'All(Quality Score)']
+def compare_plots(df: pd.DataFrame, LIST1: list, LIST2: list, ALL: list, NAMES: list, LEGEND: list):
+    '''
+    Quality Score v/s % Played plot by selecting models
+    Args:
+        df: A dummy dataframe of the latest version
+        LIST1: The list of open-weight models to show in the plot, updated from frontend
+        LIST2: The list of commercial models to show in the plot, updated from frontend
+        ALL: Either [] or ["Select All Models"] - toggle view to plot all models
+        NAMES: Either [] or ["Show Names"] - toggle view to show model names on plot
+        LEGEND: Either [] or ["Show Legend"] - toggle view to show legend on plot
+    Returns:
+        fig: The plot
+    '''
+
+    # Combine lists of open-weight and commercial models
+    LIST = LIST1 + LIST2
+    fig = plotly_plot(df, LIST, ALL, NAMES, LEGEND)
+
+    return fig
+
+def shorten_model_name(full_name):
+    # Split the name into parts
+    parts = full_name.split('-')
+
+    # Keep only the parts that contain digits (model sizes and versions)
+    short_name_parts = [part for part in parts if any(char.isdigit() for char in part)]
+
+    if len(parts) == 1:
+        short_name = ''.join(full_name[0:min(3, len(full_name))])
+    else:
+        # Join the parts to form the short name
+        short_name = '-'.join(short_name_parts)
+
+    # Prefix with the first letter and strip stray hyphens
+    short_name = full_name[0] + '-' + short_name.strip('-')
+
+    return short_name
+
+def label_map(model_list: list) -> dict:
+    '''
+    Generate a map from long names to short names, to plot them in the frontend graph
+    Define the short names in src/assets/text_content.py
+    Args:
+        model_list: A list of long model names
+    Returns:
+        short_names: A dict from long to short name
+    '''
+    short_names = {}
+    for model_name in model_list:
+        if model_name in SHORT_NAMES:
+            short_name = SHORT_NAMES[model_name]
+        else:
+            short_name = shorten_model_name(model_name)
+
+        # Map the long name to its short label
+        short_names[model_name] = short_name
+
+    return short_names
+
+def split_models(MODEL_LIST: list):
+    '''
+    Split the models into open-weight and commercial
+    '''
+    open_models = []
+    comm_models = []
+
+    for model in MODEL_LIST:
+        if model.startswith(('gpt-', 'claude-', 'command')):
+            comm_models.append(model)
+        else:
+            open_models.append(model)
+
+    open_models.sort(key=lambda o: o.upper())
+    comm_models.sort(key=lambda c: c.upper())
+    return open_models, comm_models
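
For example, split_models classifies models as commercial purely by name prefix (gpt-, claude-, command), and label_map falls back to shorten_model_name for names missing from SHORT_NAMES:

from src.plot_utils import split_models, label_map

models = ["gpt-4", "vicuna-13b", "claude-2", "zephyr-7b-beta", "command"]

open_models, comm_models = split_models(models)
# open_models -> ['vicuna-13b', 'zephyr-7b-beta']
# comm_models -> ['claude-2', 'command', 'gpt-4']

# All five names are in SHORT_NAMES, so the plot labels come straight from there
print(label_map(models))  # {'gpt-4': '4', 'vicuna-13b': 'vic-13b', ...}
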
src/reload.py
ADDED
@@ -0,0 +1,78 @@
+# Isolated functions to reload the leaderboard data and plot the results
+
+from src.leaderboard_utils import filter_search, get_github_data
+from src.plot_utils import split_models, compare_plots
+
+def get_primary_leaderboard():
+    '''
+    Returns
+        primary_leaderboard_df[0]: Dataframe containing the primary leaderboard (latest version of the benchmark results)
+    '''
+    print("Initializing Reload...........")
+
+    primary_leaderboard_df, _, _ = get_github_data()
+    print(primary_leaderboard_df)
+    return primary_leaderboard_df[0]
+
+def get_open_models():
+    '''
+    Returns
+        open_models: Checkbox group containing the open models
+    '''
+    primary_leaderboard_df, _, _ = get_github_data()
+    temp_df = primary_leaderboard_df[0]
+    models = list(temp_df[list(temp_df.columns)[0]].unique())
+    open_models, _ = split_models(models)
+    return open_models
+
+def get_closed_models():
+    '''
+    Returns
+        closed_models: Checkbox group containing the closed models
+    '''
+    primary_leaderboard_df, _, _ = get_github_data()
+    temp_df = primary_leaderboard_df[0]
+    models = list(temp_df[list(temp_df.columns)[0]].unique())
+    _, closed_models = split_models(models)
+    return closed_models
+
+def get_plot_df():
+    '''
+    Returns
+        plot_df: Dataframe containing the results of the latest version for plotting
+    '''
+    primary_leaderboard_df, _, _ = get_github_data()
+    plot_df = primary_leaderboard_df[0]
+    return plot_df
+
+def get_version_names():
+    '''
+    Returns
+        version_names: List containing the versions of the benchmark results for dropdown selection
+    '''
+    _, _, version_names = get_github_data()
+    return version_names
+
+def get_version_df():
+    '''
+    Returns
+        version_dfs: Dataframes containing the benchmark results for all versions
+    '''
+    _, version_dfs, _ = get_github_data()
+    return version_dfs
+
+def get_prev_df(name='initial'):
+    '''
+    Returns
+        prev_df: Dataframe containing the benchmark results for the selected version (default = latest version)
+    '''
+    _, version_dfs, version_names = get_github_data()
+
+    if name == 'initial':
+        name = version_names[0]
+
+    ind = version_names.index(name)
+    prev_df = version_dfs[ind]
+    return prev_df
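
Each helper above calls get_github_data() independently, so with every=reload_time each polled component triggers its own round of GitHub requests. If that ever becomes a concern, one option is a small time-based cache in front of the fetch; this is only a sketch, and the ttl value and cached_github_data name are illustrative, not part of this commit:

import time
from src.leaderboard_utils import get_github_data

_cache = {"data": None, "stamp": 0.0}

def cached_github_data(ttl: float = 60.0):
    # Hit GitHub at most once per `ttl` seconds; otherwise serve the cached tuple
    now = time.monotonic()
    if _cache["data"] is None or now - _cache["stamp"] > ttl:
        _cache["data"] = get_github_data()
        _cache["stamp"] = now
    return _cache["data"]
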
src/reload_utils.py
ADDED
@@ -0,0 +1,82 @@
+# Class-based helpers to reload the leaderboard data and plot the results
+
+from src.leaderboard_utils import filter_search, get_github_data
+from src.plot_utils import split_models, compare_plots
+
+# COMPONENTS TO RELOAD EVERY TIME:
+# leaderboard_table, dummy_leaderboard_table,
+# open_models_selection, closed_models_selection, show_all, show_names, show_legend,
+# version_select, prev_table, dummy_prev_table
+
+class ReloadData():
+    '''
+    A class containing methods to reload the leaderboard data and plot the results
+    The methods return individual component values directly, to use the 'every' arg in the component
+    '''
+
+    def __init__(self):
+        print("Initializing Reload...........")
+        self.primary_leaderboard_df, self.version_dfs, self.version_names = get_github_data()
+        self.plot_df = self.primary_leaderboard_df[0]
+        self.models = list(self.plot_df[list(self.plot_df.columns)[0]].unique())
+        print("Reload completed ....... Here's a reloaded dataframe for the latest version")
+        print(self.primary_leaderboard_df)
+
+    def get_primary_leaderboard(self):
+        '''
+        Returns
+            self.primary_leaderboard_df[0]: Dataframe containing the primary leaderboard (latest version of the benchmark results)
+        '''
+        return self.primary_leaderboard_df[0]
+
+    def get_open_models(self):
+        '''
+        Returns
+            open_models: Checkbox group containing the open models
+        '''
+        self.open_models, _ = split_models(self.models)
+        return self.open_models
+
+    def get_closed_models(self):
+        '''
+        Returns
+            closed_models: Checkbox group containing the closed models
+        '''
+        _, self.closed_models = split_models(self.models)
+        return self.closed_models
+
+    def get_plot_df(self):
+        '''
+        Returns
+            plot_df: Dataframe containing the results of the latest version for plotting
+        '''
+        return self.plot_df
+
+    def get_version_names(self):
+        '''
+        Returns
+            version_names: List containing the versions of the benchmark results for dropdown selection
+        '''
+        return self.version_names
+
+    def get_version_df(self):
+        '''
+        Returns
+            version_dfs: Dataframes containing the benchmark results for all versions
+        '''
+        return self.version_dfs
+
+    def get_prev_df(self, name='initial'):
+        '''
+        Returns
+            prev_df: Dataframe containing the benchmark results for the selected version (default = latest version)
+        '''
+        if name == 'initial':
+            name = self.version_names[0]
+
+        ind = self.version_names.index(name)
+        self.prev_df = self.version_dfs[ind]
+        return self.prev_df