Koshti10 committed on
Commit
9d6ff9a
•
1 Parent(s): f5ad77e

Upload 7 files

README.md CHANGED
@@ -1,12 +1,33 @@
1
  ---
2
- title: RELOADBOARD
3
- emoji: 😻
4
- colorFrom: indigo
5
- colorTo: gray
6
  sdk: gradio
7
- sdk_version: 4.24.0
8
  app_file: app.py
9
  pinned: false
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
  ---
2
+ title: Clembench
3
+ emoji: 🏆
4
+ colorFrom: yellow
5
+ colorTo: green
6
  sdk: gradio
7
+ sdk_version: 4.3.0
8
  app_file: app.py
9
  pinned: false
10
  ---
11
 
12
+ ["clembench: Using Game Play to Evaluate Chat-Optimized Language Models as Conversational Agents"](https://aclanthology.org/2023.emnlp-main.689/)
13
+
14
+
15
+ ```
16
+ @inproceedings{chalamalasetti-etal-2023-clembench,
17
+ title = "clembench: Using Game Play to Evaluate Chat-Optimized Language Models as Conversational Agents",
18
+ author = {Chalamalasetti, Kranti and
19
+ G{\"o}tze, Jana and
20
+ Hakimov, Sherzod and
21
+ Madureira, Brielen and
22
+ Sadler, Philipp and
23
+ Schlangen, David},
24
+ booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing",
25
+ month = dec,
26
+ year = "2023",
27
+ address = "Singapore",
28
+ publisher = "Association for Computational Linguistics",
29
+ url = "https://aclanthology.org/2023.emnlp-main.689",
30
+ pages = "11174--11219"
31
+ }
32
+
33
+ ```
app.py CHANGED
@@ -1,41 +1,76 @@
1
  import gradio as gr
 
 
 
 
2
 
3
- from src.assets.text_content import TITLE, INTRODUCTION_TEXT
4
- from src.leaderboard_utils import filter_search, get_github_data
5
- from src.plot_utils import split_models, compare_plots
 
 
6
 
7
- # from src.reload_utils import ReloadData
8
- from src.reload import get_primary_leaderboard, get_open_models, get_closed_models, get_plot_df, get_version_names, get_version_df, get_prev_df
 
 
 
 
 
9
 
10
- reload_time = 5
11
 
12
- # # For Leaderboards
13
- # # Get CSV data
14
- # global primary_leaderboard_df, version_dfs, version_names
15
- # primary_leaderboard_df, version_dfs, version_names = get_github_data()
 
16
 
17
- # global prev_df
18
- # prev_df = version_dfs[0]
19
- # def select_prev_df(name):
20
- # ind = version_names.index(name)
21
- # prev_df = version_dfs[ind]
22
- # return prev_df
23
 
24
- # # For Plots
25
- # global plot_df, OPEN_MODELS, CLOSED_MODELS
26
- # plot_df = primary_leaderboard_df[0]
27
- # MODELS = list(plot_df[list(plot_df.columns)[0]].unique())
28
- # OPEN_MODELS, CLOSED_MODELS = split_models(MODELS)
29
30
 
31
- # MAIN APPLICATION s
32
- main_app = gr.Blocks()
33
- with main_app:
34
  gr.HTML(TITLE)
35
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
36
 
37
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
38
- with gr.TabItem("🥇 CLEM Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
 
 
 
39
  with gr.Row():
40
  search_bar = gr.Textbox(
41
  placeholder=" 🔍 Search for models - separate multiple queries with `;` and press ENTER...",
@@ -43,51 +78,104 @@ with main_app:
43
  elem_id="search-bar",
44
  )
45
 
46
- leaderboard_table = gr.DataFrame(
47
- value=get_primary_leaderboard,
48
- elem_id="leaderboard-table",
49
  interactive=False,
50
  visible=True,
51
- every=reload_time
52
- )
 
 
 
 
53
 
54
- # Add a dummy leaderboard to handle search queries from the primary_leaderboard_df and not update primary_leaderboard_df
 
55
  dummy_leaderboard_table = gr.Dataframe(
56
- value=get_primary_leaderboard,
57
- elem_id="leaderboard-table",
58
  interactive=False,
59
- visible=False,
60
- every=reload_time
61
  )
62
-
 
63
  search_bar.submit(
64
- filter_search,
65
  [dummy_leaderboard_table, search_bar],
66
  leaderboard_table,
67
  queue=True
68
  )
69
 
70
- with gr.TabItem("📈 Plot", id=3):
 
 
 
71
  with gr.Row():
72
- open_models_selection = gr.CheckboxGroup(
73
- choices=get_open_models(),
74
- label="Open-weight Models 🌐",
75
- value=[],
76
- elem_id="value-select",
77
- interactive=True,
78
- every=reload_time
79
  )
80
 
81
- with gr.Row():
82
- closed_models_selection = gr.CheckboxGroup(
83
- choices=get_closed_models(),
84
- label="Closed-weight Models 💼",
85
- value=[],
86
- elem_id="value-select-2",
87
- interactive=True,
88
- every=reload_time
89
- )
90
-
91
  with gr.Row():
92
  with gr.Column():
93
  show_all = gr.CheckboxGroup(
@@ -97,30 +185,42 @@ with main_app:
97
  elem_id="value-select-3",
98
  interactive=True,
99
  )
100
-
101
  with gr.Column():
102
  show_names = gr.CheckboxGroup(
103
  ["Show Names"],
104
- label ="Show names of models on the plot 🏷️",
105
  value=[],
106
  elem_id="value-select-4",
107
  interactive=True,
108
- )
109
 
110
  with gr.Column():
111
  show_legend = gr.CheckboxGroup(
112
  ["Show Legend"],
113
- label ="Show legend on the plot 💡",
114
  value=[],
115
  elem_id="value-select-5",
116
  interactive=True,
117
- )
118
 
 
 
 
 
 
119
  with gr.Row():
120
  dummy_plot_df = gr.DataFrame(
121
- value=get_plot_df,
122
- visible=False,
123
- every=reload_time
124
  )
125
 
126
  with gr.Row():
@@ -128,88 +228,184 @@ with main_app:
128
  # Output block for the plot
129
  plot_output = gr.Plot()
130
 
 
 
 
 
131
  open_models_selection.change(
132
- compare_plots,
133
- [dummy_plot_df, open_models_selection, closed_models_selection, show_all, show_names, show_legend],
134
- plot_output,
 
135
  queue=True
136
  )
137
 
138
  closed_models_selection.change(
139
- compare_plots,
140
- [dummy_plot_df, open_models_selection, closed_models_selection, show_all, show_names, show_legend],
141
- plot_output,
 
142
  queue=True
143
  )
144
-
145
  show_all.change(
146
- compare_plots,
147
- [dummy_plot_df, open_models_selection, closed_models_selection, show_all, show_names, show_legend],
148
- plot_output,
 
149
  queue=True
150
  )
151
 
152
  show_names.change(
153
- compare_plots,
154
- [dummy_plot_df, open_models_selection, closed_models_selection, show_all, show_names, show_legend],
155
- plot_output,
 
156
  queue=True
157
  )
158
 
159
  show_legend.change(
160
- compare_plots,
161
- [dummy_plot_df, open_models_selection, closed_models_selection, show_all, show_names, show_legend],
162
- plot_output,
163
  queue=True
164
  )
165
 
166
- with gr.TabItem("🔄 Versions and Details", elem_id="details", id=2):
 
 
 
167
  with gr.Row():
168
  version_select = gr.Dropdown(
169
- choices=get_version_names(),
170
- label="Select Version 🕹️",
171
- value=get_version_names()[0],
172
- every=reload_time
173
  )
174
  with gr.Row():
175
  search_bar_prev = gr.Textbox(
176
  placeholder=" 🔍 Search for models - separate multiple queries with `;` and press ENTER...",
177
  show_label=False,
178
- elem_id="search-bar-2",
179
  )
180
 
181
  prev_table = gr.Dataframe(
182
- value=get_prev_df,
183
- elem_id="leaderboard-table",
184
  interactive=False,
185
  visible=True,
186
- every=reload_time
187
  )
188
 
189
  dummy_prev_table = gr.Dataframe(
190
- value=get_prev_df,
191
- elem_id="leaderboard-table",
192
  interactive=False,
193
- visible=False,
194
- every=reload_time
195
  )
196
 
 
 
 
197
  search_bar_prev.submit(
198
- filter_search,
199
  [dummy_prev_table, search_bar_prev],
200
  prev_table,
201
  queue=True
202
  )
203
 
204
  version_select.change(
205
- get_prev_df,
206
  [version_select],
207
  prev_table,
208
- queue=True,
209
- every=reload_time
210
  )
211
- main_app.load()
212
 
213
- main_app.queue()
214
 
215
- main_app.launch()
 
1
  import gradio as gr
2
+ import os
3
+ from apscheduler.schedulers.background import BackgroundScheduler
4
+ from huggingface_hub import HfApi
5
+ from datetime import datetime, timedelta
6
 
7
+ from src.assets.text_content import TITLE, INTRODUCTION_TEXT, CLEMSCORE_TEXT, MULTIMODAL_NAME, TEXT_NAME, HF_REPO
8
+ from src.leaderboard_utils import query_search, get_github_data
9
+ from src.plot_utils import split_models, plotly_plot, get_plot_df, update_open_models, update_closed_models
10
+ from src.plot_utils import reset_show_all, reset_show_names, reset_show_legend, reset_mobile_view
11
+ from src.version_utils import get_versions_data
12
 
13
+ """
14
+ CONSTANTS
15
+ """
16
+ # For restarting the gradio application
17
+ TIME = 200  # in seconds; the auto-reload requires an HF token and will not work locally, but the app still launches as expected without it
18
+ # For Leaderboard table
19
+ dataframe_height = 800  # Height of the table in pixels, set as a reasonable average across devices
20
 
 
21
 
22
+ """
23
+ AUTO RESTART HF SPACE
24
+ """
25
+ HF_TOKEN = os.environ.get("H4_TOKEN", None)
26
+ api = HfApi()
27
 
28
+ def restart_space():
29
+ api.restart_space(repo_id=HF_REPO, token=HF_TOKEN)
 
 
 
 
30
 
 
 
 
 
 
31
 
32
+ """
33
+ GITHUB UTILS
34
+ """
35
+ github_data = get_github_data()
36
+ text_leaderboard = github_data["text"][0]  # Text-only leaderboard for the latest available version
37
+ multimodal_leaderboard = github_data["multimodal"][0]  # Multimodal leaderboard for the latest available version
38
+
39
+ # Show only the first 4 columns of the leaderboards
40
+ text_leaderboard = text_leaderboard.iloc[:, :4]
41
+ print(f"Showing the following columns for the latest leaderboard: {text_leaderboard.columns}")
42
+ multimodal_leaderboard = multimodal_leaderboard.iloc[:, :4]
43
+ print(f"Showing the following columns for the multimodal leaderboard: {multimodal_leaderboard.columns}")
44
+
45
+
46
+ """
47
+ VERSIONS UTILS
48
+ """
49
+ versions_data = get_versions_data()
50
+ latest_version = versions_data['latest'] # Always show latest version in text-only benchmark
51
+ last_updated_date = versions_data['date']
52
+ version_names = list(versions_data.keys())
53
+ version_names = [v for v in version_names if v.startswith("v")] # Remove "latest" and "date" keys
54
+
55
+ global version_df
56
+ version_df = versions_data[latest_version]
57
+ def select_version_df(name):
58
+ return versions_data[name]
59
+
60
+ """
61
+ MAIN APPLICATION
62
+ """
63
+ hf_app = gr.Blocks()
64
+ with hf_app:
65
 
 
 
 
66
  gr.HTML(TITLE)
67
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
68
 
69
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
70
+ """
71
+ ####################### FIRST TAB - TEXT-LEADERBOARD #######################
72
+ """
73
+ with gr.TabItem(TEXT_NAME, elem_id="llm-benchmark-tab-table", id=0):
74
  with gr.Row():
75
  search_bar = gr.Textbox(
76
  placeholder=" 🔍 Search for models - separate multiple queries with `;` and press ENTER...",
 
78
  elem_id="search-bar",
79
  )
80
 
81
+ leaderboard_table = gr.Dataframe(
82
+ value=text_leaderboard,
83
+ elem_id="text-leaderboard-table",
84
  interactive=False,
85
  visible=True,
86
+ height=dataframe_height
87
+ )
88
+
89
+ # Show information about the clemscore and last updated date below the table
90
+ gr.HTML(CLEMSCORE_TEXT)
91
+ gr.HTML(f"Last updated - {github_data['date']}")
92
 
93
+ # Add a dummy leaderboard to handle search queries in leaderboard_table
94
+ # This will show a temporary leaderboard based on the searched value
95
  dummy_leaderboard_table = gr.Dataframe(
96
+ value=text_leaderboard,
97
+ elem_id="text-leaderboard-table-dummy",
98
  interactive=False,
99
+ visible=False
 
100
  )
101
+
102
+ # Action after submitting a query to the search bar
103
  search_bar.submit(
104
+ query_search,
105
  [dummy_leaderboard_table, search_bar],
106
  leaderboard_table,
107
  queue=True
108
  )
109
 
110
+ """
111
+ ####################### SECOND TAB - MULTIMODAL LEADERBOARD #######################
112
+ """
113
+ with gr.TabItem(MULTIMODAL_NAME, elem_id="mm-llm-benchmark-tab-table", id=1):
114
  with gr.Row():
115
+ mm_search_bar = gr.Textbox(
116
+ placeholder=" 🔍 Search for models - separate multiple queries with `;` and press ENTER...",
117
+ show_label=False,
118
+ elem_id="search-bar",
 
 
 
119
  )
120
 
121
+ mm_leaderboard_table = gr.Dataframe(
122
+ value=multimodal_leaderboard,
123
+ elem_id="mm-leaderboard-table",
124
+ interactive=False,
125
+ visible=True,
126
+ height=dataframe_height
127
+ )
128
+
129
+ # Show information about the clemscore and last updated date below the table
130
+ gr.HTML(CLEMSCORE_TEXT)
131
+ gr.HTML(f"Last updated - {github_data['date']}")
132
+
133
+ # Add a dummy leaderboard to handle search queries in leaderboard_table
134
+ # This will show a temporary leaderboard based on the searched value
135
+ mm_dummy_leaderboard_table = gr.Dataframe(
136
+ value=multimodal_leaderboard,
137
+ elem_id="mm-leaderboard-table-dummy",
138
+ interactive=False,
139
+ visible=False
140
+ )
141
+
142
+ # Action after submitting a query to the search bar
143
+ mm_search_bar.submit(
144
+ query_search,
145
+ [mm_dummy_leaderboard_table, mm_search_bar],
146
+ mm_leaderboard_table,
147
+ queue=True
148
+ )
149
+
150
+ """
151
+ ####################### THIRD TAB - PLOTS - %PLAYED V/S QUALITY SCORE #######################
152
+ """
153
+ with gr.TabItem("📈 Plots", elem_id="plots", id=2):
154
+ """
155
+ DropDown Select for Text/Multimodal Leaderboard
156
+ """
157
+ leaderboard_selection = gr.Dropdown(
158
+ choices=[TEXT_NAME, MULTIMODAL_NAME],
159
+ value=TEXT_NAME,
160
+ label="Select Leaderboard 🎖️🔽",
161
+ elem_id="value-select-0",
162
+ interactive=True
163
+ )
164
+
165
+ """
166
+ Accordion Groups to select individual models - Hidden by default
167
+ """
168
+ with gr.Accordion("Select Open-weight Models 🌐", open=False):
169
+ open_models_selection = update_open_models()
170
+ clear_button_1 = gr.ClearButton(open_models_selection)
171
+
172
+ with gr.Accordion("Select Closed-weight Models 💼", open=False):
173
+ closed_models_selection = update_closed_models()
174
+ clear_button_2 = gr.ClearButton(closed_models_selection)
175
+
176
+ """
177
+ Checkbox group to control the layout of the plot
178
+ """
179
  with gr.Row():
180
  with gr.Column():
181
  show_all = gr.CheckboxGroup(
 
185
  elem_id="value-select-3",
186
  interactive=True,
187
  )
188
+
189
  with gr.Column():
190
  show_names = gr.CheckboxGroup(
191
  ["Show Names"],
192
+ label="Show names of models on the plot 🏷️",
193
  value=[],
194
  elem_id="value-select-4",
195
  interactive=True,
196
+ )
197
 
198
  with gr.Column():
199
  show_legend = gr.CheckboxGroup(
200
  ["Show Legend"],
201
+ label="Show legend on the plot 💡",
202
  value=[],
203
  elem_id="value-select-5",
204
  interactive=True,
205
+ )
206
+ with gr.Column():
207
+ mobile_view = gr.CheckboxGroup(
208
+ ["Mobile View"],
209
+ label="View plot on smaller screens 📱",
210
+ value=[],
211
+ elem_id="value-select-6",
212
+ interactive=True,
213
+ )
214
 
215
+ """
216
+ PLOT BLOCK
217
+ """
218
+ # Create a dummy DataFrame as an input to the plotly_plot function.
219
+ # plotly_plot uses this data to plot % played vs. quality score
220
  with gr.Row():
221
  dummy_plot_df = gr.DataFrame(
222
+ value=get_plot_df(),
223
+ visible=False
 
224
  )
225
 
226
  with gr.Row():
 
228
  # Output block for the plot
229
  plot_output = gr.Plot()
230
 
231
+ """
232
+ PLOT CHANGE ACTIONS
233
+ Redraw the plot whenever a model selection or layout toggle changes
234
+ """
235
  open_models_selection.change(
236
+ plotly_plot,
237
+ [dummy_plot_df, open_models_selection, closed_models_selection, show_all, show_names, show_legend,
238
+ mobile_view],
239
+ [plot_output],
240
  queue=True
241
  )
242
 
243
  closed_models_selection.change(
244
+ plotly_plot,
245
+ [dummy_plot_df, open_models_selection, closed_models_selection, show_all, show_names, show_legend,
246
+ mobile_view],
247
+ [plot_output],
248
  queue=True
249
  )
250
+
251
  show_all.change(
252
+ plotly_plot,
253
+ [dummy_plot_df, open_models_selection, closed_models_selection, show_all, show_names, show_legend,
254
+ mobile_view],
255
+ [plot_output],
256
  queue=True
257
  )
258
 
259
  show_names.change(
260
+ plotly_plot,
261
+ [dummy_plot_df, open_models_selection, closed_models_selection, show_all, show_names, show_legend,
262
+ mobile_view],
263
+ [plot_output],
264
  queue=True
265
  )
266
 
267
  show_legend.change(
268
+ plotly_plot,
269
+ [dummy_plot_df, open_models_selection, closed_models_selection, show_all, show_names, show_legend,
270
+ mobile_view],
271
+ [plot_output],
272
+ queue=True
273
+ )
274
+
275
+ mobile_view.change(
276
+ plotly_plot,
277
+ [dummy_plot_df, open_models_selection, closed_models_selection, show_all, show_names, show_legend,
278
+ mobile_view],
279
+ [plot_output],
280
+ queue=True
281
+ )
282
+ """
283
+ LEADERBOARD SELECT CHANGE ACTIONS
284
+ Update Checkbox Groups and Dummy DF based on the leaderboard selected
285
+ """
286
+ leaderboard_selection.change(
287
+ update_open_models,
288
+ [leaderboard_selection],
289
+ [open_models_selection],
290
+ queue=True
291
+ )
292
+
293
+ leaderboard_selection.change(
294
+ update_closed_models,
295
+ [leaderboard_selection],
296
+ [closed_models_selection],
297
+ queue=True
298
+ )
299
+
300
+ leaderboard_selection.change(
301
+ get_plot_df,
302
+ [leaderboard_selection],
303
+ [dummy_plot_df],
304
+ queue=True
305
+ )
306
+
307
+ # Reset plot controls when the leaderboard or model selection changes
308
+ leaderboard_selection.change(
309
+ reset_show_all,
310
+ outputs=[show_all],
311
+ queue=True
312
+ )
313
+
314
+ open_models_selection.change(
315
+ reset_show_all,
316
+ outputs=[show_all],
317
+ queue=True
318
+ )
319
+
320
+ closed_models_selection.change(
321
+ reset_show_all,
322
+ outputs=[show_all],
323
+ queue=True
324
+ )
325
+
326
+ leaderboard_selection.change(
327
+ reset_show_names,
328
+ outputs=[show_names],
329
+ queue=True
330
+ )
331
+
332
+ leaderboard_selection.change(
333
+ reset_show_legend,
334
+ outputs=[show_legend],
335
+ queue=True
336
+ )
337
+
338
+ leaderboard_selection.change(
339
+ reset_mobile_view,
340
+ outputs=[mobile_view],
341
  queue=True
342
  )
343
 
344
+ """
345
+ ####################### FOURTH TAB - VERSIONS AND DETAILS #######################
346
+ """
347
+ with gr.TabItem("🔄 Versions and Details", elem_id="versions-details-tab", id=3):
348
  with gr.Row():
349
  version_select = gr.Dropdown(
350
+ version_names, label="Select Version 🕹️", value=latest_version
 
 
 
351
  )
352
  with gr.Row():
353
  search_bar_prev = gr.Textbox(
354
  placeholder=" 🔍 Search for models - separate multiple queries with `;` and press ENTER...",
355
  show_label=False,
356
+ elem_id="search-bar-3",
357
  )
358
 
359
  prev_table = gr.Dataframe(
360
+ value=version_df,
361
+ elem_id="version-leaderboard-table",
362
  interactive=False,
363
  visible=True,
364
+ height=dataframe_height
365
  )
366
 
367
  dummy_prev_table = gr.Dataframe(
368
+ value=version_df,
369
+ elem_id="version-dummy-leaderboard-table",
370
  interactive=False,
371
+ visible=False
 
372
  )
373
 
374
+ gr.HTML(CLEMSCORE_TEXT)
375
+ gr.HTML(f"Last updated - {last_updated_date}")
376
+
377
  search_bar_prev.submit(
378
+ query_search,
379
  [dummy_prev_table, search_bar_prev],
380
  prev_table,
381
  queue=True
382
  )
383
 
384
  version_select.change(
385
+ select_version_df,
386
  [version_select],
387
  prev_table,
388
+ queue=True
 
389
  )
 
390
 
391
+ # Update Dummy Leaderboard, when changing versions
392
+ version_select.change(
393
+ select_version_df,
394
+ [version_select],
395
+ dummy_prev_table,
396
+ queue=True
397
+ )
398
+
399
+ hf_app.load()
400
+ hf_app.queue()
401
+
402
+ # Schedule an auto-restart of the HF Space every TIME seconds so that every component reloads fresh data
403
+ scheduler = BackgroundScheduler()
404
+ scheduler.add_job(restart_space, 'interval', seconds=TIME)
405
+ scheduler.start()
406
+
407
+ # Log current start time and scheduled restart time
408
+ print(datetime.now())
409
+ print(f"Scheduled restart at {datetime.now() + timedelta(seconds=TIME)}")
410
 
411
+ hf_app.launch()
requirements.txt CHANGED
@@ -1,3 +1,5 @@
1
- gradio==3.43.2
2
  pandas==2.0.0
3
- plotly==5.18.0
 
 
 
1
+ gradio==4.36.1
2
  pandas==2.0.0
3
+ plotly==5.18.0
4
+ apscheduler==3.10.4
5
+ huggingface_hub==0.23.4
src/assets/text_content.py CHANGED
@@ -1,16 +1,29 @@
1
  TITLE = """<h1 align="center" id="space-title"> 🏆 CLEM Leaderboard</h1>"""
2
 
3
  INTRODUCTION_TEXT = """
4
  <h6 align="center">
5
  The CLEM Leaderboard aims to track, rank and evaluate current cLLMs (chat-optimized Large Language Models) with the suggested pronunciation “clems”.
6
 
7
- The benchmarking approach is described in [Clembench: Using Game Play to Evaluate Chat-Optimized Language Models as Conversational Agents](https://arxiv.org/abs/2305.13455).
8
 
9
  Source code for benchmarking "clems" is available here: [Clembench](https://github.com/clembench/clembench)
10
 
11
  All generated files and results from the benchmark runs are available here: [clembench-runs](https://github.com/clembench/clembench-runs) </h6>
12
  """
13
 
 
 
 
 
14
  SHORT_NAMES = {
15
  "t0.0": "",
16
  "claude-v1.3": "cl-1.3",
@@ -48,6 +61,6 @@ SHORT_NAMES = {
48
  "vicuna-7b-v1.5": "vic-7b-v1.5",
49
  "vicuna-13b-v1.5": "vic-13b-v1.5",
50
  "gpt4all-13b-snoozy": "g4a-13b-s",
51
- "zephyr-7b-alpha":"z-7b-a",
52
- "zephyr-7b-beta":"z-7b-b"
53
  }
 
1
  TITLE = """<h1 align="center" id="space-title"> 🏆 CLEM Leaderboard</h1>"""
2
 
3
+ REPO = "https://raw.githubusercontent.com/kushal-10/clembench-runs/check/website/"
4
+ HF_REPO = "Koshti10/leaderboard"
5
+
6
+ # REPO = "https://raw.githubusercontent.com/clembench/clembench-runs/main/"
7
+ # HF_REPO = "colab-potsdam/clem-leaderboard"
8
+
9
+ TEXT_NAME = "🥇 CLEM Leaderboard"
10
+ MULTIMODAL_NAME = "🥇 Multimodal CLEM Leaderboard"
11
+
12
  INTRODUCTION_TEXT = """
13
  <h6 align="center">
14
  The CLEM Leaderboard aims to track, rank and evaluate current cLLMs (chat-optimized Large Language Models) with the suggested pronunciation “clems”.
15
 
16
+ The benchmarking approach is described in [Clembench: Using Game Play to Evaluate Chat-Optimized Language Models as Conversational Agents](https://aclanthology.org/2023.emnlp-main.689.pdf).
17
 
18
  Source code for benchmarking "clems" is available here: [Clembench](https://github.com/clembench/clembench)
19
 
20
  All generated files and results from the benchmark runs are available here: [clembench-runs](https://github.com/clembench/clembench-runs) </h6>
21
  """
22
 
23
+ CLEMSCORE_TEXT = """
24
+ The <i>clemscore</i> combines a score representing the overall ability to follow the game instructions (separately scored in the field <i>Played</i>) with the quality of the play in the attempts where instructions were followed (field <i>Quality Scores</i>). For details about the games / interaction settings, and for results on older versions of the benchmark, see the tab <i>Versions and Details</i>.
25
+ """
26
+
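For intuition, here is a minimal sketch of how the clemscore could combine the two fields described above, assuming the definition from the clembench paper (fraction of episodes played times the macro-average quality score); the function name and scaling are illustrative, not part of this commit:

```python
def clemscore(percent_played: float, quality_score: float) -> float:
    """Combine instruction-following (Played) with quality of play (Quality Scores).

    percent_played: share of episodes played to completion, in [0, 100].
    quality_score: macro-average quality over the played episodes, in [0, 100].
    """
    return (percent_played / 100) * quality_score

print(clemscore(80, 75))  # -> 60.0: plays 80% of episodes at an average quality of 75
```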
27
  SHORT_NAMES = {
28
  "t0.0": "",
29
  "claude-v1.3": "cl-1.3",
 
61
  "vicuna-7b-v1.5": "vic-7b-v1.5",
62
  "vicuna-13b-v1.5": "vic-13b-v1.5",
63
  "gpt4all-13b-snoozy": "g4a-13b-s",
64
+ "zephyr-7b-alpha": "z-7b-a",
65
+ "zephyr-7b-beta": "z-7b-b"
66
  }
src/leaderboard_utils.py CHANGED
@@ -1,142 +1,139 @@
1
  import os
2
  import pandas as pd
3
- import requests, json
 
4
  from io import StringIO
5
-
6
  from datetime import datetime
7
 
 
8
 
9
  def get_github_data():
10
- current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
11
- print(f"LOADING GITHUB DATAAA.... at time = {current_time}")
12
- '''
13
- Get data from csv files on Github
14
- Args:
15
- None
16
- Returns:
17
- latest_df: singular list containing dataframe of the latest version of the leaderboard with only 4 columns
18
- all_dfs: list of dataframes for previous versions + latest version including columns for all games
19
- all_vnames: list of the names for the previous versions + latest version (For Details and Versions Tab Dropdown)
20
- '''
21
- uname = "kushal-10"
22
- repo = "clembench-runs"
23
- json_url = f"https://raw.githubusercontent.com/{uname}/{repo}/main/benchmark_runs.json"
24
- resp = requests.get(json_url)
25
- if resp.status_code == 200:
26
- json_data = json.loads(resp.text)
27
- versions = json_data['versions']
28
- version_names = []
29
- csv_url = f"https://raw.githubusercontent.com/{uname}/{repo}/main/"
30
- for ver in versions:
31
- version_names.append(ver['version'])
32
- csv_path = ver['result_file'].split('/')[1:]
33
- csv_path = '/'.join(csv_path)
34
-
35
- #Sort by latest version
36
- float_content = [float(s[1:]) for s in version_names]
37
- float_content.sort(reverse=True)
38
- version_names = ['v'+str(s) for s in float_content]
39
-
40
- DFS = []
41
- for version in version_names:
42
- result_url = csv_url+ version + '/' + csv_path
43
- csv_response = requests.get(result_url)
44
- if csv_response.status_code == 200:
45
- df = pd.read_csv(StringIO(csv_response.text))
46
  df = process_df(df)
47
- df = df.sort_values(by=list(df.columns)[1], ascending=False) # Sort by clemscore
48
- DFS.append(df)
49
- else:
50
- print(f"Failed to read CSV file for version : {version}. Status Code : {resp.status_code}")
51
-
52
- # Only keep relavant columns for the main leaderboard
53
- latest_df_dummy = DFS[0]
54
- all_columns = list(latest_df_dummy.columns)
55
- keep_columns = all_columns[0:4]
56
- latest_df_dummy = latest_df_dummy.drop(columns=[c for c in all_columns if c not in keep_columns])
57
-
58
- latest_df = [latest_df_dummy]
59
- all_dfs = []
60
- all_vnames = []
61
- for df, name in zip(DFS, version_names):
62
- all_dfs.append(df)
63
- all_vnames.append(name)
64
- return latest_df, all_dfs, all_vnames
65
-
66
- else:
67
- print(f"Failed to read JSON file: Status Code : {resp.status_code}")
68
 
69
  def process_df(df: pd.DataFrame) -> pd.DataFrame:
70
- '''
71
- Process dataframe
72
- - Remove repition in model names
73
- - Convert datatypes to sort by "float" instead of "str" for sorting
74
  - Update column names
 
75
  Args:
76
  df: Unprocessed Dataframe (after using update_cols)
 
77
  Returns:
78
  df: Processed Dataframe
79
- '''
80
-
81
- # Change column type to float from str
82
- list_column_names = list(df.columns)
83
- model_col_name = list_column_names[0]
84
- for col in list_column_names:
85
- if col != model_col_name:
86
- df[col] = df[col].astype(float)
87
-
88
- # Remove repetition in model names, if any
89
- models_list = []
90
- for i in range(len(df)):
91
- model_name = df.iloc[i][model_col_name]
92
- splits = model_name.split('--')
93
- splits = [split.replace('-t0.0', '') for split in splits] # Comment to not remove -t0.0
94
- if splits[0] == splits[1]:
95
- models_list.append(splits[0])
96
- else:
97
- models_list.append(splits[0] + "--" + splits[1])
98
- df[model_col_name] = models_list
99
 
100
  # Update column names
101
- update = ['Model', 'Clemscore', '% Played', 'Quality Score']
102
- game_metrics = list_column_names[4:]
103
-
104
- for col in game_metrics:
105
- splits = col.split(',')
106
- update.append(splits[0].capitalize() + "" + splits[1])
107
-
108
- map_cols = {}
109
- for i in range(len(update)):
110
- map_cols[list_column_names[i]] = str(update[i])
111
-
112
- df = df.rename(columns=map_cols)
113
  return df
114
 
115
- def filter_search(df: pd.DataFrame, query: str) -> pd.DataFrame:
116
- '''
117
- Filter the dataframe based on the search query
 
 
118
  Args:
119
- df: Unfiltered dataframe
120
- query: a string of queries separated by ";"
121
- Return:
122
- filtered_df: Dataframe containing searched queries in the 'Model' column
123
- '''
124
- queries = query.split(';')
125
- list_cols = list(df.columns)
126
- df_len = len(df)
127
- filtered_models = []
128
- models_list = list(df[list_cols[0]])
129
- for q in queries:
130
- q = q.lower()
131
- q = q.strip()
132
- for i in range(df_len):
133
- model_name = models_list[i]
134
- if q in model_name.lower():
135
- filtered_models.append(model_name) # Append model names containing query q
136
-
137
- filtered_df = df[df[list_cols[0]].isin(filtered_models)]
138
-
139
- if query == "":
140
  return df
141
 
142
- return filtered_df
 
 
 
 
 
 
1
  import os
2
  import pandas as pd
3
+ import requests
4
+ import json
5
  from io import StringIO
 
6
  from datetime import datetime
7
 
8
+ from src.assets.text_content import REPO
9
 
10
  def get_github_data():
11
+ """
12
+ Read and process data from CSV files hosted on GitHub. - https://github.com/clembench/clembench-runs
13
+
14
+ Returns:
15
+ github_data (dict): Dictionary containing:
16
+ - "text": List of DataFrames for each version's textual leaderboard data.
17
+ - "multimodal": List of DataFrames for each version's multimodal leaderboard data.
18
+ - "date": Formatted date of the latest version in "DD Mon YYYY" format.
19
+ """
20
+ base_repo = REPO
21
+ json_url = base_repo + "benchmark_runs.json"
22
+ response = requests.get(json_url)
23
+
24
+ # Check if the JSON file request was successful
25
+ if response.status_code != 200:
26
+ print(f"Failed to read JSON file: Status Code: {response.status_code}")
27
+ return None
28
+
29
+ json_data = response.json()
30
+ versions = json_data['versions']
31
+
32
+ # Sort version names - latest first
33
+ version_names = sorted(
34
+ [ver['version'] for ver in versions],
35
+ key=lambda v: float(v[1:]),
36
+ reverse=True
37
+ )
38
+ print(f"Found {len(version_names)} versions from get_github_data(): {version_names}.")
39
+
40
+ # Get Last updated date of the latest version
41
+ latest_version = version_names[0]
42
+ latest_date = next(
43
+ ver['date'] for ver in versions if ver['version'] == latest_version
44
+ )
45
+ formatted_date = datetime.strptime(latest_date, "%Y/%m/%d").strftime("%d %b %Y")
46
+
47
+ # Get Leaderboard data - for text-only + multimodal
48
+ github_data = {}
49
+
50
+ # Collect Dataframes
51
+ text_dfs = []
52
+ mm_dfs = []
53
+
54
+ for version in version_names:
55
+ # Collect CSV data in descending order of clembench-runs versions
56
+ # Collect Text-only data
57
+ text_url = f"{base_repo}{version}/results.csv"
58
+ csv_response = requests.get(text_url)
59
+ if csv_response.status_code == 200:
60
+ df = pd.read_csv(StringIO(csv_response.text))
61
+ df = process_df(df)
62
+ df = df.sort_values(by=df.columns[1], ascending=False) # Sort by clemscore column
63
+ text_dfs.append(df)
64
+ else:
65
+ print(f"Failed to read Text-only leaderboard CSV file for version: {version}. Status Code: {csv_response.status_code}")
66
+
67
+ # Collect Multimodal data
68
+ if float(version[1:]) >= 1.6:
69
+ mm_url = f"{base_repo}{version}_multimodal/results.csv"
70
+ mm_response = requests.get(mm_url)
71
+ if mm_response.status_code == 200:
72
+ df = pd.read_csv(StringIO(mm_response.text))
73
  df = process_df(df)
74
+ df = df.sort_values(by=df.columns[1], ascending=False) # Sort by clemscore column
75
+ mm_dfs.append(df)
76
+ else:
77
+ print(f"Failed to read multimodal leaderboard CSV file for version: {version}: Status Code: {mm_response.status_code}. Please ignore this message if multimodal results are not available for this version")
78
+
79
+ github_data["text"] = text_dfs
80
+ github_data["multimodal"] = mm_dfs
81
+ github_data["date"] = formatted_date
82
+
83
+ return github_data
84
+
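For reference, a sketch of the `benchmark_runs.json` structure that `get_github_data` assumes, with field names inferred from the parsing code above (the old version also read a `result_file` field); the values are placeholders and the real file may carry additional fields:

```python
# Hypothetical contents of benchmark_runs.json (placeholder values)
benchmark_runs = {
    "versions": [
        {"version": "v1.6", "date": "2024/06/15", "result_file": "v1.6/results.csv"},
        {"version": "v1.5", "date": "2024/02/01", "result_file": "v1.5/results.csv"},
    ]
}
```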
85
 
86
  def process_df(df: pd.DataFrame) -> pd.DataFrame:
87
+ """
88
+ Process dataframe:
89
+ - Convert datatypes to sort by "float" instead of "str"
90
+ - Remove repetition in model names
91
  - Update column names
92
+
93
  Args:
94
  df: Unprocessed Dataframe (after using update_cols)
95
+
96
  Returns:
97
  df: Processed Dataframe
98
+ """
99
+
100
+ # Convert column values to float, apart from the model names column
101
+ for col in df.columns[1:]:
102
+ df[col] = pd.to_numeric(df[col], errors='coerce')
103
+
104
+ # Remove repetition in model names
105
+ df[df.columns[0]] = df[df.columns[0]].str.replace('-t0.0', '', regex=True)
106
+ df[df.columns[0]] = df[df.columns[0]].apply(lambda x: '--'.join(set(x.split('--'))))
107
 
108
  # Update column names
109
+ custom_column_names = ['Model', 'Clemscore', '% Played', 'Quality Score']
110
+ for i, col in enumerate(df.columns[4:]): # Start Capitalizing from the 5th column
111
+ parts = col.split(',')
112
+ custom_name = f"{parts[0].strip().capitalize()} {parts[1].strip()}"
113
+ custom_column_names.append(custom_name)
114
+
115
+ # Rename columns
116
+ df.columns = custom_column_names
117
+
 
 
 
118
  return df
119
 
120
+
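A quick illustration of the model-name cleanup that `process_df` performs above, assuming raw names of the form `<model>--<model>-t0.0`; the column name here is a placeholder:

```python
import pandas as pd

df = pd.DataFrame({"model": ["gpt-4--gpt-4-t0.0"]})
df["model"] = df["model"].str.replace("-t0.0", "", regex=True)
df["model"] = df["model"].apply(lambda x: "--".join(set(x.split("--"))))
print(df["model"].iloc[0])  # -> "gpt-4": identical halves collapse into one name
```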
121
+ def query_search(df: pd.DataFrame, query: str) -> pd.DataFrame:
122
+ """
123
+ Filter the dataframe based on the search query.
124
+
125
  Args:
126
+ df (pd.DataFrame): Unfiltered dataframe.
127
+ query (str): A string of queries separated by ";".
128
+ Returns:
129
+ pd.DataFrame: Filtered dataframe containing searched queries in the 'Model' column.
130
+ """
131
+ if not query.strip(): # Reset Dataframe if empty query is passed
132
  return df
133
 
134
+ queries = [q.strip().lower() for q in query.split(';') if q.strip()] # Normalize and split queries
135
+
136
+ # Filter dataframe based on queries in 'Model' column
137
+ filtered_df = df[df['Model'].str.lower().str.contains('|'.join(queries))]
138
+
139
+ return filtered_df
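A usage sketch for `query_search`, assuming a leaderboard frame with a `Model` column; queries are `;`-separated and matched case-insensitively as substrings (the scores below are placeholders):

```python
import pandas as pd

leaderboard = pd.DataFrame({
    "Model": ["gpt-4", "claude-2.1", "Mistral-7B-Instruct-v0.2"],
    "Clemscore": [60.0, 40.0, 15.0],  # placeholder values
})
print(query_search(leaderboard, "gpt; mistral"))
# keeps the rows for "gpt-4" and "Mistral-7B-Instruct-v0.2"
```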
src/plot_utils.py CHANGED
@@ -1,21 +1,31 @@
1
  import pandas as pd
2
  import plotly.express as px
 
 
 
3
 
4
- from src.assets.text_content import SHORT_NAMES
 
5
 
6
- def plotly_plot(df:pd.DataFrame, LIST:list, ALL:list, NAMES:list, LEGEND:list):
7
- '''
 
 
 
8
  Takes in a list of models for a plotly plot
9
  Args:
10
  df: A dummy dataframe of latest version
11
- LIST: List of models to plot
12
- ALL: Either [] or ["Show All Models"] - toggle view to plot all models
13
- NAMES: Either [] or ["Show Names"] - toggle view to show model names on plot
14
- LEGEND: Either [] or ["Show Legend"] - toggle view to show legend on plot
 
 
15
  Returns:
16
- Fig: plotly figure
17
- '''
18
-
 
19
  # Get list of all models and append short names column to df
20
  list_columns = list(df.columns)
21
  ALL_LIST = list(df[list_columns[0]].unique())
@@ -23,25 +33,24 @@ def plotly_plot(df:pd.DataFrame, LIST:list, ALL:list, NAMES:list, LEGEND:list):
23
  list_short_names = list(short_names.values())
24
  df["Short"] = list_short_names
25
 
26
- if ALL:
27
  LIST = ALL_LIST
28
  # Filter dataframe based on the provided list of models
29
  df = df[df[list_columns[0]].isin(LIST)]
30
-
31
 
32
- if NAMES:
33
  fig = px.scatter(df, x=list_columns[2], y=list_columns[3], color=list_columns[0], symbol=list_columns[0],
34
- color_discrete_map={"category1": "blue", "category2": "red"},
35
- hover_name=list_columns[0], template="plotly_white", text="Short")
36
  fig.update_traces(textposition='top center')
37
  else:
38
  fig = px.scatter(df, x=list_columns[2], y=list_columns[3], color=list_columns[0], symbol=list_columns[0],
39
- color_discrete_map={"category1": "blue", "category2": "red"},
40
- hover_name=list_columns[0], template="plotly_white")
41
-
42
- if not LEGEND:
43
  fig.update_layout(showlegend=False)
44
-
45
  fig.update_layout(
46
  xaxis_title='% Played',
47
  yaxis_title='Quality Score',
@@ -52,30 +61,27 @@ def plotly_plot(df:pd.DataFrame, LIST:list, ALL:list, NAMES:list, LEGEND:list):
52
  fig.update_xaxes(range=[-5, 105])
53
  fig.update_yaxes(range=[-5, 105])
54
 
55
- return fig
56
-
57
 
58
- # ['Model', 'Clemscore', 'All(Played)', 'All(Quality Score)']
59
- def compare_plots(df: pd.DataFrame, LIST1: list, LIST2: list, ALL:list, NAMES:list, LEGEND: list):
60
- '''
61
- Quality Score v/s % Played plot by selecting models
62
- Args:
63
- df: A dummy dataframe of latest version
64
- LIST1: The list of open source models to show in the plot, updated from frontend
65
- LIST2: The list of commercial models to show in the plot, updated from frontend
66
- ALL: Either [] or ["Show All Models"] - toggle view to plot all models
67
- NAMES: Either [] or ["Show Names"] - toggle view to show model names on plot
68
- LEGEND: Either [] or ["Show Legend"] - toggle view to show legend on plot
69
- Returns:
70
- fig: The plot
71
- '''
72
 
73
- # Combine lists for Open source and commercial models
74
- LIST = LIST1 + LIST2
75
- fig = plotly_plot(df, LIST, ALL, NAMES, LEGEND)
 
 
76
 
77
  return fig
78
-
 
79
  def shorten_model_name(full_name):
80
  # Split the name into parts
81
  parts = full_name.split('-')
@@ -90,19 +96,20 @@ def shorten_model_name(full_name):
90
  short_name = '-'.join(short_name_parts)
91
 
92
  # Remove any leading or trailing hyphens
93
- short_name = full_name[0] + '-'+ short_name.strip('-')
94
 
95
  return short_name
96
 
 
97
  def label_map(model_list: list) -> dict:
98
- '''
99
  Generate a map from long names to short names, to plot them in frontend graph
100
  Define the short names in src/assets/text_content.py
101
  Args:
102
  model_list: A list of long model names
103
  Returns:
104
  short_name: A dict from long to short name
105
- '''
106
  short_names = {}
107
  for model_name in model_list:
108
  if model_name in SHORT_NAMES:
@@ -114,20 +121,167 @@ def label_map(model_list: list) -> dict:
114
  short_names[model_name] = short_name
115
 
116
  return short_names
117
-
118
- def split_models(MODEL_LIST: list):
119
- '''
 
120
  Split the models into open source and commercial
121
- '''
 
122
  open_models = []
123
- comm_models = []
 
124
 
125
- for model in MODEL_LIST:
126
- if model.startswith(('gpt-', 'claude-', 'command')):
127
- comm_models.append(model)
128
- else:
129
- open_models.append(model)
 
130
 
131
  open_models.sort(key=lambda o: o.upper())
132
- comm_models.sort(key=lambda c: c.upper())
133
- return open_models, comm_models
1
  import pandas as pd
2
  import plotly.express as px
3
+ import requests
4
+ import json
5
+ import gradio as gr
6
 
7
+ from src.assets.text_content import SHORT_NAMES, TEXT_NAME, MULTIMODAL_NAME
8
+ from src.leaderboard_utils import get_github_data
9
 
10
+
11
+ def plotly_plot(df: pd.DataFrame, list_op: list, list_co: list,
12
+ show_all: list, show_names: list, show_legend: list,
13
+ mobile_view: list):
14
+ """
15
  Takes in a list of models for a plotly plot
16
  Args:
17
  df: A dummy dataframe of latest version
18
+ list_op: The list of open source models to show in the plot, updated from frontend
19
+ list_co: The list of commercial models to show in the plot, updated from frontend
20
+ show_all: Either [] or ["Show All Models"] - toggle view to plot all models
21
+ show_names: Either [] or ["Show Names"] - toggle view to show model names on plot
22
+ show_legend: Either [] or ["Show Legend"] - toggle view to show legend on plot
23
+ mobile_view: Either [] or ["Mobile View"] - toggle view to for smaller screens
24
  Returns:
25
+ Fig: plotly figure of % played v/s quality score
26
+ """
27
+
28
+ LIST = list_op + list_co
29
  # Get list of all models and append short names column to df
30
  list_columns = list(df.columns)
31
  ALL_LIST = list(df[list_columns[0]].unique())
 
33
  list_short_names = list(short_names.values())
34
  df["Short"] = list_short_names
35
 
36
+ if show_all:
37
  LIST = ALL_LIST
38
  # Filter dataframe based on the provided list of models
39
  df = df[df[list_columns[0]].isin(LIST)]
 
40
 
41
+ if show_names:
42
  fig = px.scatter(df, x=list_columns[2], y=list_columns[3], color=list_columns[0], symbol=list_columns[0],
43
+ color_discrete_map={"category1": "blue", "category2": "red"},
44
+ hover_name=list_columns[0], template="plotly_white", text="Short")
45
  fig.update_traces(textposition='top center')
46
  else:
47
  fig = px.scatter(df, x=list_columns[2], y=list_columns[3], color=list_columns[0], symbol=list_columns[0],
48
+ color_discrete_map={"category1": "blue", "category2": "red"},
49
+ hover_name=list_columns[0], template="plotly_white")
50
+
51
+ if not show_legend:
52
  fig.update_layout(showlegend=False)
53
+
54
  fig.update_layout(
55
  xaxis_title='% Played',
56
  yaxis_title='Quality Score',
 
61
  fig.update_xaxes(range=[-5, 105])
62
  fig.update_yaxes(range=[-5, 105])
63
 
64
+ if mobile_view:
65
+ fig.update_layout(height=300)
66
 
67
+ if mobile_view and show_legend:
68
+ fig.update_layout(height=450)
69
+ fig.update_layout(legend=dict(
70
+ yanchor="bottom",
71
+ y=-5.52,
72
+ xanchor="left",
73
+ x=0.01
74
+ ))
75
 
76
+ fig.update_layout(
77
+ xaxis_title="",
78
+ yaxis_title="",
79
+ title="% Played v/s Quality Score"
80
+ )
81
 
82
  return fig
83
+
84
+
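A call sketch for the updated `plotly_plot`, assuming `plot_df` is a frame in the layout `process_df` produces (Model, Clemscore, % Played, Quality Score, ...); all names below are illustrative:

```python
fig = plotly_plot(
    df=plot_df,                            # hypothetical dataframe in process_df layout
    list_op=["Mistral-7B-Instruct-v0.2"],  # open-weight models picked in the frontend
    list_co=["gpt-4"],                     # commercial models picked in the frontend
    show_all=[],                           # [] -> plot only the picked models
    show_names=["Show Names"],             # annotate points with short names
    show_legend=[],                        # [] -> hide the legend
    mobile_view=[],                        # [] -> desktop layout
)
fig.show()
```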
85
  def shorten_model_name(full_name):
86
  # Split the name into parts
87
  parts = full_name.split('-')
 
96
  short_name = '-'.join(short_name_parts)
97
 
98
  # Remove any leading or trailing hyphens
99
+ short_name = full_name[0] + '-' + short_name.strip('-')
100
 
101
  return short_name
102
 
103
+
104
  def label_map(model_list: list) -> dict:
105
+ """
106
  Generate a map from long names to short names, to plot them in frontend graph
107
  Define the short names in src/assets/text_content.py
108
  Args:
109
  model_list: A list of long model names
110
  Returns:
111
  short_name: A dict from long to short name
112
+ """
113
  short_names = {}
114
  for model_name in model_list:
115
  if model_name in SHORT_NAMES:
 
121
  short_names[model_name] = short_name
122
 
123
  return short_names
124
+
125
+
126
+ def split_models(model_list: list):
127
+ """
128
  Split the models into open source and commercial
129
+ """
130
+
131
  open_models = []
132
+ commercial_models = []
133
+ open_backends = {"huggingface_local", "huggingface_multimodal", "openai_compatible"} # Define backends considered as open
134
 
135
+ # Load model registry data from main repo
136
+ model_registry_url = "https://raw.githubusercontent.com/clp-research/clembench/main/backends/model_registry.json"
137
+ response = requests.get(model_registry_url)
138
+
139
+ if response.status_code == 200:
140
+ json_data = json.loads(response.text)
141
+ # Classify as Open or Commercial based on the defined backend in the model registry
142
+ backend_mapping = {}
143
+
144
+ for model_name in model_list:
145
+ model_prefix = model_name.split('-')[0] # Get the prefix part of the model name
146
+ for entry in json_data:
147
+ if entry["model_name"].startswith(model_prefix):
148
+ backend = entry["backend"]
149
+ # Classify based on backend
150
+ if backend in open_backends:
151
+ open_models.append(model_name)
152
+ else:
153
+ commercial_models.append(model_name)
154
+ break
155
+
156
+ else:
157
+ print(f"Failed to read JSON file: Status Code : {response.status_code}")
158
 
159
  open_models.sort(key=lambda o: o.upper())
160
+ commercial_models.sort(key=lambda c: c.upper())
161
+
162
+ # Add missing model from the model_registry
163
+ if "dolphin-2.5-mixtral-8x7b" in model_list:
164
+ open_models.append("dolphin-2.5-mixtral-8x7b")
165
+
166
+ return open_models, commercial_models
167
+
168
+ """
169
+ Update Functions, for when the leaderboard selection changes
170
+ """
171
+ def update_open_models(leaderboard: str = TEXT_NAME):
172
+ """
173
+ Change the checkbox group of Open Models based on the leaderboard selected
174
+
175
+ Args:
176
+ leaderboard: Selected leaderboard from the frontend [Default - Text Leaderboard]
177
+ Return:
178
+ Updated checkbox group for Open Models, based on the leaderboard selected
179
+ """
180
+ github_data = get_github_data()
181
+ leaderboard_data = github_data["text" if leaderboard == TEXT_NAME else "multimodal"][0]
182
+ models = leaderboard_data.iloc[:, 0].unique().tolist()
183
+ open_models, commercial_models = split_models(models)
184
+ return gr.CheckboxGroup(
185
+ open_models,
186
+ value=[],
187
+ elem_id="value-select-1",
188
+ interactive=True,
189
+ )
190
+
191
+ def update_closed_models(leaderboard: str = TEXT_NAME):
192
+ """
193
+ Change the checkbox group of Closed Models based on the leaderboard selected
194
+
195
+ Args:
196
+ leaderboard: Selected leaderboard from the frontend [Default - Text Leaderboard]
197
+ Return:
198
+ Updated checkbox group for Closed Models, based on the leaderboard selected
199
+ """
200
+ github_data = get_github_data()
201
+ leaderboard_data = github_data["text" if leaderboard == TEXT_NAME else "multimodal"][0]
202
+ models = leaderboard_data.iloc[:, 0].unique().tolist()
203
+ open_models, commercial_models = split_models(models)
204
+ return gr.CheckboxGroup(
205
+ commercial_models,
206
+ value=[],
207
+ elem_id="value-select-2",
208
+ interactive=True,
209
+ )
210
+
211
+ def get_plot_df(leaderboard: str = TEXT_NAME) -> pd.DataFrame:
212
+ """
213
+ Get the DataFrame for plotting based on the selected leaderboard.
214
+ Args:
215
+ leaderboard: Selected leaderboard.
216
+ Returns:
217
+ DataFrame with model data.
218
+ """
219
+ github_data = get_github_data()
220
+ return github_data["text" if leaderboard == TEXT_NAME else "multimodal"][0]
221
+
222
+
223
+ """
224
+ Reset Functions for when the Leaderboard selection changes
225
+ """
226
+ def reset_show_all():
227
+ return gr.CheckboxGroup(
228
+ ["Select All Models"],
229
+ label="Show plot for all models 🤖",
230
+ value=[],
231
+ elem_id="value-select-3",
232
+ interactive=True,
233
+ )
234
+
235
+ def reset_show_names():
236
+ return gr.CheckboxGroup(
237
+ ["Show Names"],
238
+ label="Show names of models on the plot 🏷️",
239
+ value=[],
240
+ elem_id="value-select-4",
241
+ interactive=True,
242
+ )
243
+
244
+
245
+ def reset_show_legend():
246
+ return gr.CheckboxGroup(
247
+ ["Show Legend"],
248
+ label="Show legend on the plot πŸ’‘",
249
+ value=[],
250
+ elem_id="value-select-5",
251
+ interactive=True,
252
+ )
253
+
254
+
255
+ def reset_mobile_view():
256
+ return gr.CheckboxGroup(
257
+ ["Mobile View"],
258
+ label="View plot on smaller screens 📱",
259
+ value=[],
260
+ elem_id="value-select-6",
261
+ interactive=True,
262
+ )
263
+
264
+
265
+ if __name__ == '__main__':
266
+ mm_model_list = ['gpt-4o-2024-05-13', 'gpt-4-1106-vision-preview', 'claude-3-opus-20240229', 'gemini-1.5-pro-latest',
267
+ 'gemini-1.5-flash-latest', 'llava-v1.6-34b-hf', 'llava-v1.6-vicuna-13b-hf', 'idefics-80b-instruct',
268
+ 'llava-1.5-13b-hf', 'idefics-9b-instruct']
269
+
270
+ text_model_list = ['vicuna-33b-v1.3', 'gpt-4-0125-preview', 'gpt-4-turbo-2024-04-09', 'claude-3-5-sonnet-20240620', 'gpt-4-1106-preview',
271
+ 'gpt-4-0613', 'gpt-4o-2024-05-13', 'claude-3-opus-20240229', 'gemini-1.5-pro-latest',
272
+ 'Meta-Llama-3-70B-Instruct-hf', 'claude-2.1', 'gemini-1.5-flash-latest', 'claude-3-sonnet-20240229',
273
+ 'Qwen1.5-72B-Chat', 'mistral-large-2402', 'gpt-3.5-turbo-0125', 'gemini-1.0-pro', 'command-r-plus', 'openchat_3.5',
274
+ 'claude-3-haiku-20240307', 'sheep-duck-llama-2-70b-v1.1', 'Meta-Llama-3-8B-Instruct-hf', 'openchat-3.5-1210',
275
+ 'WizardLM-70b-v1.0', 'openchat-3.5-0106', 'Qwen1.5-14B-Chat', 'mistral-medium-2312', 'Qwen1.5-32B-Chat',
276
+ 'codegemma-7b-it', 'dolphin-2.5-mixtral-8x7b', 'CodeLlama-34b-Instruct-hf', 'command-r', 'gemma-1.1-7b-it',
277
+ 'SUS-Chat-34B', 'Mixtral-8x22B-Instruct-v0.1', 'tulu-2-dpo-70b', 'Nous-Hermes-2-Mixtral-8x7B-SFT',
278
+ 'WizardLM-13b-v1.2', 'Mistral-7B-Instruct-v0.2', 'Yi-34B-Chat', 'Mixtral-8x7B-Instruct-v0.1',
279
+ 'Mistral-7B-Instruct-v0.1', 'Yi-1.5-34B-Chat', 'vicuna-13b-v1.5', 'Yi-1.5-6B-Chat', 'Starling-LM-7B-beta',
280
+ 'sheep-duck-llama-2-13b', 'Yi-1.5-9B-Chat', 'gemma-1.1-2b-it', 'Qwen1.5-7B-Chat', 'gemma-7b-it',
281
+ 'llama-2-70b-chat-hf', 'Qwen1.5-0.5B-Chat', 'Qwen1.5-1.8B-Chat']
282
+
283
+ om, cm = split_models(mm_model_list)
284
+ print("Open")
285
+ print(om)
286
+ print("Closed")
287
+ print(cm)
src/version_utils.py ADDED
@@ -0,0 +1,95 @@
1
+ ### REQUIRED OUTPUT ###
2
+ # A list of version names -> v1.6, v1.6_multimodal, v1.6_quantized, v1.5, v0.9, etc.
3
+ # A corresponding DataFrame for each version
4
+
5
+ import requests
6
+ from datetime import datetime
7
+ import pandas as pd
8
+ import json
9
+ from io import StringIO
10
+
11
+ from src.leaderboard_utils import process_df
12
+ from src.assets.text_content import REPO
13
+
14
+ def get_versions_data():
15
+ """
16
+ Read and process data from CSV files of all available versions hosted on GitHub. - https://github.com/clembench/clembench-runs
17
+
18
+ Returns:
19
+ versions_data (dict): "latest" -> latest version name, "date" -> its formatted date,
20
+ and each version name (with "_multimodal"/"_quantized" variants where available) -> its processed DataFrame
21
+ """
22
+ base_repo = REPO
23
+ json_url = base_repo + "benchmark_runs.json"
24
+ response = requests.get(json_url)
25
+
26
+ # Check if the JSON file request was successful
27
+ if response.status_code != 200:
28
+ print(f"Failed to read JSON file: Status Code: {response.status_code}")
29
+ return None
30
+
31
+ json_data = response.json()
32
+ versions = json_data['versions']
33
+
34
+ # Sort version names - latest first
35
+ version_names = sorted(
36
+ [ver['version'] for ver in versions],
37
+ key=lambda v: float(v[1:]),
38
+ reverse=True
39
+ )
40
+ print(f"Found {len(version_names)} versions from get_versions_data(): {version_names}.")
41
+
42
+ # Get Last updated date of the latest version
43
+ latest_version = version_names[0]
44
+ latest_date = next(
45
+ ver['date'] for ver in versions if ver['version'] == latest_version
46
+ )
47
+ formatted_date = datetime.strptime(latest_date, "%Y/%m/%d").strftime("%d %b %Y")
48
+
49
+ # Get Versions data
50
+ versions_data = {"latest": latest_version, "date": formatted_date}
51
+
52
+ # Collect Dataframes
53
+ dfs = []
54
+
55
+ for version in version_names:
56
+ text_url = f"{base_repo}{version}/results.csv"
57
+ mm_url = f"{base_repo}{version}_multimodal/results.csv"
58
+ quant_url = f"{base_repo}{version}_quantized/results.csv"
59
+
60
+ # Text Data
61
+ response = requests.get(text_url)
62
+ if response.status_code == 200:
63
+ df = pd.read_csv(StringIO(response.text))
64
+ df = process_df(df)
65
+ df = df.sort_values(by=df.columns[1], ascending=False) # Sort by clemscore column
66
+ versions_data[version] = df
67
+ else:
68
+ print(f"Failed to read Text-only leaderboard CSV file for version: {version}. Status Code: {response.status_code}")
69
+
70
+ # Multimodal Data
71
+ mm_response = requests.get(mm_url)
72
+ if mm_response.status_code == 200:
73
+ mm_df = pd.read_csv(StringIO(mm_response.text))
74
+ mm_df = process_df(mm_df)
75
+ mm_df = mm_df.sort_values(by=mm_df.columns[1], ascending=False) # Sort by clemscore column
76
+ versions_data[version+"_multimodal"] = mm_df
77
+ else:
78
+ print(f"Failed to read multimodal leaderboard CSV file for version: {version}: Status Code: {mm_response.status_code}. Please ignore this message if multimodal results are not available for this version")
79
+
80
+ # Quantized Data
81
+ q_response = requests.get(quant_url)
82
+ if q_response.status_code == 200:
83
+ q_df = pd.read_csv(StringIO(q_response.text))
84
+ q_df = process_df(q_df)
85
+ q_df = q_df.sort_values(by=q_df.columns[1], ascending=False) # Sort by clemscore column
86
+ versions_data[version + "_quantized"] = q_df
87
+ else:
88
+ print(f"Failed to read quantized leaderboard CSV file for version: {version}: Status Code: {q_response.status_code}. Please ignore this message if quantized results are not available for this version")
89
+
90
+ return versions_data
91
+
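A usage sketch for the returned dictionary, assuming v1.6 is the latest version and has multimodal results published (the keys shown are illustrative):

```python
data = get_versions_data()
latest = data["latest"]                   # e.g. "v1.6"
text_df = data[latest]                    # text-only results for the latest version
mm_df = data.get(latest + "_multimodal")  # None if no multimodal results for this version
print(data["date"], len(text_df))
```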
92
+
93
+ if __name__ == "__main__":
94
+ versions_data = get_versions_data()
95
+ print(versions_data.keys())