update
app.py (CHANGED)
@@ -39,7 +39,7 @@ def make_arena_leaderboard_md(arena_df):
     leaderboard_md = f"""
 Total #models: **{total_models}**.{space} Total #votes: **{"{:,}".format(total_votes)}**.{space} Last updated: March 29, 2024.
 
-**NEW!** View …
+📣 **NEW!** View leaderboard for different categories (e.g., coding, long user query)!
 """
     return leaderboard_md
 
@@ -49,8 +49,8 @@ def make_category_arena_leaderboard_md(arena_df, arena_subset_df, name="Overall"):
     space = " "
     total_subset_votes = sum(arena_subset_df["num_battles"]) // 2
     total_subset_models = len(arena_subset_df)
-    leaderboard_md = f"""### {name}
-#models: **{total_subset_models} ({round(total_subset_models/total_models *100)}%)…
+    leaderboard_md = f"""### {cat_name_to_explanation[name]}
+#### [Coverage] {space} #models: **{total_subset_models} ({round(total_subset_models/total_models *100)}%)** {space} #votes: **{"{:,}".format(total_subset_votes)} ({round(total_subset_votes/total_votes * 100)}%)**{space}
 """
     return leaderboard_md
 
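For concreteness, here is what the new [Coverage] line renders to. This is an illustrative standalone snippet, not part of the commit; the model and vote counts are made up:

# Illustrative only: rendering the new [Coverage] line with hypothetical totals.
space = " "
total_models, total_votes = 82, 400000                # overall board (made up)
total_subset_models, total_subset_votes = 54, 60000   # e.g. a "Coding" subset (made up)
line = (
    f"#### [Coverage] {space} #models: **{total_subset_models} "
    f"({round(total_subset_models / total_models * 100)}%)** {space} "
    f"#votes: **{'{:,}'.format(total_subset_votes)} "
    f"({round(total_subset_votes / total_votes * 100)}%)**{space}"
)
print(line)  # #### [Coverage]   #models: **54 (66%)**   #votes: **60,000 (15%)**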
@@ -259,10 +259,14 @@ def get_arena_table(arena_df, model_table_df, arena_subset_df=None):
         arena_subset_df["final_ranking_no_tie"] = range(1, len(arena_subset_df) + 1)
         arena_df["final_ranking_no_tie"] = range(1, len(arena_df) + 1)
         # join arena_df and arena_subset_df on index
-        arena_df = arena_subset_df.join(arena_df["…
-        …
-        …
-        #…
+        arena_df = arena_subset_df.join(arena_df["final_ranking"], rsuffix="_global", how="inner")
+        arena_df["ranking_difference"] = arena_df["final_ranking_global"] - arena_df["final_ranking"]
+
+        # no tie version
+        # arena_df = arena_subset_df.join(arena_df["final_ranking_no_tie"], rsuffix="_global", how="inner")
+        # arena_df["ranking_difference"] = arena_df["final_ranking_no_tie_global"] - arena_df["final_ranking_no_tie"]
+
+        arena_df = arena_df.sort_values(by=["rating"], ascending=False)
         arena_df = arena_df.sort_values(by=["final_ranking"], ascending=True)
         arena_df["final_ranking"] = arena_df.apply(lambda x: create_ranking_str(x["final_ranking"], x["ranking_difference"]), axis=1)
 
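The added join is the heart of the category view: the subset frame keeps its own final_ranking, pulls in the overall final_ranking under the _global suffix, and the difference becomes the rank delta shown next to each model. A minimal sketch on toy data (model names and ratings are hypothetical, not from the commit):

import pandas as pd

# Toy frames indexed by model key, mirroring arena_df / arena_subset_df.
global_df = pd.DataFrame({"final_ranking": [1, 2, 3, 4]},
                         index=["model-a", "model-b", "model-c", "model-d"])
subset_df = pd.DataFrame({"final_ranking": [1, 2, 3],
                          "rating": [1250.0, 1230.0, 1190.0]},
                         index=["model-b", "model-a", "model-d"])

# Inner join pulls the overall rank in as "final_ranking_global".
joined = subset_df.join(global_df["final_ranking"], rsuffix="_global", how="inner")
# Positive difference: the model ranks better (smaller number) in this category.
joined["ranking_difference"] = joined["final_ranking_global"] - joined["final_ranking"]
print(joined)
#          final_ranking  rating  final_ranking_global  ranking_difference
# model-b              1  1250.0                     2                   1
# model-a              2  1230.0                     1                  -1
# model-d              3  1190.0                     4                   1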
@@ -310,7 +314,25 @@ def get_arena_table(arena_df, model_table_df, arena_subset_df=None):
             print(f"{model_key} - {e}")
     return values
 
-key_to_category_name = {…
+key_to_category_name = {
+    "full": "Overall",
+    "coding": "Coding",
+    "long_user": "Longer query",
+    "english": "English",
+    "chinese": "Chinese",
+    "no_tie": "Exclude Ties",
+    "no_short": "Exclude Short",
+}
+cat_name_to_explanation = {
+    "Overall": "Overall Questions",
+    "Coding": "Coding: whether conversation contains code snippets",
+    "Longer query": "Longer user query (>= 500 tokens)",
+    "English": "English Prompts",
+    "Chinese": "Chinese Prompts",
+    "Exclude Ties": "Exclude Ties and Bothbad",
+    "Exclude Short": "User Query >= 5 tokens",
+}
+
 
 def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=False):
     arena_dfs = {}
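Note that make_category_arena_leaderboard_md now indexes cat_name_to_explanation[name] directly, so every display name produced by key_to_category_name must have a matching explanation entry. A one-line guard (my addition, not in the commit) would catch a mismatch early:

# Hypothetical sanity check: every category display name needs an explanation,
# otherwise make_category_arena_leaderboard_md raises KeyError.
assert set(key_to_category_name.values()) <= set(cat_name_to_explanation), \
    "cat_name_to_explanation is missing an entry for some category"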
@@ -328,12 +350,12 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=False):
             arena_dfs[key_to_category_name[k]] = elo_results[k]["leaderboard_table_df"]
             category_elo_results[key_to_category_name[k]] = elo_results[k]
 
-    p1 = category_elo_results["…
-    p2 = category_elo_results["…
-    p3 = category_elo_results["…
-    p4 = category_elo_results["…
-    arena_df = arena_dfs["…
-    default_md = make_default_md(arena_df, category_elo_results["…
+    p1 = category_elo_results["Overall"]["win_fraction_heatmap"]
+    p2 = category_elo_results["Overall"]["battle_count_heatmap"]
+    p3 = category_elo_results["Overall"]["bootstrap_elo_rating"]
+    p4 = category_elo_results["Overall"]["average_win_rate_bar"]
+    arena_df = arena_dfs["Overall"]
+    default_md = make_default_md(arena_df, category_elo_results["Overall"])
 
     md_1 = gr.Markdown(default_md, elem_id="leaderboard_markdown")
     if leaderboard_table_file:
@@ -347,9 +369,10 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=False):
             md = make_arena_leaderboard_md(arena_df)
             leaderboard_markdown = gr.Markdown(md, elem_id="leaderboard_markdown")
             with gr.Row():
-                …
-                …
-                …
+                with gr.Column(scale=2):
+                    category_dropdown = gr.Dropdown(choices=list(arena_dfs.keys()), label="Category", value="Overall")
+                default_category_details = make_category_arena_leaderboard_md(arena_df, arena_df, name="Overall")
+                with gr.Column(scale=4, variant="panel"):
                     category_deets = gr.Markdown(default_category_details, elem_id="category_deets")
 
             elo_display_df = gr.Dataframe(
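The new gr.Row places the category selector (narrow column) next to the coverage panel (wide column). Below is a stripped-down, self-contained sketch of the same pattern; the stub callback and choices are illustrative, and in the real app the dropdown is wired to update_leaderboard_and_plots further down:

import gradio as gr

def show_category(category):
    # Stub standing in for update_leaderboard_and_plots: just render some Markdown.
    return f"### {category} leaderboard details"

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column(scale=2):
            dropdown = gr.Dropdown(choices=["Overall", "Coding", "Longer query"],
                                   label="Category", value="Overall")
        with gr.Column(scale=4, variant="panel"):
            details = gr.Markdown(show_category("Overall"))
    # Re-render the panel whenever the selection changes.
    dropdown.change(show_category, inputs=dropdown, outputs=details)

demo.launch()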
@@ -364,7 +387,7 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=False):
                     "Knowledge Cutoff",
                 ],
                 datatype=[
-                    "…
+                    "number",
                     "markdown",
                     "number",
                     "str",
@@ -449,7 +472,7 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=False):
                 pass
 
         def update_leaderboard_df(arena_table_vals):
-            elo_dataframe = pd.DataFrame(arena_table_vals, columns=["Rank", "…
+            elo_dataframe = pd.DataFrame(arena_table_vals, columns=["Rank", "Delta", "🤖 Model", "⭐ Arena Elo", "📊 95% CI", "🗳️ Votes", "Organization", "License", "Knowledge Cutoff"])
 
             # goal: color the rows based on the rank with styler
             def highlight_max(s):
@@ -459,20 +482,20 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=False):
             def highlight_rank_max(s):
                 return ["color: green; font-weight: bold" if v > 0 else "color: red; font-weight: bold" if v < 0 else "" for v in s]
 
-            return elo_dataframe.style.apply(highlight_max, subset=["Rank"]).apply(highlight_rank_max, subset=["…
+            return elo_dataframe.style.apply(highlight_max, subset=["Rank"]).apply(highlight_rank_max, subset=["Delta"])
 
         def update_leaderboard_and_plots(category):
             arena_subset_df = arena_dfs[category]
             arena_subset_df = arena_subset_df[arena_subset_df["num_battles"] > 500]
             elo_subset_results = category_elo_results[category]
-            arena_df = arena_dfs["…
-            arena_values = get_arena_table(arena_df, model_table_df, arena_subset_df=arena_subset_df if category != "…
-            if category != "…
+            arena_df = arena_dfs["Overall"]
+            arena_values = get_arena_table(arena_df, model_table_df, arena_subset_df=arena_subset_df if category != "Overall" else None)
+            if category != "Overall":
                 arena_values = update_leaderboard_df(arena_values)
                 arena_values = gr.Dataframe(
                     headers=[
                         "Rank",
-                        "…
+                        "Delta",
                         "🤖 Model",
                         "⭐ Arena Elo",
                         "📊 95% CI",
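update_leaderboard_df returns a pandas Styler, which is then passed as the value of the gr.Dataframe, so the per-column styling survives into the rendered table. The sketch below reproduces the Delta coloring rule; highlight_max's body is not visible in this diff, so a stand-in shading function of my own is used:

import pandas as pd

df = pd.DataFrame({"Rank": [1, 2, 3], "Delta": [1, 0, -1], "Model": ["a", "b", "c"]})

def shade_rank(s):
    # Stand-in for highlight_max (not shown in the diff): shade every rank cell.
    return ["background-color: #f0f0f0" for _ in s]

def color_delta(s):
    # Same rule as highlight_rank_max: green for gains, red for drops.
    return ["color: green; font-weight: bold" if v > 0
            else "color: red; font-weight: bold" if v < 0 else "" for v in s]

# Each apply targets one column; the Styler composes the CSS per cell.
styled = df.style.apply(shade_rank, subset=["Rank"]).apply(color_delta, subset=["Delta"])
html = styled.to_html()  # or hand the Styler itself to gr.Dataframe(value=...)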
@@ -482,7 +505,7 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=False):
                         "Knowledge Cutoff",
                     ],
                     datatype=[
-                        "…
+                        "number",
                         "number",
                         "markdown",
                         "number",
@@ -495,7 +518,7 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=False):
                     value=arena_values,
                     elem_id="arena_leaderboard_dataframe",
                     height=700,
-                    column_widths=[…
+                    column_widths=[60, 70, 190, 110, 100, 90, 160, 150, 140],
                     wrap=True,
                 )
             else:
@@ -511,7 +534,7 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=False):
                         "Knowledge Cutoff",
                     ],
                     datatype=[
-                        "…
+                        "number",
                         "markdown",
                         "number",
                         "str",
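Finally, the ranking column is built by create_ranking_str(final_ranking, ranking_difference), which this diff calls but never shows. A plausible minimal reconstruction, assuming the delta is rendered as an up/down marker next to the rank (the actual helper may differ):

def create_ranking_str(ranking, ranking_difference):
    # Hypothetical reconstruction: the diff only shows the call site.
    if ranking_difference > 0:
        return f"{ranking} \u2191"  # ranks better in this category than overall
    if ranking_difference < 0:
        return f"{ranking} \u2193"  # ranks worse in this category than overall
    return str(ranking)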