Lisa Dunlap committed
Commit • e121d4e
1 Parent(s): df2a130

moved around text for aesthetics purposes

app.py CHANGED
@@ -26,7 +26,7 @@ def make_default_md(arena_df, elo_results):
 | [Vote](https://chat.lmsys.org) | [Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2306.05685) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) |
 
 LMSYS [Chatbot Arena](https://lmsys.org/blog/2023-05-03-arena/) is a crowdsourced open platform for LLM evals.
-We've collected over **500,000** human preference votes to rank LLMs with the Elo ranking system.
+We've collected over **500,000** human preference votes to rank LLMs with the Elo ranking system. Contribute your vote 🗳️ at [chat.lmsys.org](https://chat.lmsys.org)!
 """
     return leaderboard_md
 
@@ -44,34 +44,10 @@ def make_arena_leaderboard_md(arena_df, arena_subset_df=None, name="Overall"):
     leaderboard_md = f"""
 Total #models: **{total_models}**.{space} Total #votes: **{"{:,}".format(total_votes)}**.{vote_str}{space} Last updated: March 29, 2024.
 
-Contribute your vote 🗳️ at [chat.lmsys.org](https://chat.lmsys.org)! You can find code to recreate these tables and plots in this [notebook]({notebook_url}).
-
 **NEW!** Click the buttons below to view the ELO leaderboard and stats for different input categories. You are currently viewing **{name}** inputs.
 """
     return leaderboard_md
 
-# def make_arena_leaderboard_md(arena_df, arena_chinese_df, arena_long_df, arena_english_df):
-#     # Calculate totals for each arena
-#     total_votes = sum(arena_df["num_battles"]) // 2
-#     total_chinese_votes = sum(arena_chinese_df["num_battles"]) // 2
-#     total_long_votes = sum(arena_long_df["num_battles"]) // 2
-#     total_english_votes = sum(arena_english_df["num_battles"]) // 2
-
-#     # Constructing the markdown table
-#     leaderboard_md = f"""
-# Last updated: March 29, 2024.
-# | | **Total** | English | Chinese | Long Context |
-# | :-------------- | :-----------------------: | :-----------------------: | :-----------------------: | :-----------------------: |
-# | # Votes | **{"{:,}".format(total_votes)}** | {"{:,}".format(total_english_votes)} | {"{:,}".format(total_chinese_votes)} | {"{:,}".format(total_long_votes)} |
-# | # Models | **{len(arena_df)}** | {len(arena_english_df)}| {len(arena_chinese_df)} | {len(arena_long_df)} |
-
-# Contribute your vote 🗳️ at [chat.lmsys.org](https://chat.lmsys.org)! You can find code to recreate these tables and plots in this [notebook]({notebook_url}).
-# """
-
-#     return leaderboard_md
-
-
-
 def make_full_leaderboard_md(elo_results):
     leaderboard_md = f"""
 Three benchmarks are displayed: **Arena Elo**, **MT-Bench** and **MMLU**.
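An editorial note on the deleted helper above: it derives totals with `sum(arena_df["num_battles"]) // 2`, halving because every battle increments `num_battles` for both participating models. A minimal sketch with invented data (the model names and counts are hypothetical) to make the convention concrete:

```python
# Each battle is logged once per participant, so summing the per-model
# "num_battles" column counts every battle twice; halve it to get votes.
import pandas as pd

arena_df = pd.DataFrame(
    {"model": ["model-a", "model-b", "model-c"], "num_battles": [4, 3, 3]}
)
total_votes = sum(arena_df["num_battles"]) // 2  # (4 + 3 + 3) // 2 = 5
total_models = len(arena_df)
print(f"Total #models: **{total_models}**. Total #votes: **{total_votes:,}**.")
```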
@@ -301,8 +277,7 @@ def update_leaderboard_and_plots(button, arena_df, model_table_df, arena_subset_
     p2 = elo_subset_results["battle_count_heatmap"]
     p3 = elo_subset_results["bootstrap_elo_rating"]
     p4 = elo_subset_results["average_win_rate_bar"]
-    more_stats_md = f"""## More Statistics for Chatbot Arena ({button})
-You can find more discussions in this blog [post](https://lmsys.org/blog/2023-12-07-leaderboard/).
+    more_stats_md = f"""## More Statistics for Chatbot Arena ({button})
 """
     leaderboard_md = make_arena_leaderboard_md(arena_df, arena_subset_df, name=button)
     return arena_values, p1, p2, p3, p4, more_stats_md, leaderboard_md
@@ -383,11 +358,6 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=Fa
                 column_widths=[70, 190, 110, 100, 90, 160, 150, 140],
                 wrap=True,
             )
-            # Setup the button click action
-            # overall_rating.click(fn=update_overall_rating_df, inputs=overall_rating, outputs=elo_display_df)
-            # english_rating.click(fn=update_english_rating_df, inputs=english_rating, outputs=elo_display_df)
-            # chinese_rating.click(fn=update_chinese_rating_df, inputs=chinese_rating ,outputs=elo_display_df)
-            # long_context_rating.click(fn=update_long_context_rating_df, inputs=long_context_rating, outputs=elo_display_df)
 
         with gr.Tab("Full Leaderboard", id=1):
             md = make_full_leaderboard_md(elo_results)
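For context on the wiring these deleted comments describe: in Gradio, `component.click(fn=..., inputs=..., outputs=...)` calls `fn` with the values of `inputs` and writes its return value into `outputs`. A self-contained sketch of that pattern, assuming hypothetical component and handler names (the app's `update_*_rating_df` helpers do not appear in this diff):

```python
import gradio as gr
import pandas as pd

def update_rating_df(category):
    # Stand-in for the app's update_*_rating_df helpers: return the
    # dataframe that should replace the currently displayed table.
    return pd.DataFrame({"Model": ["model-a", "model-b"], "Rating": [1200, 1150]})

with gr.Blocks() as demo:
    overall_rating = gr.Button("Overall")
    elo_display_df = gr.Dataframe()
    # Passing the button itself as `inputs` hands its label string to fn,
    # mirroring the commented-out lines above.
    overall_rating.click(fn=update_rating_df, inputs=overall_rating, outputs=elo_display_df)
```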
@@ -422,7 +392,7 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=Fa
         gr.Markdown(
             f"""Note: we take the 95% confidence interval into account when determining a model's ranking.
 A model is ranked higher only if its lower bound of model score is higher than the upper bound of the other model's score.
-See Figure 3 below for visualization of the confidence intervals.
+See Figure 3 below for a visualization of the confidence intervals. You can find code to recreate these tables and plots in this [notebook]({notebook_url}), and more discussion in this blog [post](https://lmsys.org/blog/2023-12-07-leaderboard/).
 """,
             elem_id="leaderboard_markdown"
         )
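The note above states the ranking rule exactly; here is a small sketch of it, assuming each model carries bootstrap (lower, upper) rating bounds. The numbers are invented:

```python
# Rule from the note: model A outranks model B only when A's lower bound
# exceeds B's upper bound; models with overlapping intervals share a rank.
intervals = {
    "model-a": (1250, 1280),  # (lower, upper)
    "model-b": (1240, 1270),  # overlaps model-a, so same rank
    "model-c": (1180, 1210),  # decisively below both
}

def rank(intervals):
    # A model's rank is 1 plus the number of models whose lower bound
    # exceeds its upper bound (models that beat it decisively).
    return {
        model: 1 + sum(lo > upper for lo, _ in intervals.values())
        for model, (_, upper) in intervals.items()
    }

print(rank(intervals))  # {'model-a': 1, 'model-b': 1, 'model-c': 3}
```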
@@ -430,32 +400,31 @@ See Figure 3 below for visualization of the confidence intervals.
     leader_component_values[:] = [default_md, p1, p2, p3, p4]
 
     if show_plot:
-        more_stats_md = gr.Markdown(
-            f"""## More Statistics for Chatbot Arena (Overall)""",
-            elem_id="leaderboard_markdown"
-        )
+        # more_stats_md = gr.Markdown(
+        #     f"""## More Statistics for Chatbot Arena (Overall)""",
+        #     elem_id="leaderboard_markdown"
+        # )
+        more_stats_md = gr.Button("More Statistics for Chatbot Arena (Overall)", elem_id="non-interactive-button")
        with gr.Row():
             with gr.Column():
                 gr.Markdown(
-                    "#### Figure 1: Fraction of Model A Wins for All Non-tied A vs. B Battles"
+                    "#### Figure 1: Fraction of Model A Wins for All Non-tied A vs. B Battles", elem_id="plot-title", variant="panel"
                 )
-                plot_1 = gr.Plot(p1, show_label=False)
+                plot_1 = gr.Plot(p1, show_label=False, elem_id="plot-container")
             with gr.Column():
                 gr.Markdown(
-                    "#### Figure 2: Battle Count for Each Combination of Models (without Ties)"
+                    "#### Figure 2: Battle Count for Each Combination of Models (without Ties)", elem_id="plot-title"
                 )
                 plot_2 = gr.Plot(p2, show_label=False)
         with gr.Row():
             with gr.Column():
                 gr.Markdown(
-                    "#### Figure 3: Confidence Intervals on Model Strength (via Bootstrapping)"
+                    "#### Figure 3: Confidence Intervals on Model Strength (via Bootstrapping)", elem_id="plot-title"
                 )
                 plot_3 = gr.Plot(p3, show_label=False)
             with gr.Column():
                 gr.Markdown(
-                    "#### Figure 4: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)"
+                    "#### Figure 4: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)", elem_id="plot-title"
                 )
                 plot_4 = gr.Plot(p4, show_label=False)
 
@@ -494,21 +463,23 @@ block_css = """
     line-height: 0.1em;
 }
 
-#
-
-
-}
-#arena_leaderboard_dataframe th {
-    font-size: 20px;
+#plot-title {
+    text-align: center;
+    display:block;
 }
 
-
-
-
-
-
-
-
+#non-interactive-button {
+    display: inline-block;
+    padding: 10px 10px;
+    background-color: #f7f7f7; /* Super light grey background */
+    color: #000000; /* Black text */
+    text-align: center;
+    font-size: 26px; /* Larger text */
+    border-radius: 0; /* Straight edges, no border radius */
+    border: 0px solid #dcdcdc; /* A light grey border to match the background */
+    font-weight: bold;
+    user-select: none; /* The text inside the button is not selectable */
+    pointer-events: none; /* The button is non-interactive */
 }
 
 footer {
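The two added rules pair with the `elem_id` values introduced in the Python changes above: Gradio assigns `elem_id` as the component's id in the HTML DOM, so selectors in the `css=` string can target it. A minimal standalone sketch of the mechanism:

```python
import gradio as gr

# The "#..." selectors match components created with the same elem_id.
css = """
#plot-title { text-align: center; display: block; }
#non-interactive-button { pointer-events: none; font-weight: bold; }
"""

with gr.Blocks(css=css) as demo:
    gr.Markdown("#### Figure 1: An Example Title", elem_id="plot-title")
    gr.Button("More Statistics", elem_id="non-interactive-button")
```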
@@ -540,11 +511,12 @@ We thank [Kaggle](https://www.kaggle.com/), [MBZUAI](https://mbzuai.ac.ae/), [a1
 
 def build_demo(elo_results_file, leaderboard_table_file):
     text_size = gr.themes.sizes.text_lg
-
+    theme = gr.themes.Base(text_size=text_size)
+    theme.set(button_secondary_background_fill_hover="*primary_300", button_secondary_background_fill_hover_dark="*primary_700")
     with gr.Blocks(
         title="Chatbot Arena Leaderboard",
-
-        theme = gr.themes.Base.load("theme.json"),
+        theme=theme,
+        # theme = gr.themes.Base.load("theme.json"),
         css=block_css,
     ) as demo:
         leader_components = build_leaderboard_tab(
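The theming change swaps a saved `theme.json` for a theme built in code: `gr.themes.Base(...)` sets core attributes such as `text_size`, and `theme.set(...)` overrides individual theme variables, where values like `*primary_300` reference the theme's own palette. A minimal sketch reusing the same calls as the commit:

```python
import gradio as gr

theme = gr.themes.Base(text_size=gr.themes.sizes.text_lg)
theme.set(
    button_secondary_background_fill_hover="*primary_300",
    button_secondary_background_fill_hover_dark="*primary_700",
)

with gr.Blocks(theme=theme) as demo:
    gr.Button("Hover me")  # default buttons use the secondary variant
```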