Lisa Dunlap committed
Commit • e121d4e
1 Parent(s): df2a130

moved around text for aesthetics purposes

app.py CHANGED
@@ -26,7 +26,7 @@ def make_default_md(arena_df, elo_results):
 | [Vote](https://chat.lmsys.org) | [Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2306.05685) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) |
 
 LMSYS [Chatbot Arena](https://lmsys.org/blog/2023-05-03-arena/) is a crowdsourced open platform for LLM evals.
-We've collected over **500,000** human preference votes to rank LLMs with the Elo ranking system.
+We've collected over **500,000** human preference votes to rank LLMs with the Elo ranking system. Contribute your vote 🗳️ at [chat.lmsys.org](https://chat.lmsys.org)!
 """
     return leaderboard_md
 
@@ -44,34 +44,10 @@ def make_arena_leaderboard_md(arena_df, arena_subset_df=None, name="Overall"):
     leaderboard_md = f"""
 Total #models: **{total_models}**.{space} Total #votes: **{"{:,}".format(total_votes)}**.{vote_str}{space} Last updated: March 29, 2024.
 
-Contribute your vote 🗳️ at [chat.lmsys.org](https://chat.lmsys.org)! You can find code to recreate these tables and plots in this [notebook]({notebook_url}).
-
 **NEW!** Click the buttons below to view the ELO leaderboard and stats for different input categories. You are currently viewing **{name}** inputs.
 """
     return leaderboard_md
 
-# def make_arena_leaderboard_md(arena_df, arena_chinese_df, arena_long_df, arena_english_df):
-#     # Calculate totals for each arena
-#     total_votes = sum(arena_df["num_battles"]) // 2
-#     total_chinese_votes = sum(arena_chinese_df["num_battles"]) // 2
-#     total_long_votes = sum(arena_long_df["num_battles"]) // 2
-#     total_english_votes = sum(arena_english_df["num_battles"]) // 2
-
-#     # Constructing the markdown table
-#     leaderboard_md = f"""
-# Last updated: March 29, 2024.
-# | | **Total** | English | Chinese | Long Context |
-# | :-------------- | :-----------------------: | :-----------------------: | :-----------------------: | :-----------------------: |
-# | # Votes | **{"{:,}".format(total_votes)}** | {"{:,}".format(total_english_votes)} | {"{:,}".format(total_chinese_votes)} | {"{:,}".format(total_long_votes)} |
-# | # Models | **{len(arena_df)}** | {len(arena_english_df)}| {len(arena_chinese_df)} | {len(arena_long_df)} |
-
-# Contribute your vote 🗳️ at [chat.lmsys.org](https://chat.lmsys.org)! You can find code to recreate these tables and plots in this [notebook]({notebook_url}).
-# """
-
-#     return leaderboard_md
-
-
-
 def make_full_leaderboard_md(elo_results):
     leaderboard_md = f"""
 Three benchmarks are displayed: **Arena Elo**, **MT-Bench** and **MMLU**.
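An editorial note on the deleted helper above: it derives totals with `sum(arena_df["num_battles"]) // 2`, halving because every battle increments `num_battles` for both participating models. A minimal sketch with invented data (the model names and counts are hypothetical) to make the convention concrete:

```python
# Each battle is logged once per participant, so summing the per-model
# "num_battles" column counts every battle twice; halve it to get votes.
import pandas as pd

arena_df = pd.DataFrame(
    {"model": ["model-a", "model-b", "model-c"], "num_battles": [4, 3, 3]}
)
total_votes = sum(arena_df["num_battles"]) // 2  # (4 + 3 + 3) // 2 = 5
total_models = len(arena_df)
print(f"Total #models: **{total_models}**. Total #votes: **{total_votes:,}**.")
```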
@@ -301,8 +277,7 @@ def update_leaderboard_and_plots(button, arena_df, model_table_df, arena_subset_
     p2 = elo_subset_results["battle_count_heatmap"]
     p3 = elo_subset_results["bootstrap_elo_rating"]
     p4 = elo_subset_results["average_win_rate_bar"]
-    more_stats_md = f"""## More Statistics for Chatbot Arena ({button})
-You can find more discussions in this blog [post](https://lmsys.org/blog/2023-12-07-leaderboard/).
+    more_stats_md = f"""## More Statistics for Chatbot Arena ({button})
 """
     leaderboard_md = make_arena_leaderboard_md(arena_df, arena_subset_df, name=button)
     return arena_values, p1, p2, p3, p4, more_stats_md, leaderboard_md
@@ -383,11 +358,6 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=Fa
                 column_widths=[70, 190, 110, 100, 90, 160, 150, 140],
                 wrap=True,
             )
-            # Setup the button click action
-            # overall_rating.click(fn=update_overall_rating_df, inputs=overall_rating, outputs=elo_display_df)
-            # english_rating.click(fn=update_english_rating_df, inputs=english_rating, outputs=elo_display_df)
-            # chinese_rating.click(fn=update_chinese_rating_df, inputs=chinese_rating ,outputs=elo_display_df)
-            # long_context_rating.click(fn=update_long_context_rating_df, inputs=long_context_rating, outputs=elo_display_df)
 
         with gr.Tab("Full Leaderboard", id=1):
             md = make_full_leaderboard_md(elo_results)
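For context on the wiring these deleted comments describe: in Gradio, `component.click(fn=..., inputs=..., outputs=...)` calls `fn` with the values of `inputs` and writes its return value into `outputs`. A self-contained sketch of that pattern, assuming hypothetical component and handler names (the app's `update_*_rating_df` helpers do not appear in this diff):

```python
import gradio as gr
import pandas as pd

def update_rating_df(category):
    # Stand-in for the app's update_*_rating_df helpers: return the
    # dataframe that should replace the currently displayed table.
    return pd.DataFrame({"Model": ["model-a", "model-b"], "Rating": [1200, 1150]})

with gr.Blocks() as demo:
    overall_rating = gr.Button("Overall")
    elo_display_df = gr.Dataframe()
    # Passing the button itself as `inputs` hands its label string to fn,
    # mirroring the commented-out lines above.
    overall_rating.click(fn=update_rating_df, inputs=overall_rating, outputs=elo_display_df)
```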
@@ -422,7 +392,7 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=Fa
         gr.Markdown(
             f"""Note: we take the 95% confidence interval into account when determining a model's ranking.
 A model is ranked higher only if its lower bound of model score is higher than the upper bound of the other model's score.
-See Figure 3 below for visualization of the confidence intervals.
+See Figure 3 below for a visualization of the confidence intervals. You can find code to recreate these tables and plots in this [notebook]({notebook_url}), and more discussion in this blog [post](https://lmsys.org/blog/2023-12-07-leaderboard/).
 """,
             elem_id="leaderboard_markdown"
         )
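The note above states the ranking rule exactly; here is a small sketch of it, assuming each model carries bootstrap (lower, upper) rating bounds. The numbers are invented:

```python
# Rule from the note: model A outranks model B only when A's lower bound
# exceeds B's upper bound; models with overlapping intervals share a rank.
intervals = {
    "model-a": (1250, 1280),  # (lower, upper)
    "model-b": (1240, 1270),  # overlaps model-a, so same rank
    "model-c": (1180, 1210),  # decisively below both
}

def rank(intervals):
    # A model's rank is 1 plus the number of models whose lower bound
    # exceeds its upper bound (models that beat it decisively).
    return {
        model: 1 + sum(lo > upper for lo, _ in intervals.values())
        for model, (_, upper) in intervals.items()
    }

print(rank(intervals))  # {'model-a': 1, 'model-b': 1, 'model-c': 3}
```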
@@ -430,32 +400,31 @@ See Figure 3 below for visualization of the confidence intervals.
     leader_component_values[:] = [default_md, p1, p2, p3, p4]
 
     if show_plot:
-        more_stats_md = gr.Markdown(
-            f"""## More Statistics for Chatbot Arena (Overall)""",
-            elem_id="leaderboard_markdown"
-        )
+        # more_stats_md = gr.Markdown(
+        #     f"""## More Statistics for Chatbot Arena (Overall)""",
+        #     elem_id="leaderboard_markdown"
+        # )
+        more_stats_md = gr.Button("More Statistics for Chatbot Arena (Overall)", elem_id="non-interactive-button")
        with gr.Row():
             with gr.Column():
                 gr.Markdown(
-                    "#### Figure 1: Fraction of Model A Wins for All Non-tied A vs. B Battles"
+                    "#### Figure 1: Fraction of Model A Wins for All Non-tied A vs. B Battles", elem_id="plot-title", variant="panel"
                 )
-                plot_1 = gr.Plot(p1, show_label=False)
+                plot_1 = gr.Plot(p1, show_label=False, elem_id="plot-container")
             with gr.Column():
                 gr.Markdown(
-                    "#### Figure 2: Battle Count for Each Combination of Models (without Ties)"
+                    "#### Figure 2: Battle Count for Each Combination of Models (without Ties)", elem_id="plot-title"
                 )
                 plot_2 = gr.Plot(p2, show_label=False)
         with gr.Row():
             with gr.Column():
                 gr.Markdown(
-                    "#### Figure 3: Confidence Intervals on Model Strength (via Bootstrapping)"
+                    "#### Figure 3: Confidence Intervals on Model Strength (via Bootstrapping)", elem_id="plot-title"
                 )
                 plot_3 = gr.Plot(p3, show_label=False)
             with gr.Column():
                 gr.Markdown(
-                    "#### Figure 4: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)"
+                    "#### Figure 4: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)", elem_id="plot-title"
                 )
                 plot_4 = gr.Plot(p4, show_label=False)
 
@@ -494,21 +463,23 @@ block_css = """
     line-height: 0.1em;
 }
 
-#
-
-
-}
-#arena_leaderboard_dataframe th {
-    font-size: 20px;
+#plot-title {
+    text-align: center;
+    display:block;
 }
 
-
-
-
-
-
-
-
+#non-interactive-button {
+    display: inline-block;
+    padding: 10px 10px;
+    background-color: #f7f7f7; /* Super light grey background */
+    color: #000000; /* Black text */
+    text-align: center;
+    font-size: 26px; /* Larger text */
+    border-radius: 0; /* Straight edges, no border radius */
+    border: 0px solid #dcdcdc; /* A light grey border to match the background */
+    font-weight: bold;
+    user-select: none; /* The text inside the button is not selectable */
+    pointer-events: none; /* The button is non-interactive */
 }
 
 footer {
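The two added rules pair with the `elem_id` values introduced in the Python changes above: Gradio assigns `elem_id` as the component's id in the HTML DOM, so selectors in the `css=` string can target it. A minimal standalone sketch of the mechanism:

```python
import gradio as gr

# The "#..." selectors match components created with the same elem_id.
css = """
#plot-title { text-align: center; display: block; }
#non-interactive-button { pointer-events: none; font-weight: bold; }
"""

with gr.Blocks(css=css) as demo:
    gr.Markdown("#### Figure 1: An Example Title", elem_id="plot-title")
    gr.Button("More Statistics", elem_id="non-interactive-button")
```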
@@ -540,11 +511,12 @@ We thank [Kaggle](https://www.kaggle.com/), [MBZUAI](https://mbzuai.ac.ae/), [a1
 
 def build_demo(elo_results_file, leaderboard_table_file):
     text_size = gr.themes.sizes.text_lg
-
+    theme = gr.themes.Base(text_size=text_size)
+    theme.set(button_secondary_background_fill_hover="*primary_300", button_secondary_background_fill_hover_dark="*primary_700")
     with gr.Blocks(
         title="Chatbot Arena Leaderboard",
-
-        theme = gr.themes.Base.load("theme.json"),
+        theme=theme,
+        # theme = gr.themes.Base.load("theme.json"),
         css=block_css,
     ) as demo:
         leader_components = build_leaderboard_tab(
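The theming change swaps a saved `theme.json` for a theme built in code: `gr.themes.Base(...)` sets core attributes such as `text_size`, and `theme.set(...)` overrides individual theme variables, where values like `*primary_300` reference the theme's own palette. A minimal sketch reusing the same calls as the commit:

```python
import gradio as gr

theme = gr.themes.Base(text_size=gr.themes.sizes.text_lg)
theme.set(
    button_secondary_background_fill_hover="*primary_300",
    button_secondary_background_fill_hover_dark="*primary_700",
)

with gr.Blocks(theme=theme) as demo:
    gr.Button("Hover me")  # default buttons use the secondary variant
```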