lmzheng committed on
Commit
8bedda3
•
1 Parent(s): 2dbec06

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +79 -0
  2. elo_results_20230508.pkl +3 -0
app.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """A gradio app that renders a static leaderboard. This is used for Hugging Face Space."""
2
+ import pickle
3
+ import gradio as gr
4
+
5
+
6
# Colab notebook that the leaderboard text links to as "[notebook]".
notebook_url = (
    "https://colab.research.google.com/drive/"
    "1iI_IszGAwSMkdfUrIDI6NfTG7tGDDRxZ?usp=sharing"
)
7
+
8
+
9
def make_leaderboard_md(elo_results):
    """Render the leaderboard header and table as a Markdown string.

    Args:
        elo_results: Mapping that provides the "last_updated_datetime" and
            "leaderboard_table" entries interpolated into the page.

    Returns:
        The Markdown text: title, project links, a short description that
        links to the module-level ``notebook_url``, the last-updated stamp
        and the leaderboard table.
    """
    last_updated = elo_results["last_updated_datetime"]
    table = elo_results["leaderboard_table"]

    # The template lines sit at column 0 because they are part of the string.
    return f"""
# Leaderboard
[[Blog](https://lmsys.org/blog/2023-05-03-arena/)] [[Vote](https://arena.lmsys.org/)] [[Github]](https://github.com/lm-sys/FastChat) [[Twitter]](https://twitter.com/lmsysorg) [[Discord]](https://discord.gg/h6kCZb72G7)

We use the Elo rating system to calculate the relative performance of the models. You can view the voting data, basic analyses, and calculation procedure in this [notebook]({notebook_url}). We will periodically release new leaderboards. If you want to see more models, please help us [add them](https://github.com/lm-sys/FastChat/blob/main/docs/arena.md#how-to-add-a-new-model).
Last updated: {last_updated}
{table}
"""
19
+
20
+
21
def build_leaderboard_tab(elo_results_file):
    """Create the leaderboard components inside the active Gradio context.

    Args:
        elo_results_file: Path to a pickled results dictionary, or None to
            render a "Loading ..." placeholder with empty plots.

    Returns:
        A list ``[markdown, plot_1, plot_2, plot_3, plot_4]``: the leaderboard
        Markdown component followed by the four plot components.
    """
    if elo_results_file is None:
        # No results available: placeholder text and empty plots.
        leaderboard_text = "Loading ..."
        figures = [None, None, None, None]
    else:
        # NOTE(review): unpickling can execute arbitrary code, so only
        # trusted result files should be passed here.
        with open(elo_results_file, "rb") as results_stream:
            elo_results = pickle.load(results_stream)

        leaderboard_text = make_leaderboard_md(elo_results)
        figures = [
            elo_results["win_fraction_heatmap"],
            elo_results["battle_count_heatmap"],
            elo_results["average_win_rate_bar"],
            elo_results["bootstrap_elo_rating"],
        ]

    leaderboard = gr.Markdown(leaderboard_text)
    gr.Markdown(
        f"""## More Statistics\n
Here, we have added some additional figures to show more statistics. The code for generating them is also included in this [notebook]({notebook_url}).
Please note that you may see different orders from different ranking methods. This is expected for models that perform similarly, as demonstrated by the confidence interval in the bootstrap figure. Going forward, we prefer the classical Elo calculation because of its scalability and interpretability. You can find more discussions in this blog [post](https://lmsys.org/blog/2023-05-03-arena/).
"""
    )

    captions = (
        "#### Figure 1: Fraction of Model A Wins for All Non-tied A vs. B Battles",
        "#### Figure 2: Battle Count for Each Combination of Models (without Ties)",
        "#### Figure 3: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)",
        "#### Figure 4: Bootstrap of Elo Estimates (1000 Rounds of Random Sampling)",
    )
    panels = list(zip(captions, figures))

    # Lay the figures out as a 2x2 grid: two rows with two captioned plots each.
    plots = []
    for first in (0, 2):
        with gr.Row():
            for caption, figure in panels[first:first + 2]:
                with gr.Column():
                    gr.Markdown(caption)
                    plots.append(gr.Plot(figure, show_label=False))

    return [leaderboard, *plots]
66
+
67
+
68
def build_demo(elo_results_file):
    """Build the Gradio Blocks app that hosts the leaderboard.

    Args:
        elo_results_file: Path to the pickled Elo results, forwarded
            unchanged to ``build_leaderboard_tab``.

    Returns:
        The ``gr.Blocks`` instance. It is not launched here; the caller
        decides when and how to serve it.
    """
    with gr.Blocks(
        title="Chatbot Arena Leaderboard",
        theme=gr.themes.Base(),
    ) as demo:
        # Components created inside the ``with`` block attach to ``demo``,
        # so the returned component list does not need to be kept.
        build_leaderboard_tab(elo_results_file)

    return demo
76
+
77
+
78
# Build the app from the Elo results snapshot uploaded alongside this script,
# then start serving it with a share link requested.
demo = build_demo(elo_results_file="elo_results_20230508.pkl")
demo.launch(share=True)
elo_results_20230508.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3a83502895fc5a19511448c058f8e48edc97c020025b83af3b626228f9b1295
3
+ size 26516