update
app.py
CHANGED
@@ -26,7 +26,7 @@ def make_default_md(arena_df, elo_results):
 | [Vote](https://chat.lmsys.org) | [Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2306.05685) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) |
 
 LMSYS [Chatbot Arena](https://lmsys.org/blog/2023-05-03-arena/) is a crowdsourced open platform for LLM evals.
-We've collected over **
+We've collected over **500,000** human preference votes to rank LLMs with the Elo ranking system.
 """
     return leaderboard_md
 
@@ -36,7 +36,7 @@ def make_arena_leaderboard_md(arena_df):
     total_models = len(arena_df)
 
     leaderboard_md = f"""
-Total #models: **{total_models}**. Total #votes: **{total_votes}**. Last updated: March
+Total #models: **{total_models}**. Total #votes: **{total_votes}**. Last updated: March 29, 2024.
 
 Contribute your vote 🗳️ at [chat.lmsys.org](https://chat.lmsys.org)! Find more analysis in the [notebook]({notebook_url}).
 """
@@ -46,7 +46,7 @@ Contribute your vote 🗳️ at [chat.lmsys.org](https://chat.lmsys.org)! Find m
 def make_full_leaderboard_md(elo_results):
     leaderboard_md = f"""
 Three benchmarks are displayed: **Arena Elo**, **MT-Bench** and **MMLU**.
-- [Chatbot Arena](https://chat.lmsys.org/?arena) - a crowdsourced, randomized battle platform. We use
+- [Chatbot Arena](https://chat.lmsys.org/?arena) - a crowdsourced, randomized battle platform. We use 500K+ user votes to compute Elo ratings.
 - [MT-Bench](https://arxiv.org/abs/2306.05685): a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.
 - [MMLU](https://arxiv.org/abs/2009.03300) (5-shot): a test to measure a model's multitask accuracy on 57 tasks.
 
@@ -210,7 +210,6 @@ def get_arena_table(arena_df, model_table_df):
     for i in range(len(arena_df)):
         row = []
         model_key = arena_df.index[i]
-        print(model_key)
         model_name = model_table_df[model_table_df["key"] == model_key]["Model"].values[
             0
         ]
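For context on the "We use 500K+ user votes to compute Elo ratings" line added above: below is a minimal sketch of how pairwise battle votes can be folded into Elo-style ratings. The function name, K-factor, and battle tuples are illustrative assumptions, not the code behind the live leaderboard, whose actual rating computation lives in the linked notebook.

# Minimal sketch: online Elo updates from pairwise battle votes.
# Illustrative only; compute_elo, k=4, and the sample battles are assumptions,
# not the FastChat / Chatbot Arena pipeline.
from collections import defaultdict

def compute_elo(battles, k=4, scale=400, base=10, init_rating=1000):
    """battles: iterable of (model_a, model_b, winner) tuples,
    where winner is "model_a", "model_b", or "tie"."""
    ratings = defaultdict(lambda: init_rating)
    for model_a, model_b, winner in battles:
        ra, rb = ratings[model_a], ratings[model_b]
        # Expected score of model_a under the Elo model.
        ea = 1 / (1 + base ** ((rb - ra) / scale))
        sa = {"model_a": 1.0, "model_b": 0.0, "tie": 0.5}[winner]
        ratings[model_a] += k * (sa - ea)
        ratings[model_b] += k * ((1 - sa) - (1 - ea))
    return dict(ratings)

# Example usage with made-up battles:
battles = [
    ("gpt-4", "vicuna-13b", "model_a"),
    ("vicuna-13b", "llama-2-13b-chat", "model_a"),
    ("gpt-4", "llama-2-13b-chat", "tie"),
]
print(compute_elo(battles))

Note that an online update like this is order-sensitive; the published leaderboard may instead fit a rating model over all votes at once.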