Update app.py
Browse files
app.py
CHANGED
@@ -24,7 +24,7 @@ def make_leaderboard_md(elo_results):
|
|
24 |
- [MT-Bench](https://arxiv.org/abs/2306.05685) - a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.
|
25 |
- [MMLU](https://arxiv.org/abs/2009.03300) (5-shot) - a test to measure a model's multitask accuracy on 57 tasks.
|
26 |
|
27 |
-
💻 We use [fastchat.llm_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge) to compute MT-bench scores (single-answer grading on a scale of 10)
|
28 |
"""
|
29 |
return leaderboard_md
|
30 |
|
@@ -173,7 +173,6 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file):
|
|
173 |
"Model",
|
174 |
"Arena Elo rating",
|
175 |
"MT-bench (score)",
|
176 |
-
"MT-bench (win rate %)",
|
177 |
"MMLU",
|
178 |
"License",
|
179 |
]
|
@@ -191,7 +190,7 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file):
|
|
191 |
|
192 |
gr.Dataframe(
|
193 |
headers=headers,
|
194 |
-
datatype=["markdown", "number", "number", "number", "number", "str"],
|
195 |
value=values,
|
196 |
elem_id="leaderboard_dataframe",
|
197 |
)
|
|
|
24 |
- [MT-Bench](https://arxiv.org/abs/2306.05685) - a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.
|
25 |
- [MMLU](https://arxiv.org/abs/2009.03300) (5-shot) - a test to measure a model's multitask accuracy on 57 tasks.
|
26 |
|
27 |
+
💻 We use [fastchat.llm_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge) to compute MT-bench scores (single-answer grading on a scale of 10). The Arena Elo ratings are computed by this [notebook]({notebook_url}). The MMLU scores are computed by [InstructEval](https://github.com/declare-lab/instruct-eval) and [Chain-of-Thought Hub](https://github.com/FranxYao/chain-of-thought-hub). Higher values are better for all benchmarks. Empty cells mean not available.
|
28 |
"""
|
29 |
return leaderboard_md
|
30 |
|
|
|
173 |
"Model",
|
174 |
"Arena Elo rating",
|
175 |
"MT-bench (score)",
|
|
|
176 |
"MMLU",
|
177 |
"License",
|
178 |
]
|
|
|
190 |
|
191 |
gr.Dataframe(
|
192 |
headers=headers,
|
193 |
+
datatype=["markdown", "number", "number", "number", "str"],
|
194 |
value=values,
|
195 |
elem_id="leaderboard_dataframe",
|
196 |
)
|