Fix intro and sort order
Browse files- app.py +3 -3
- content.py +11 -3
app.py
CHANGED
@@ -99,12 +99,12 @@ def get_leaderboard_df(performance_dict, pretrained_models):
|
|
99 |
if arc_perf * hellaswag_perf * mmlu_perf * truthfulqa_perf == 0:
|
100 |
continue
|
101 |
avg = round((arc_perf + hellaswag_perf + mmlu_perf + truthfulqa_perf) / 4, 1)
|
102 |
-
notes = ' '.join([pretrained, lang_name
|
103 |
row = [pretrained, lang_name, lang, avg, arc_perf, hellaswag_perf, mmlu_perf, truthfulqa_perf, notes]
|
104 |
df.append(row)
|
105 |
|
106 |
df = pd.DataFrame.from_records(df, columns=COLS)
|
107 |
-
df = df.sort_values(by=[AVERAGE_COL], ascending=False)
|
108 |
df = df[COLS]
|
109 |
|
110 |
return df
|
@@ -140,7 +140,7 @@ with demo:
|
|
140 |
|
141 |
with gr.Box():
|
142 |
search_bar = gr.Textbox(
|
143 |
-
placeholder="Search models...", show_label=False, elem_id="search-bar"
|
144 |
)
|
145 |
|
146 |
leaderboard_table = gr.components.Dataframe(
|
|
|
99 |
if arc_perf * hellaswag_perf * mmlu_perf * truthfulqa_perf == 0:
|
100 |
continue
|
101 |
avg = round((arc_perf + hellaswag_perf + mmlu_perf + truthfulqa_perf) / 4, 1)
|
102 |
+
notes = ' '.join([pretrained, lang_name])
|
103 |
row = [pretrained, lang_name, lang, avg, arc_perf, hellaswag_perf, mmlu_perf, truthfulqa_perf, notes]
|
104 |
df.append(row)
|
105 |
|
106 |
df = pd.DataFrame.from_records(df, columns=COLS)
|
107 |
+
df = df.sort_values(by=[LANG_COL, AVERAGE_COL], ascending=False)
|
108 |
df = df[COLS]
|
109 |
|
110 |
return df
|
|
|
140 |
|
141 |
with gr.Box():
|
142 |
search_bar = gr.Textbox(
|
143 |
+
placeholder="Search models and languages...", show_label=False, elem_id="search-bar"
|
144 |
)
|
145 |
|
146 |
leaderboard_table = gr.components.Dataframe(
|
content.py
CHANGED
@@ -3,21 +3,29 @@ TITLE = '<h1 align="center" id="space-title">Open Multilingual LLM Evaluation Le
|
|
3 |
INTRO_TEXT = f"""
|
4 |
## About
|
5 |
|
6 |
-
This leaderboard
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
- <a href="https://arxiv.org/abs/1803.05457" target="_blank"> AI2 Reasoning Challenge </a> (25-shot)
|
9 |
- <a href="https://arxiv.org/abs/1905.07830" target="_blank"> HellaSwag </a> (10-shot)
|
10 |
- <a href="https://arxiv.org/abs/2009.03300" target="_blank"> MMLU </a> (5-shot)
|
11 |
- <a href="https://arxiv.org/abs/2109.07958" target="_blank"> TruthfulQA </a> (0-shot)
|
12 |
|
13 |
-
The evaluation data was translated into
|
14 |
|
15 |
"""
|
16 |
|
17 |
HOW_TO = f"""
|
18 |
## How to list your model performance on this leaderboard:
|
19 |
|
20 |
-
Send an email with title [Open mLLM Leaderboard] to [email protected] with the
|
21 |
|
22 |
We will run your model on the four benchmarks and add it to the leaderboard.
|
23 |
"""
|
|
|
3 |
INTRO_TEXT = f"""
|
4 |
## About
|
5 |
|
6 |
+
This leaderboard tracks progress and ranks performance of large language models (LLMs) developed for different languages,
|
7 |
+
emphasizing non-English languages to democratize the benefits of LLMs for broader society.
|
8 |
+
Our current leaderboard provides evaluation data for 29 languages, i.e.,
|
9 |
+
Arabic, Armenian, Basque, Bengali, Catalan, Chinese, Croatian, Danish, Dutch,
|
10 |
+
French, German, Gujarati, Hindi, Hungarian, Indonesian, Italian, Kannada, Malayalam,
|
11 |
+
Marathi, Nepali, Portuguese, Romanian, Russian, Serbian, Slovak, Spanish, Swedish,
|
12 |
+
Tamil, Telugu, Ukrainian, and Vietnamese, that will be expanded along the way.
|
13 |
+
Both multilingual and language-specific LLMs are welcome in this leaderboard.
|
14 |
+
We currently evaluate models over four benchmarks:
|
15 |
|
16 |
- <a href="https://arxiv.org/abs/1803.05457" target="_blank"> AI2 Reasoning Challenge </a> (25-shot)
|
17 |
- <a href="https://arxiv.org/abs/1905.07830" target="_blank"> HellaSwag </a> (10-shot)
|
18 |
- <a href="https://arxiv.org/abs/2009.03300" target="_blank"> MMLU </a> (5-shot)
|
19 |
- <a href="https://arxiv.org/abs/2109.07958" target="_blank"> TruthfulQA </a> (0-shot)
|
20 |
|
21 |
+
The evaluation data was translated into these languages using ChatGPT (gpt-35-turbo).
|
22 |
|
23 |
"""
|
24 |
|
25 |
HOW_TO = f"""
|
26 |
## How to list your model performance on this leaderboard:
|
27 |
|
28 |
+
Send an email with title [Open mLLM Leaderboard] to [email protected] with the Hugging Face model name.
|
29 |
|
30 |
We will run your model on the four benchmarks and add it to the leaderboard.
|
31 |
"""
|