File size: 2,224 Bytes
f067bfb
 
 
 
 
5faacb0
 
 
 
 
 
 
 
 
f067bfb
 
 
 
 
 
5faacb0
f067bfb
 
 
 
 
cae7a54
 
f067bfb
3c85a39
f067bfb
 
90fafdc
 
 
 
 
 
 
13a280b
90fafdc
 
 
 
 
f067bfb
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# Page heading rendered at the top of the Space (raw HTML passed to the UI).
TITLE = (
    '<h1 align="center" id="space-title">'
    "Open Multilingual LLM Evaluation Leaderboard"
    "</h1>"
)

# "About" section shown on the leaderboard page. Plain string: there are no
# placeholders, so the original f-prefix was unnecessary.
# NOTE: the language count below must match the enumerated list (31 names).
INTRO_TEXT = """
## About

This leaderboard tracks progress and ranks performance of large language models (LLMs) developed for different languages, 
emphasizing non-English languages to democratize benefits of LLMs to broader society. 
Our current leaderboard provides evaluation data for 31 languages, i.e., 
Arabic, Armenian, Basque, Bengali, Catalan, Chinese, Croatian, Danish, Dutch, 
French, German, Gujarati, Hindi, Hungarian, Indonesian, Italian, Kannada, Malayalam, 
Marathi, Nepali, Portuguese, Romanian, Russian, Serbian, Slovak, Spanish, Swedish, 
Tamil, Telugu, Ukrainian, and Vietnamese, that will be expanded along the way. 
Both multilingual and language-specific LLMs are welcome in this leaderboard.  
We currently evaluate models over four benchmarks:

- <a href="https://arxiv.org/abs/1803.05457" target="_blank">  AI2 Reasoning Challenge </a> (25-shot) 
- <a href="https://arxiv.org/abs/1905.07830" target="_blank">  HellaSwag </a> (10-shot) 
- <a href="https://arxiv.org/abs/2009.03300" target="_blank">  MMLU </a>  (5-shot) 
- <a href="https://arxiv.org/abs/2109.07958" target="_blank">  TruthfulQA </a> (0-shot)

The evaluation data was translated into these languages using ChatGPT (gpt-35-turbo).

"""

# Submission instructions shown on the leaderboard page. The original contained
# a stray, unclosed "<a" fragment on its own line that rendered as broken HTML;
# it has been removed. Plain string: no placeholders, so no f-prefix needed.
HOW_TO = """
## How to list your model performance on this leaderboard:

Run the evaluation of your model using this repo: <a href="https://github.com/laiviet/lm-evaluation-harness" target="_blank">https://github.com/laiviet/lm-evaluation-harness</a>.

And then, push the evaluation log and make a pull request.
"""

# Acknowledgement section shown on the leaderboard page. A plain triple-quoted
# string: the text contains no placeholders, so no f-prefix is needed.
CREDIT = """
## Credit

To make this website, we use the following resources:

- Datasets (AI2_ARC, HellaSwag, MMLU, TruthfulQA)
- Funding and GPU access (Adobe Research)
- Evaluation code (EleutherAI's lm_evaluation_harness repo)
- Leaderboard code (Huggingface4's open_llm_leaderboard repo)

"""


# BibTeX entry users can copy to cite the leaderboard. Written as a plain
# string so the braces need no doubling (the original used an f-string purely
# to escape "{{"/"}}", which produced these exact single braces anyway).
CITATION = """
## Citation

```

@misc{lai2023openllmbenchmark,
    author = {Viet Lai and Nghia Trung Ngo and Amir Pouran Ben Veyseh and Franck Dernoncourt and Thien Huu Nguyen},
    title={Open Multilingual LLM Evaluation Leaderboard},
    year={2023}
}
```
"""