Update app.py

Muennighoff committed • commit d2198dc • parent 1e84aac

app.py CHANGED
@@ -336,6 +336,7 @@ EXTERNAL_MODELS = [
     "text2vec-large-chinese",
     "text-embedding-3-small",
     "text-embedding-3-large",
+    "text-embedding-3-large-256",
     "text-embedding-ada-002",
     "text-similarity-ada-001",
     "text-similarity-babbage-001",
@@ -418,6 +419,7 @@ EXTERNAL_MODEL_TO_LINK = {
     "text2vec-large-chinese": "https://huggingface.co/GanymedeNil/text2vec-large-chinese",
     "text-embedding-3-small": "https://openai.com/blog/new-embedding-models-and-api-updates",
     "text-embedding-3-large": "https://openai.com/blog/new-embedding-models-and-api-updates",
+    "text-embedding-3-large-256": "https://openai.com/blog/new-embedding-models-and-api-updates",
    "text-embedding-ada-002": "https://openai.com/blog/new-and-improved-embedding-model",
     "text-similarity-ada-001": "https://openai.com/blog/introducing-text-and-code-embeddings",
     "text-similarity-babbage-001": "https://openai.com/blog/introducing-text-and-code-embeddings",
@@ -499,7 +501,8 @@ EXTERNAL_MODEL_TO_DIM = {
     "text2vec-base-chinese": 768,
     "text2vec-large-chinese": 1024,
     "text-embedding-3-large": 3072,
-    "text-embedding-3-small": 1536,
+    "text-embedding-3-large-256": 256,
+    "text-embedding-3-small": 1536,
     "text-embedding-ada-002": 1536,
     "text-similarity-ada-001": 1024,
     "text-similarity-babbage-001": 2048,
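The new 256-dimension entry relies on OpenAI's v3 embedding endpoint shortening vectors server-side via the `dimensions` parameter. A minimal sketch of how such scores could be produced, assuming the official `openai` Python client (v1+) and an `OPENAI_API_KEY` in the environment; this is not code from the Space itself:

```python
# Sketch only: request a 256-dim embedding from text-embedding-3-large.
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment
resp = client.embeddings.create(
    model="text-embedding-3-large",
    input=["Massive Text Embedding Benchmark"],
    dimensions=256,  # server-side Matryoshka-style shortening
)
assert len(resp.data[0].embedding) == 256
```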
@@ -581,6 +584,7 @@ EXTERNAL_MODEL_TO_SEQLEN = {
     "text2vec-base-chinese": 512,
     "text2vec-large-chinese": 512,
     "text-embedding-3-large": 8191,
+    "text-embedding-3-large-256": 8191,
     "text-embedding-3-small": 8191,
     "text-embedding-ada-002": 8191,
     "text-similarity-ada-001": 2046,
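All three v3 entries share the 8191-token context limit, so inputs must be chunked or truncated before embedding. A hedged client-side check, assuming `tiktoken` is installed and using the `cl100k_base` encoding these models use:

```python
# Sketch: verify an input fits the 8191-token limit before calling the API.
import tiktoken

enc = tiktoken.get_encoding("cl100k_base")
tokens = enc.encode("some long document ...")
assert len(tokens) <= 8191, "chunk or truncate before embedding"
```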
@@ -882,7 +886,7 @@ def make_datasets_clickable(df):
     return df
 
 def add_rank(df):
-    cols_to_rank = [col for col in df.columns if col not in ["Model", "Model Size (GB)", "Embedding Dimensions", "Sequence Length"]]
+    cols_to_rank = [col for col in df.columns if col not in ["Model", "Model Size (GB)", "Embedding Dimensions", "Max Tokens"]]
     if len(cols_to_rank) == 1:
         df.sort_values(cols_to_rank[0], ascending=False, inplace=True)
     else:
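Only the first line of `add_rank` appears in this hunk: the excluded-column list now names "Max Tokens". The overall shape of the function, sort on the score columns and then prepend a 1-based `Rank`, would look roughly like this hypothetical reconstruction (not the Space's exact code):

```python
# Hypothetical sketch of add_rank's shape; only the cols_to_rank line is
# visible in this diff, the rest is an assumption.
import pandas as pd

def add_rank_sketch(df: pd.DataFrame) -> pd.DataFrame:
    meta = ["Model", "Model Size (GB)", "Embedding Dimensions", "Max Tokens"]
    cols_to_rank = [col for col in df.columns if col not in meta]
    if len(cols_to_rank) == 1:
        df = df.sort_values(cols_to_rank[0], ascending=False)
    else:
        # Average the task columns, coercing "" placeholders to NaN first
        df["Average"] = df[cols_to_rank].apply(pd.to_numeric, errors="coerce").mean(axis=1)
        df = df.sort_values("Average", ascending=False)
    df.insert(0, "Rank", range(1, len(df) + 1))  # 1-based leaderboard rank
    return df.reset_index(drop=True)
```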
@@ -914,7 +918,7 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
         if add_emb_dim:
             res["Model Size (GB)"] = EXTERNAL_MODEL_TO_SIZE.get(model, "")
             res["Embedding Dimensions"] = EXTERNAL_MODEL_TO_DIM.get(model, "")
-            res["Sequence Length"] = EXTERNAL_MODEL_TO_SEQLEN.get(model, "")
+            res["Max Tokens"] = EXTERNAL_MODEL_TO_SEQLEN.get(model, "")
         df_list.append(res)
 
     for model in models:
@@ -953,7 +957,7 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
         if add_emb_dim:
             try:
                 # Fails on gated repos, so we only include scores for them
-                out["Embedding Dimensions"], out["Sequence Length"], out["Model Size (GB)"] = get_dim_seq_size(model)
+                out["Embedding Dimensions"], out["Max Tokens"], out["Model Size (GB)"] = get_dim_seq_size(model)
             except:
                 pass
         df_list.append(out)
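`get_dim_seq_size` is defined elsewhere in app.py; this call site suggests it returns (embedding width, context length, weight size) read from the Hub, and raises on gated repos. A rough guess at its behavior, assuming `huggingface_hub` and standard `config.json` keys; the real implementation may differ:

```python
# Hypothetical sketch of get_dim_seq_size, inferred from the call site above.
import json
from huggingface_hub import HfApi, hf_hub_download

def get_dim_seq_size_sketch(model_id: str):
    with open(hf_hub_download(model_id, filename="config.json")) as f:
        config = json.load(f)
    dim = config.get("hidden_size", "")
    seq = config.get("max_position_embeddings", "")
    info = HfApi().model_info(model_id, files_metadata=True)  # fails on gated repos
    n_bytes = sum(
        (s.size or 0)
        for s in info.siblings
        if s.rfilename.endswith((".bin", ".safetensors"))
    )
    return dim, seq, round(n_bytes / 1e9, 2)  # size in GB
```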
@@ -1030,7 +1034,7 @@ def get_mteb_average():
     # Fill NaN after averaging
     DATA_OVERALL.fillna("", inplace=True)
 
-    DATA_OVERALL = DATA_OVERALL[["Rank", "Model", "Model Size (GB)", "Embedding Dimensions", "Sequence Length", f"Average ({len(TASK_LIST_EN)} datasets)", f"Classification Average ({len(TASK_LIST_CLASSIFICATION)} datasets)", f"Clustering Average ({len(TASK_LIST_CLUSTERING)} datasets)", f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION)} datasets)", f"Reranking Average ({len(TASK_LIST_RERANKING)} datasets)", f"Retrieval Average ({len(TASK_LIST_RETRIEVAL)} datasets)", f"STS Average ({len(TASK_LIST_STS)} datasets)", f"Summarization Average ({len(TASK_LIST_SUMMARIZATION)} dataset)"]]
+    DATA_OVERALL = DATA_OVERALL[["Rank", "Model", "Model Size (GB)", "Embedding Dimensions", "Max Tokens", f"Average ({len(TASK_LIST_EN)} datasets)", f"Classification Average ({len(TASK_LIST_CLASSIFICATION)} datasets)", f"Clustering Average ({len(TASK_LIST_CLUSTERING)} datasets)", f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION)} datasets)", f"Reranking Average ({len(TASK_LIST_RERANKING)} datasets)", f"Retrieval Average ({len(TASK_LIST_RETRIEVAL)} datasets)", f"STS Average ({len(TASK_LIST_STS)} datasets)", f"Summarization Average ({len(TASK_LIST_SUMMARIZATION)} dataset)"]]
     DATA_OVERALL = DATA_OVERALL[DATA_OVERALL.iloc[:, 5:].ne("").any(axis=1)]
 
     return DATA_OVERALL
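The unchanged line after the column selection drops models whose score columns (everything past the five metadata columns) are all empty strings. In isolation, on a toy frame rather than leaderboard data:

```python
# Toy illustration of the row filter df.iloc[:, 5:].ne("").any(axis=1).
import pandas as pd

df = pd.DataFrame({
    "Rank": [1, 2], "Model": ["a", "b"], "Model Size (GB)": ["", ""],
    "Embedding Dimensions": ["", ""], "Max Tokens": ["", ""],
    "Average": [55.3, ""],  # model "b" has no scores at all
})
kept = df[df.iloc[:, 5:].ne("").any(axis=1)]
print(kept["Model"].tolist())  # ['a']
```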
@@ -1089,7 +1093,7 @@ def get_mteb_average_zh():
     # Fill NaN after averaging
     DATA_OVERALL_ZH.fillna("", inplace=True)
 
-    DATA_OVERALL_ZH = DATA_OVERALL_ZH[["Rank", "Model", "Model Size (GB)", "Embedding Dimensions", "Sequence Length", f"Average ({len(TASK_LIST_ZH)} datasets)", f"Classification Average ({len(TASK_LIST_CLASSIFICATION_ZH)} datasets)", f"Clustering Average ({len(TASK_LIST_CLUSTERING_ZH)} datasets)", f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION_ZH)} datasets)", f"Reranking Average ({len(TASK_LIST_RERANKING_ZH)} datasets)", f"Retrieval Average ({len(TASK_LIST_RETRIEVAL_ZH)} datasets)", f"STS Average ({len(TASK_LIST_STS_ZH)} datasets)"]]
+    DATA_OVERALL_ZH = DATA_OVERALL_ZH[["Rank", "Model", "Model Size (GB)", "Embedding Dimensions", "Max Tokens", f"Average ({len(TASK_LIST_ZH)} datasets)", f"Classification Average ({len(TASK_LIST_CLASSIFICATION_ZH)} datasets)", f"Clustering Average ({len(TASK_LIST_CLUSTERING_ZH)} datasets)", f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION_ZH)} datasets)", f"Reranking Average ({len(TASK_LIST_RERANKING_ZH)} datasets)", f"Retrieval Average ({len(TASK_LIST_RETRIEVAL_ZH)} datasets)", f"STS Average ({len(TASK_LIST_STS_ZH)} datasets)"]]
     DATA_OVERALL_ZH = DATA_OVERALL_ZH[DATA_OVERALL_ZH.iloc[:, 5:].ne("").any(axis=1)]
 
     return DATA_OVERALL_ZH
@@ -1143,7 +1147,7 @@ def get_mteb_average_pl():
     # Fill NaN after averaging
     DATA_OVERALL_PL.fillna("", inplace=True)
 
-    DATA_OVERALL_PL = DATA_OVERALL_PL[["Rank", "Model", "Model Size (GB)", "Embedding Dimensions", "Sequence Length", f"Average ({len(TASK_LIST_PL)} datasets)", f"Classification Average ({len(TASK_LIST_CLASSIFICATION_PL)} datasets)", f"Clustering Average ({len(TASK_LIST_CLUSTERING_PL)} datasets)", f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION_PL)} datasets)", f"Retrieval Average ({len(TASK_LIST_RETRIEVAL_PL)} datasets)", f"STS Average ({len(TASK_LIST_STS_PL)} datasets)"]]
+    DATA_OVERALL_PL = DATA_OVERALL_PL[["Rank", "Model", "Model Size (GB)", "Embedding Dimensions", "Max Tokens", f"Average ({len(TASK_LIST_PL)} datasets)", f"Classification Average ({len(TASK_LIST_CLASSIFICATION_PL)} datasets)", f"Clustering Average ({len(TASK_LIST_CLUSTERING_PL)} datasets)", f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION_PL)} datasets)", f"Retrieval Average ({len(TASK_LIST_RETRIEVAL_PL)} datasets)", f"STS Average ({len(TASK_LIST_STS_PL)} datasets)"]]
     DATA_OVERALL_PL = DATA_OVERALL_PL[DATA_OVERALL_PL.iloc[:, 5:].ne("").any(axis=1)]
 
     return DATA_OVERALL_PL
@@ -1215,7 +1219,7 @@ table > thead {
 }
 
 table {
-    --cell-width-1:
+    --cell-width-1: 210px
 }
 
 table > tbody > tr > td:nth-child(2) > div {
@@ -1227,11 +1231,6 @@ block = gr.Blocks(css=css)
 with block:
     gr.Markdown(f"""
     Massive Text Embedding Benchmark (MTEB) Leaderboard. To submit, refer to the <a href="https://github.com/embeddings-benchmark/mteb#leaderboard" target="_blank" style="text-decoration: underline">MTEB GitHub repository</a> 🤗 Refer to the [MTEB paper](https://arxiv.org/abs/2210.07316) for details on metrics, tasks and models.
-
-    - **Total Datasets**: {NUM_DATASETS}
-    - **Total Languages**: 113
-    - **Total Scores**: {NUM_SCORES}
-    - **Total Models**: {NUM_MODELS}
     """)
     with gr.Tabs():
         with gr.TabItem("Overall"):
@@ -1248,6 +1247,7 @@ with block:
                     DATA_OVERALL,
                     datatype=["number", "markdown"] + ["number"] * len(DATA_OVERALL.columns),
                     type="pandas",
+                    height=600,
                 )
             with gr.Row():
                 data_run_overall = gr.Button("Refresh")
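`height=600` pins each leaderboard table to a fixed pixel height so long tables scroll instead of stretching the page. `gr.Dataframe` accepts a pixel `height` in the Gradio 4.x line, an assumption about the version this Space pins. Standalone usage:

```python
# Minimal fixed-height Dataframe demo; assumes gradio 4.x.
import gradio as gr
import pandas as pd

df = pd.DataFrame({"Model": [f"model-{i}" for i in range(100)], "Score": range(100)})

with gr.Blocks() as demo:
    gr.Dataframe(df, type="pandas", height=600)  # scrolls past 600px

if __name__ == "__main__":
    demo.launch()
```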
@@ -1266,10 +1266,11 @@ with block:
                     DATA_OVERALL_ZH,
                     datatype=["number", "markdown"] + ["number"] * len(DATA_OVERALL_ZH.columns),
                     type="pandas",
+                    height=600,
                 )
             with gr.Row():
                 data_run_overall_zh = gr.Button("Refresh")
-            data_run_overall_zh.click(get_mteb_average_zh, inputs=None, outputs=data_overall_zh)
+                data_run_overall_zh.click(get_mteb_average_zh, inputs=None, outputs=data_overall_zh)
         with gr.TabItem("Polish"):
             with gr.Row():
                 gr.Markdown("""
@@ -1284,6 +1285,7 @@ with block:
                     DATA_OVERALL_PL,
                     datatype=["number", "markdown"] + ["number"] * len(DATA_OVERALL_PL.columns),
                     type="pandas",
+                    height=600,
                 )
             with gr.Row():
                 data_run_overall_pl = gr.Button("Refresh")
@@ -1834,8 +1836,12 @@ with block:
         partial(get_mteb_data, tasks=["Summarization"]),
         outputs=data_summarization,
     )
-    gr.Markdown(
-
+    gr.Markdown(f"""
+    - **Total Datasets**: {NUM_DATASETS}
+    - **Total Languages**: 113
+    - **Total Scores**: {NUM_SCORES}
+    - **Total Models**: {NUM_MODELS}
+    """ + r"""
     Made with ❤️ for NLP. If this work is useful to you, please consider citing:
 
     ```bibtex
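The relocated footer is built as an f-string concatenated with a raw string because the BibTeX block that follows is full of literal braces, which an f-string would treat as placeholders. A toy illustration of the failure mode being avoided (the truncated citation body is placeholder text):

```python
# Why the footer concatenates f"""...""" with r"""...""":
# braces in BibTeX would be parsed as f-string placeholders.
NUM_MODELS = 100
stats = f"- **Total Models**: {NUM_MODELS}\n"       # interpolation wanted here
bibtex = r"""@article{muennighoff2022mteb, ...}"""  # braces must stay literal
# f"""@article{muennighoff2022mteb, ...}""" would raise a SyntaxError
print(stats + bibtex)
```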