Muennighoff committed: Merge main

Files changed:
- config.yaml (+34, -11)
- refresh.py (+11, -4)
config.yaml CHANGED

@@ -20,7 +20,7 @@ tasks:
     task_description: "Clustering is the task of grouping similar documents together."
   PairClassification:
     icon: "🎭"
-    metric:
+    metric: max_ap
     metric_description: "Average Precision (AP) based on the models similarity metric (usually cosine)"
     task_description: "Pair classification is the task of determining whether two texts are similar."
   Reranking:
@@ -35,14 +35,19 @@ tasks:
     task_description: "Retrieval is the task of finding relevant documents for a query."
   STS:
     icon: "☘️"
-    metric:
+    metric: cosine_spearman
     metric_description: "Spearman correlation based on the model's similarity metric (usually cosine)"
     task_description: "Semantic Textual Similarity is the task of determining how similar two texts are."
   Summarization:
     icon: "📜"
-    metric:
+    metric: cosine_spearman
     metric_description: "Spearman correlation based on the model's similarity metric (usually cosine)"
     task_description: "Summarization is the task of generating a summary of a text."
+  MultilabelClassification:
+    icon: "🏷️"
+    metric: accuracy
+    metric_description: "Accuracy"
+    task_description: "Multilabel classification is the task of assigning multiple labels to a text."
   InstructionRetrieval:
     icon: "🔎📋"
     metric: "p-MRR"
@@ -347,6 +352,8 @@ boards:
         - RuReviewsClassification (rus-Cyrl)
         - RuSciBenchGRNTIClassification (rus-Cyrl)
         - RuSciBenchOECDClassification (rus-Cyrl)
+        - MassiveIntentClassification (rus-Cyrl)
+        - MassiveScenarioClassification (rus-Cyrl)
       Clustering:
         - GeoreviewClusteringP2P (rus-Cyrl)
         - RuSciBenchGRNTIClusteringP2P (rus-Cyrl)
@@ -355,12 +362,18 @@ boards:
        - TERRa (rus-Cyrl)
      Reranking:
        - RuBQReranking (rus-Cyrl)
+       - MIRACLReranking (rus-Cyrl)
      Retrieval:
        - RiaNewsRetrieval (rus-Cyrl)
        - RuBQRetrieval (rus-Cyrl)
+       - MIRACLRetrieval (rus-Cyrl)
      STS:
        - RUParaPhraserSTS (rus-Cyrl)
        - RuSTSBenchmarkSTS (rus-Cyrl)
+       - STS22 (rus-Cyrl)
+     MultilabelClassification:
+       - CEDRClassification (rus-Cyrl)
+       - SensitiveTopicsClassification (rus-Cyrl)
   se:
     title: Swedish
     language_long: Swedish
@@ -517,13 +530,23 @@ boards:
     metric: nDCG@10
     tasks:
       Retrieval:
-        - AppsRetrieval
-        - CodeFeedbackMT
-        - CodeFeedbackST
-        - CodeSearchNetCCRetrieval
-        -
-        -
+        - AppsRetrieval (eng-Latn_python-Code)
+        - CodeFeedbackMT (c-Code_sql-Code_python-Code_shell-Code_swift-Code_eng-Latn)
+        - CodeFeedbackST (python-Code_javascript-Code_go-Code_ruby-Code_java-Code_php-Code_eng-Latn)
+        - CodeSearchNetCCRetrieval (python-Code)
+        - CodeSearchNetCCRetrieval (javascript-Code)
+        - CodeSearchNetCCRetrieval (go-Code)
+        - CodeSearchNetCCRetrieval (ruby-Code)
+        - CodeSearchNetCCRetrieval (java-Code)
+        - CodeSearchNetCCRetrieval (php-Code)
+        - CodeSearchNetRetrieval (python-Code)
+        - CodeSearchNetRetrieval (javascript-Code)
+        - CodeSearchNetRetrieval (go-Code)
+        - CodeSearchNetRetrieval (ruby-Code)
+        - CodeSearchNetRetrieval (java-Code)
+        - CodeSearchNetRetrieval (php-Code)
+        - CodeTransOceanContest (python-Code_c++-Code)
         - CodeTransOceanDL
-        - CosQA
+        - CosQA (eng-Latn_python-Code)
         - StackOverflowQA
-        - SyntheticText2SQL
+        - SyntheticText2SQL (eng-Latn_sql-Code)
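The per-task metric values introduced above (max_ap, cosine_spearman, accuracy) are what refresh.py reads when it builds its task-to-metric mapping. A minimal sketch of that relationship follows; the YAML loading and file path are assumptions for illustration, only the dict comprehension mirrors the one shown at line 30 of the refresh.py diff below.

import yaml

# Assumed location of the edited config file; refresh.py builds TASKS_CONFIG elsewhere.
with open("config.yaml") as f:
    TASKS_CONFIG = yaml.safe_load(f)["tasks"]

# Same shape as refresh.py: every task now has an explicit primary metric.
TASK_TO_METRIC = {k: [v["metric"]] for k, v in TASKS_CONFIG.items()}

print(TASK_TO_METRIC["PairClassification"])       # ['max_ap']
print(TASK_TO_METRIC["STS"])                      # ['cosine_spearman']
print(TASK_TO_METRIC["MultilabelClassification"]) # ['accuracy']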
refresh.py
CHANGED
@@ -30,9 +30,10 @@ PRETTY_NAMES = {
|
|
30 |
TASK_TO_METRIC = {k: [v["metric"]] for k, v in TASKS_CONFIG.items()}
|
31 |
# Add legacy metric names
|
32 |
TASK_TO_METRIC["STS"].append("cos_sim_spearman")
|
33 |
-
TASK_TO_METRIC["STS"].append("
|
34 |
TASK_TO_METRIC["Summarization"].append("cos_sim_spearman")
|
35 |
-
TASK_TO_METRIC["Summarization"].append("
|
|
|
36 |
TASK_TO_METRIC["PairClassification"].append("cos_sim_ap")
|
37 |
TASK_TO_METRIC["PairClassification"].append("cosine_ap")
|
38 |
|
@@ -166,6 +167,8 @@ def filter_metric_external(x, task, metrics) -> bool:
|
|
166 |
return bool(x["mteb_task"] == task and x["metric"] == "ndcg_at_1")
|
167 |
elif (x["mteb_dataset_name"].startswith("BrightRetrieval") and (x["split"] == "long")):
|
168 |
return bool(x["mteb_task"] == task and x["metric"] in ["recall_at_1"])
|
|
|
|
|
169 |
else:
|
170 |
return bool(x["mteb_task"] == task and x["metric"] in metrics)
|
171 |
|
@@ -258,6 +261,10 @@ def get_external_model_results():
|
|
258 |
download_mode="force_redownload",
|
259 |
verification_mode="no_checks",
|
260 |
)
|
|
|
|
|
|
|
|
|
261 |
ds = ds.map(add_lang)
|
262 |
ds = ds.map(add_task)
|
263 |
base_dict = {
|
@@ -273,8 +280,8 @@ def get_external_model_results():
|
|
273 |
ds_sub = ds.filter(lambda x: filter_metric_external(x, task, metrics))[
|
274 |
"test"
|
275 |
]
|
276 |
-
|
277 |
-
for metric in
|
278 |
ds_dict = ds_sub.filter(lambda x: x["metric"] == metric).to_dict()
|
279 |
ds_dict = {
|
280 |
k: round(v, 2)
|
|
|
30 |
TASK_TO_METRIC = {k: [v["metric"]] for k, v in TASKS_CONFIG.items()}
|
31 |
# Add legacy metric names
|
32 |
TASK_TO_METRIC["STS"].append("cos_sim_spearman")
|
33 |
+
TASK_TO_METRIC["STS"].append("spearman")
|
34 |
TASK_TO_METRIC["Summarization"].append("cos_sim_spearman")
|
35 |
+
TASK_TO_METRIC["Summarization"].append("spearman")
|
36 |
+
TASK_TO_METRIC["PairClassification"].append("ap")
|
37 |
TASK_TO_METRIC["PairClassification"].append("cos_sim_ap")
|
38 |
TASK_TO_METRIC["PairClassification"].append("cosine_ap")
|
39 |
|
|
|
167 |
return bool(x["mteb_task"] == task and x["metric"] == "ndcg_at_1")
|
168 |
elif (x["mteb_dataset_name"].startswith("BrightRetrieval") and (x["split"] == "long")):
|
169 |
return bool(x["mteb_task"] == task and x["metric"] in ["recall_at_1"])
|
170 |
+
elif x["mteb_dataset_name"] == "MIRACLReranking":
|
171 |
+
return bool(x["mteb_task"] == task and x["metric"] in ["NDCG@10(MIRACL)"])
|
172 |
else:
|
173 |
return bool(x["mteb_task"] == task and x["metric"] in metrics)
|
174 |
|
|
|
261 |
download_mode="force_redownload",
|
262 |
verification_mode="no_checks",
|
263 |
)
|
264 |
+
except ValueError as e:
|
265 |
+
print(f"Can't fined model {model} in results repository. Exception: {e}")
|
266 |
+
continue
|
267 |
+
|
268 |
ds = ds.map(add_lang)
|
269 |
ds = ds.map(add_task)
|
270 |
base_dict = {
|
|
|
280 |
ds_sub = ds.filter(lambda x: filter_metric_external(x, task, metrics))[
|
281 |
"test"
|
282 |
]
|
283 |
+
curent_task_metrics = ds_sub.unique("metric")
|
284 |
+
for metric in curent_task_metrics:
|
285 |
ds_dict = ds_sub.filter(lambda x: x["metric"] == metric).to_dict()
|
286 |
ds_dict = {
|
287 |
k: round(v, 2)
|
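The last two refresh.py hunks only show the except/continue branch and the new metric loop; the try block they attach to sits above the diff context. A rough sketch of the intended control flow follows, with the results repository id, model ids, and config-name convention assumed for illustration rather than copied from the Space; only the except/continue shape and the unique("metric") loop mirror the committed change.

from datasets import load_dataset

models = ["intfloat/e5-base-v2", "not-a-real-model"]  # hypothetical model ids

for model in models:
    try:
        ds = load_dataset(
            "mteb/results",               # assumed results repository
            model.replace("/", "__"),     # assumed config-name convention
            download_mode="force_redownload",
            verification_mode="no_checks",
        )
    except ValueError as e:
        # A missing config raises ValueError; skip the model instead of
        # aborting the whole refresh run.
        print(f"Can't find model {model} in results repository. Exception: {e}")
        continue

    ds_sub = ds["test"]                     # stand-in for the filtered split
    for metric in ds_sub.unique("metric"):  # iterate only over metrics actually present
        ...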