Muennighoff committed on
Commit dcdb471
2 Parent(s): 1fbbed1 6a84030

Merge main

Files changed (2)
  1. config.yaml +34 -11
  2. refresh.py +11 -4
config.yaml CHANGED
@@ -20,7 +20,7 @@ tasks:
     task_description: "Clustering is the task of grouping similar documents together."
   PairClassification:
     icon: "🎭"
-    metric: ap
+    metric: max_ap
     metric_description: "Average Precision (AP) based on the model's similarity metric (usually cosine)"
     task_description: "Pair classification is the task of determining whether two texts are similar."
   Reranking:
@@ -35,14 +35,19 @@ tasks:
     task_description: "Retrieval is the task of finding relevant documents for a query."
   STS:
     icon: "☘️"
-    metric: spearman
+    metric: cosine_spearman
     metric_description: "Spearman correlation based on the model's similarity metric (usually cosine)"
     task_description: "Semantic Textual Similarity is the task of determining how similar two texts are."
   Summarization:
     icon: "📜"
-    metric: spearman
+    metric: cosine_spearman
     metric_description: "Spearman correlation based on the model's similarity metric (usually cosine)"
     task_description: "Summarization is the task of generating a summary of a text."
+  MultilabelClassification:
+    icon: "🏷️"
+    metric: accuracy
+    metric_description: "Accuracy"
+    task_description: "Multilabel classification is the task of assigning multiple labels to a text."
   InstructionRetrieval:
     icon: "🔎📋"
     metric: "p-MRR"
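Note: the renames above switch each task's primary metric to the spelling newer MTEB result files use (max_ap, cosine_spearman); refresh.py below keeps the old spellings (ap, spearman) as legacy fallbacks. A minimal sketch of how the `tasks:` section is consumed, assuming it is loaded with PyYAML (refresh.py only shows TASKS_CONFIG being iterated, not how it is read):

import yaml

# Assumed loading step; only the iteration over TASKS_CONFIG appears in refresh.py.
with open("config.yaml") as f:
    TASKS_CONFIG = yaml.safe_load(f)["tasks"]

# As in refresh.py: seed each task's metric list with its primary metric.
TASK_TO_METRIC = {k: [v["metric"]] for k, v in TASKS_CONFIG.items()}
print(TASK_TO_METRIC["PairClassification"])  # ['max_ap'] after this commit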
@@ -347,6 +352,8 @@ boards:
         - RuReviewsClassification (rus-Cyrl)
         - RuSciBenchGRNTIClassification (rus-Cyrl)
         - RuSciBenchOECDClassification (rus-Cyrl)
+        - MassiveIntentClassification (rus-Cyrl)
+        - MassiveScenarioClassification (rus-Cyrl)
       Clustering:
         - GeoreviewClusteringP2P (rus-Cyrl)
         - RuSciBenchGRNTIClusteringP2P (rus-Cyrl)
@@ -355,12 +362,18 @@ boards:
         - TERRa (rus-Cyrl)
       Reranking:
         - RuBQReranking (rus-Cyrl)
+        - MIRACLReranking (rus-Cyrl)
       Retrieval:
         - RiaNewsRetrieval (rus-Cyrl)
         - RuBQRetrieval (rus-Cyrl)
+        - MIRACLRetrieval (rus-Cyrl)
       STS:
         - RUParaPhraserSTS (rus-Cyrl)
         - RuSTSBenchmarkSTS (rus-Cyrl)
+        - STS22 (rus-Cyrl)
+      MultilabelClassification:
+        - CEDRClassification (rus-Cyrl)
+        - SensitiveTopicsClassification (rus-Cyrl)
   se:
     title: Swedish
     language_long: Swedish
@@ -517,13 +530,23 @@ boards:
     metric: nDCG@10
     tasks:
       Retrieval:
-        - AppsRetrieval
-        - CodeFeedbackMT
-        - CodeFeedbackST
-        - CodeSearchNetCCRetrieval
-        - CodeSearchNetRetrieval
-        - CodeTransOceanContest
+        - AppsRetrieval (eng-Latn_python-Code)
+        - CodeFeedbackMT (c-Code_sql-Code_python-Code_shell-Code_swift-Code_eng-Latn)
+        - CodeFeedbackST (python-Code_javascript-Code_go-Code_ruby-Code_java-Code_php-Code_eng-Latn)
+        - CodeSearchNetCCRetrieval (python-Code)
+        - CodeSearchNetCCRetrieval (javascript-Code)
+        - CodeSearchNetCCRetrieval (go-Code)
+        - CodeSearchNetCCRetrieval (ruby-Code)
+        - CodeSearchNetCCRetrieval (java-Code)
+        - CodeSearchNetCCRetrieval (php-Code)
+        - CodeSearchNetRetrieval (python-Code)
+        - CodeSearchNetRetrieval (javascript-Code)
+        - CodeSearchNetRetrieval (go-Code)
+        - CodeSearchNetRetrieval (ruby-Code)
+        - CodeSearchNetRetrieval (java-Code)
+        - CodeSearchNetRetrieval (php-Code)
+        - CodeTransOceanContest (python-Code_c++-Code)
         - CodeTransOceanDL
-        - CosQA
+        - CosQA (eng-Latn_python-Code)
        - StackOverflowQA
-        - SyntheticText2SQL
+        - SyntheticText2SQL (eng-Latn_sql-Code)
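Note: the parenthesized suffixes added above pin each board entry to a specific dataset subset, using the same lang-Script / lang-Code convention as the Russian entries (e.g. rus-Cyrl). A hypothetical helper, not part of refresh.py, illustrating how such an entry splits apart:

# Hypothetical, for illustration only: split an entry such as
# "CodeSearchNetRetrieval (python-Code)" into dataset name and subset tag.
def split_board_entry(entry: str) -> tuple[str, str | None]:
    name, sep, rest = entry.partition(" (")
    return (name, rest.rstrip(")")) if sep else (name, None)

print(split_board_entry("CodeSearchNetRetrieval (python-Code)"))  # ('CodeSearchNetRetrieval', 'python-Code')
print(split_board_entry("CodeTransOceanDL"))                      # ('CodeTransOceanDL', None)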
 
refresh.py CHANGED
@@ -30,9 +30,10 @@ PRETTY_NAMES = {
 TASK_TO_METRIC = {k: [v["metric"]] for k, v in TASKS_CONFIG.items()}
 # Add legacy metric names
 TASK_TO_METRIC["STS"].append("cos_sim_spearman")
-TASK_TO_METRIC["STS"].append("cosine_spearman")
+TASK_TO_METRIC["STS"].append("spearman")
 TASK_TO_METRIC["Summarization"].append("cos_sim_spearman")
-TASK_TO_METRIC["Summarization"].append("cosine_spearman")
+TASK_TO_METRIC["Summarization"].append("spearman")
+TASK_TO_METRIC["PairClassification"].append("ap")
 TASK_TO_METRIC["PairClassification"].append("cos_sim_ap")
 TASK_TO_METRIC["PairClassification"].append("cosine_ap")
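Note: combined with the config.yaml changes, the new primary names now come first and every older spelling stays as a fallback, so result files uploaded before the rename still resolve. The lists this hunk produces:

# Metric lists after this hunk (primary name from config.yaml, then legacy names):
TASK_TO_METRIC["STS"]                 # ['cosine_spearman', 'cos_sim_spearman', 'spearman']
TASK_TO_METRIC["Summarization"]       # ['cosine_spearman', 'cos_sim_spearman', 'spearman']
TASK_TO_METRIC["PairClassification"]  # ['max_ap', 'ap', 'cos_sim_ap', 'cosine_ap']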
 
@@ -166,6 +167,8 @@ def filter_metric_external(x, task, metrics) -> bool:
         return bool(x["mteb_task"] == task and x["metric"] == "ndcg_at_1")
     elif (x["mteb_dataset_name"].startswith("BrightRetrieval") and (x["split"] == "long")):
         return bool(x["mteb_task"] == task and x["metric"] in ["recall_at_1"])
+    elif x["mteb_dataset_name"] == "MIRACLReranking":
+        return bool(x["mteb_task"] == task and x["metric"] in ["NDCG@10(MIRACL)"])
     else:
         return bool(x["mteb_task"] == task and x["metric"] in metrics)
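Note: the new branch pins MIRACLReranking to its dataset-specific score key, presumably because its external result files report the main score as "NDCG@10(MIRACL)" rather than under a generic metric name. A quick illustration, assuming no earlier branch of the function matches the row first:

row = {
    "mteb_dataset_name": "MIRACLReranking",
    "mteb_task": "Reranking",
    "metric": "NDCG@10(MIRACL)",
}
filter_metric_external(row, "Reranking", ["map"])  # True: the override ignores the allow-list
# The same row with metric "ndcg_at_10" would now return False.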
 
@@ -258,6 +261,10 @@ def get_external_model_results():
             download_mode="force_redownload",
             verification_mode="no_checks",
         )
+    except ValueError as e:
+        print(f"Can't find model {model} in results repository. Exception: {e}")
+        continue
+
     ds = ds.map(add_lang)
     ds = ds.map(add_task)
     base_dict = {
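Note: the added except/continue implies the load_dataset call sits in a try: inside a loop over external models, so a model whose results are missing is skipped instead of aborting the whole refresh. A sketch of that surrounding structure; the loop variable and repository id are assumptions, since the diff does not show them:

from datasets import load_dataset

for model in external_models:  # loop and collection name assumed
    try:
        ds = load_dataset(
            results_repo,  # results repository id; not visible in this hunk
            model,
            download_mode="force_redownload",
            verification_mode="no_checks",
        )
    except ValueError as e:
        # datasets raises ValueError when the requested config is not found.
        print(f"Can't find model {model} in results repository. Exception: {e}")
        continue
    ds = ds.map(add_lang)
    ds = ds.map(add_task)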
@@ -273,8 +280,8 @@ def get_external_model_results():
         ds_sub = ds.filter(lambda x: filter_metric_external(x, task, metrics))[
             "test"
         ]
-        metrics = ds_sub.unique("metric")
-        for metric in metrics:
+        current_task_metrics = ds_sub.unique("metric")
+        for metric in current_task_metrics:
             ds_dict = ds_sub.filter(lambda x: x["metric"] == metric).to_dict()
             ds_dict = {
                 k: round(v, 2)
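Note: the rename removes a shadowing trap: metrics is also the name of the per-task allow-list passed to filter_metric_external just above, so rebinding it to the observed metric set made the loop easy to misread. Compressed view, with the enclosing loop assumed:

for task, metrics in TASK_TO_METRIC.items():  # enclosing loop assumed
    ds_sub = ds.filter(lambda x: filter_metric_external(x, task, metrics))["test"]
    current_task_metrics = ds_sub.unique("metric")  # no longer clobbers `metrics`
    for metric in current_task_metrics:
        ...  # per-metric aggregation as above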
 