Roman Solomatin committed
Commit f2e732e
Parent: df36f2a

align results with models card

Files changed (3):
1. EXTERNAL_MODEL_RESULTS.json +0 -0
2. config.yaml +42 -42
3. refresh.py +9 -5
EXTERNAL_MODEL_RESULTS.json CHANGED
The diff for this file is too large to render.
 
config.yaml CHANGED
@@ -23,7 +23,7 @@ tasks:
     metric: max_ap
     metric_description: "Average Precision (AP) based on the models similarity metric (usually cosine)"
     task_description: "Pair classification is the task of determining whether two texts are similar."
-  Reranking:
+  Reranking:
     icon: "🥈"
     metric: map
     metric_description: "Mean Average Precision (MAP)"
@@ -345,35 +345,35 @@ boards:
     credits: "[Roman Solomatin](https://github.com/Samoed) and SaluteDevices: [Alena Fenogenova](https://github.com/Alenush), [Aleksandr Abramov](https://github.com/Ab1992ao), [Artem Snegirev](https://github.com/artemsnegirev), [Anna Maksimova](https://github.com/anpalmak2003), [Maria Tikhonova](https://github.com/MariyaTikhonova)"
     tasks:
       Classification:
-        - GeoreviewClassification (rus-Cyrl)
-        - HeadlineClassification (rus-Cyrl)
-        - InappropriatenessClassification (rus-Cyrl)
-        - KinopoiskClassification (rus-Cyrl)
-        - RuReviewsClassification (rus-Cyrl)
-        - RuSciBenchGRNTIClassification (rus-Cyrl)
-        - RuSciBenchOECDClassification (rus-Cyrl)
-        - MassiveIntentClassification (rus-Cyrl)
-        - MassiveScenarioClassification (rus-Cyrl)
+        - GeoreviewClassification
+        - HeadlineClassification
+        - InappropriatenessClassification
+        - KinopoiskClassification
+        - RuReviewsClassification
+        - RuSciBenchGRNTIClassification
+        - RuSciBenchOECDClassification
+        - MassiveIntentClassification (ru)
+        - MassiveScenarioClassification (ru)
       Clustering:
-        - GeoreviewClusteringP2P (rus-Cyrl)
-        - RuSciBenchGRNTIClusteringP2P (rus-Cyrl)
-        - RuSciBenchOECDClusteringP2P (rus-Cyrl)
+        - GeoreviewClusteringP2P
+        - RuSciBenchGRNTIClusteringP2P
+        - RuSciBenchOECDClusteringP2P
       PairClassification:
-        - TERRa (rus-Cyrl)
+        - TERRa
       Reranking:
-        - RuBQReranking (rus-Cyrl)
-        - MIRACLReranking (rus-Cyrl)
+        - RuBQReranking
+        - MIRACLReranking (ru)
       Retrieval:
-        - RiaNewsRetrieval (rus-Cyrl)
-        - RuBQRetrieval (rus-Cyrl)
-        - MIRACLRetrieval (rus-Cyrl)
+        - RiaNewsRetrieval
+        - RuBQRetrieval
+        - MIRACLRetrieval (ru)
       STS:
-        - RUParaPhraserSTS (rus-Cyrl)
-        - RuSTSBenchmarkSTS (rus-Cyrl)
-        - STS22 (rus-Cyrl)
+        - RUParaPhraserSTS
+        - RuSTSBenchmarkSTS
+        - STS22 (ru)
       MultilabelClassification:
-        - CEDRClassification (rus-Cyrl)
-        - SensitiveTopicsClassification (rus-Cyrl)
+        - CEDRClassification
+        - SensitiveTopicsClassification
   se:
     title: Swedish
     language_long: Swedish
@@ -530,23 +530,23 @@ boards:
     metric: nDCG@10
     tasks:
       Retrieval:
-        - AppsRetrieval (eng-Latn_python-Code)
-        - CodeFeedbackMT (c-Code_sql-Code_python-Code_shell-Code_swift-Code_eng-Latn)
-        - CodeFeedbackST (python-Code_javascript-Code_go-Code_ruby-Code_java-Code_php-Code_eng-Latn)
-        - CodeSearchNetCCRetrieval (python-Code)
-        - CodeSearchNetCCRetrieval (javascript-Code)
-        - CodeSearchNetCCRetrieval (go-Code)
-        - CodeSearchNetCCRetrieval (ruby-Code)
-        - CodeSearchNetCCRetrieval (java-Code)
-        - CodeSearchNetCCRetrieval (php-Code)
-        - CodeSearchNetRetrieval (python-Code)
-        - CodeSearchNetRetrieval (javascript-Code)
-        - CodeSearchNetRetrieval (go-Code)
-        - CodeSearchNetRetrieval (ruby-Code)
-        - CodeSearchNetRetrieval (java-Code)
-        - CodeSearchNetRetrieval (php-Code)
-        - CodeTransOceanContest (python-Code_c++-Code)
+        - AppsRetrieval
+        - CodeFeedbackMT
+        - CodeFeedbackST
+        - CodeSearchNetCCRetrieval (python)
+        - CodeSearchNetCCRetrieval (javascript)
+        - CodeSearchNetCCRetrieval (go)
+        - CodeSearchNetCCRetrieval (ruby)
+        - CodeSearchNetCCRetrieval (java)
+        - CodeSearchNetCCRetrieval (php)
+        - CodeSearchNetRetrieval (python)
+        - CodeSearchNetRetrieval (javascript)
+        - CodeSearchNetRetrieval (go)
+        - CodeSearchNetRetrieval (ruby)
+        - CodeSearchNetRetrieval (java)
+        - CodeSearchNetRetrieval (php)
+        - CodeTransOceanContest
         - CodeTransOceanDL
-        - CosQA (eng-Latn_python-Code)
+        - CosQA
         - StackOverflowQA
-        - SyntheticText2SQL (eng-Latn_sql-Code)
+        - SyntheticText2SQL
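Note: the net effect on config.yaml is that task entries now carry the short hf_subset codes that model cards report (e.g. "ru", "python") instead of the full eval_language tags ("rus-Cyrl", "python-Code"), and tasks with only a default subset drop the suffix entirely. A minimal sketch of the new labeling convention, assuming plain string labels (the helper name here is hypothetical; the real logic is add_lang in refresh.py below):

    def task_label(name: str, hf_subset: str | None = None) -> str:
        # Default/monolingual subset: bare task name, e.g. "TERRa".
        if not hf_subset or hf_subset == "default":
            return name
        # Otherwise append the short subset code, e.g. "STS22 (ru)".
        return f"{name} ({hf_subset})"

    assert task_label("TERRa") == "TERRa"
    assert task_label("STS22", "ru") == "STS22 (ru)"
    assert task_label("CodeSearchNetRetrieval", "python") == "CodeSearchNetRetrieval (python)"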
 
refresh.py CHANGED
@@ -132,11 +132,11 @@ def make_clickable_model(model_name: str, link: None | str = None) -> str:
 
 
 def add_lang(examples):
-    if not (examples["eval_language"]) or (examples["eval_language"] == "default"):
+    if not (examples["hf_subset"]) or (examples["hf_subset"] == "default"):
         examples["mteb_dataset_name_with_lang"] = examples["mteb_dataset_name"]
     else:
         examples["mteb_dataset_name_with_lang"] = (
-            examples["mteb_dataset_name"] + f' ({examples["eval_language"]})'
+            examples["mteb_dataset_name"] + f' ({examples["hf_subset"]})'
         )
     return examples
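The substantive change is the rename eval_language → hf_subset: display names are now built from the dataset subset rather than the language tag, which is exactly what turns "MIRACLReranking (rus-Cyrl)" into "MIRACLReranking (ru)" in config.yaml above. With the updated function:

    add_lang({"mteb_dataset_name": "MIRACLReranking", "hf_subset": "ru"})
    # -> {..., "mteb_dataset_name_with_lang": "MIRACLReranking (ru)"}
    add_lang({"mteb_dataset_name": "TERRa", "hf_subset": "default"})
    # -> {..., "mteb_dataset_name_with_lang": "TERRa"}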
 
@@ -313,7 +313,7 @@ def get_external_model_results():
 
     # Save & cache EXTERNAL_MODEL_RESULTS
     with open("EXTERNAL_MODEL_RESULTS.json", "w") as f:
-        json.dump(EXTERNAL_MODEL_RESULTS, f, indent=4)
+        json.dump(dict(sorted(EXTERNAL_MODEL_RESULTS.items())), f, indent=4)
 
     return EXTERNAL_MODEL_RESULTS
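Sorting the top-level keys (model names) before dumping makes the cached file deterministic, so subsequent diffs show only real result changes; that wholesale re-ordering is presumably why the EXTERNAL_MODEL_RESULTS.json diff above is too large to render. dict(sorted(...)) only orders the top level; if nested keys should be stable as well, the stdlib flag would do it:

    json.dump(EXTERNAL_MODEL_RESULTS, f, indent=4, sort_keys=True)  # sorts dicts at every nesting level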
 
@@ -332,6 +332,10 @@ def download_or_use_cache(modelId: str):
     return meta
 
 
+def simplify_dataset_name(name):
+    return name.replace("MTEB ", "").replace(" (default)", "")
+
+
 def get_mteb_data(
     tasks: list = ["Clustering"],
     langs: list = [],
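simplify_dataset_name generalizes the old inline .replace("MTEB ", "") by also dropping the " (default)" suffix that model cards attach to single-subset results, so fetched names match the bare entries now listed in config.yaml:

    simplify_dataset_name("MTEB RuBQRetrieval (default)")  # -> "RuBQRetrieval"
    simplify_dataset_name("MTEB STS22 (ru)")               # -> "STS22 (ru)"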
@@ -450,11 +454,11 @@ def get_mteb_data(
             try:
                 out = [
                     {
-                        res["dataset"]["name"].replace("MTEB ", ""): [
+                        simplify_dataset_name(res["dataset"]["name"]): [
                             round(score["value"], 2)
                             for score in res["metrics"]
                             if filter_metric_fetched(
-                                res["dataset"]["name"].replace("MTEB ", ""),
+                                simplify_dataset_name(res["dataset"]["name"]),
                                 score["type"],
                                 task_to_metric.get(res["task"]["type"]),
                                 res["dataset"]["split"],
 