Spaces:

hallucinations-leaderboard
/

leaderboard

Running on CPU Upgrade

App Files Files Community

pminervini commited on Jan 11

Commit

b06387f

•

1 Parent(s): 6524ea0

update

Browse files

Files changed (1) hide show

cli/analysis-cli.py +136 -0

cli/analysis-cli.py ADDED Viewed

	@@ -0,0 +1,136 @@

+#!/usr/bin/env python3
+import os
+import sys
+import json
+import numpy as np
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+from scipy.cluster.hierarchy import linkage
+from src.backend.envs import Tasks, EVAL_REQUESTS_PATH_BACKEND, EVAL_RESULTS_PATH_BACKEND, DEVICE, LIMIT, Task
+from src.envs import QUEUE_REPO, RESULTS_REPO, API
+from src.utils import my_snapshot_download
+def find_json_files(json_path):
+    res = []
+    for root, dirs, files in os.walk(json_path):
+        for file in files:
+            if file.endswith(".json"):
+                res.append(os.path.join(root, file))
+    return res
+my_snapshot_download(repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
+my_snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
+result_path_lst = find_json_files(EVAL_RESULTS_PATH_BACKEND)
+request_path_lst = find_json_files(EVAL_REQUESTS_PATH_BACKEND)
+model_name_to_model_map = {}
+for path in request_path_lst:
+    with open(path, 'r') as f:
+        data = json.load(f)
+    model_name_to_model_map[data["model"]] = data
+model_dataset_metric_to_result_map = {}
+data_map = {}
+for path in result_path_lst:
+    with open(path, 'r') as f:
+        data = json.load(f)
+    model_name = data["config"]["model_name"]
+    for dataset_name, results_dict in data["results"].items():
+        for metric_name, value in results_dict.items():
+            # print(model_name, dataset_name, metric_name, value)
+            if ',' in metric_name and '_stderr' not in metric_name \
+                    and 'f1' not in metric_name \
+                    and 'selfcheckgpt' not in dataset_name \
+                    and model_name_to_model_map[model_name]["likes"] > 256:
+                to_add = True
+                if 'nq_open' in dataset_name or 'triviaqa' in dataset_name:
+                    to_add = False
+                    # pass
+                # breakpoint()
+                if 'bertscore' in metric_name:
+                    if 'precision' not in metric_name:
+                        to_add = False
+                if 'correctness,' in metric_name or 'em,' in metric_name:
+                    to_add = False
+                if 'rouge' in metric_name:
+                    if 'rougeL' not in metric_name:
+                        to_add = False
+                if 'ifeval' in dataset_name:
+                    if 'prompt_level_strict_acc' not in metric_name:
+                        to_add = False
+                if 'squad' in dataset_name:
+                    to_add = False
+                if 'fever' in dataset_name:
+                    to_add = False
+                if 'rouge' in metric_name:
+                    value /= 100.0
+                if to_add:
+                    sanitised_metric_name = metric_name.split(',')[0]
+                    model_dataset_metric_to_result_map[(model_name, dataset_name, sanitised_metric_name)] = value
+                    # if (model_name, dataset_name) not in data_map:
+                    #     data_map[(model_name, dataset_name)] = {}
+                    # data_map[(model_name, dataset_name)][metric_name] = value
+                    if model_name not in data_map:
+                        data_map[model_name] = {}
+                    data_map[model_name][(dataset_name, sanitised_metric_name)] = value
+                    print('model_name', model_name, 'dataset_name', dataset_name, 'metric_name', metric_name, 'value', value)
+model_name_lst = [m for m in data_map.keys()]
+for m in model_name_lst:
+    if len(data_map[m]) < 8:
+        del data_map[m]
+df = pd.DataFrame.from_dict(data_map, orient='index')
+o_df = df.copy(deep=True)
+print(df)
+# Check for NaN or infinite values and replace them
+df.replace([np.inf, -np.inf], np.nan, inplace=True)  # Replace infinities with NaN
+df.fillna(0, inplace=True)  # Replace NaN with 0 (or use another imputation strategy)
+from sklearn.preprocessing import MinMaxScaler
+# scaler = MinMaxScaler()
+# df = pd.DataFrame(scaler.fit_transform(df), index=df.index, columns=df.columns)
+sns.set_context("notebook", font_scale=1.0)
+# fig = sns.clustermap(df, method='average', metric='cosine', cmap='coolwarm', figsize=(16, 12), annot=True)
+fig = sns.clustermap(df, method='ward', metric='euclidean', cmap='coolwarm', figsize=(16, 12), annot=True, mask=o_df.isnull())
+# Adjust the size of the cells (less wide)
+plt.setp(fig.ax_heatmap.get_yticklabels(), rotation=0)
+plt.setp(fig.ax_heatmap.get_xticklabels(), rotation=90)
+# Save the clustermap to file
+fig.savefig('plots/clustermap.pdf')
+fig.savefig('plots/clustermap.png')