import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.options.display.float_format = '{:.3f}'.format
# Map each benchmark dataset to a category: natural, specialized, structured, or retrieval
dataset_type = {
    "imagenet1k": "natural",
    "imagenetv2": "natural",
    "imagenet-r": "natural",
    "imagenet_sketch": "specialized",
    "objectnet": "natural",
    "imagenet-a": "natural",
    "imagenet-o": "natural",
    "vtab/cifar10": "natural",
    "vtab/cifar100": "natural",
    "mnist": "specialized",
    "vtab/flowers": "natural",
    "cars": "natural",
    "vtab/svhn": "natural",
    "fer2013": "natural",
    "renderedsst2": "specialized",
    "vtab/pets": "natural",
    "vtab/caltech101": "natural",
    "voc2007_multilabel": "natural",
    "voc2007": "natural",
    "sun397": "natural",
    "fgvc_aircraft": "natural",
    "country211": "natural",
    "vtab/dtd": "natural",
    "gtsrb": "natural",
    "stl10": "natural",
    "vtab/diabetic_retinopathy": "specialized",
    "vtab/eurosat": "specialized",
    "vtab/resisc45": "specialized",
    "vtab/pcam": "specialized",
    "vtab/clevr_count_all": "structured",
    "vtab/clevr_closest_object_distance": "structured",
    "vtab/dsprites_label_orientation": "structured",
    "vtab/dsprites_label_x_position": "structured",
    "vtab/smallnorb_label_elevation": "structured",
    "vtab/smallnorb_label_azimuth": "structured",
    "vtab/dmlab": "structured",
    "vtab/kitti_closest_vehicle_distance": "structured",
    "mscoco_captions": "retrieval",
    "flickr8k": "retrieval",
    "flickr30k": "retrieval",
}
def extract_arch(model):
    # Keep the first three dash-separated components, e.g. "ViT-B-32-quickgelu" -> "ViT-B-32"
    vit, size, patch_size, *rest = model.split("-")
    return vit + "-" + size + "-" + patch_size
df = pd.read_csv("benchmark.csv")
vtab_plus = list(map(lambda s:s.strip(), open("datasets.txt").readlines()))
df = df[df.dataset.isin(vtab_plus)]
df.loc[:, "dataset_type"] = df.dataset.apply(lambda d:dataset_type[d])
df.loc[:, "model_arch"] = df.model.apply(extract_arch)
df["model_fullname"]=df["model_fullname"].str.replace("/fsx/rom1504/open_clip/good_models/", "openclip ")
df["model_fullname"]=df["model_fullname"].str.replace("/fsx/rwightman/", "openclip ")
df["pretrained"]=df["pretrained"].str.replace("/fsx/rom1504/open_clip/good_models/", "openclip ")
df["pretrained"]=df["pretrained"].str.replace("/fsx/rwightman/", "openclip ")
df_retrieval = df[df["dataset_type"] == "retrieval"]
df = df[df["dataset_type"] != "retrieval"]
df = df.drop(["image_retrieval_recall@5", "text_retrieval_recall@5"], axis=1)
dataset_type = {k:v for k,v in dataset_type.items() if v != "retrieval"}
fig = plt.figure(figsize=(12,8))
#order = df.sort_values(by="dataset_type").dataset.unique()
order = list(dataset_type.keys())
ax = sns.barplot(
    x="dataset", y="acc1",
    data=df,
    order=order,
    hue="model_fullname"
)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax
[Figure: bar plot, x = dataset, y = acc1, hue = model_fullname]
fig = plt.figure(figsize=(12,8))
order = list(dataset_type.keys())
d = df[df.model_arch=="ViT-B-32"]
ax = sns.barplot(
    x="dataset", y="acc1",
    data=d,
    order=order,
    hue="model_fullname"
)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax
[Figure: bar plot, x = dataset, y = acc1, hue = model_fullname, ViT-B-32 models only]
fig = plt.figure(figsize=(12,8))
order = list(dataset_type.keys())
ax = sns.barplot(
    x="dataset", y="acc1", data=df,
    order=order
)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax
[Figure: bar plot, x = dataset, y = acc1, averaged over all models]
fig = plt.figure(figsize=(12,8))
order = list(dataset_type.keys())
ax = sns.barplot(
    x="dataset", y="acc1",
    data=df,
    order=order,
    hue="model_arch"
)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax
[Figure: bar plot, x = dataset, y = acc1, hue = model_arch]
fig = plt.figure(figsize=(12,8))
order = list(dataset_type.keys())
d = df.copy()
ax = sns.barplot(
    x="dataset", y="acc1",
    data=d,
    order=order,
    hue="pretrained"
)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax
[Figure: bar plot, x = dataset, y = acc1, hue = pretrained]
fig = plt.figure(figsize=(12,8))
order = list(dataset_type.keys())
d = df.copy()
ax = sns.barplot(
    x="dataset", y="acc1",
    data=d,
    order=order,
    hue="pretrained",
    estimator=np.max,
    errorbar=None  # seaborn >= 0.12 name for the deprecated ci=None
)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax
[Figure: bar plot, x = dataset, y = acc1, hue = pretrained, best (max) accuracy per group]
metric = "acc1"
df_metric = pd.pivot(df, index="model_fullname", columns="dataset", values=metric).T.dropna()
df_metric
dataset | ViT-B-32-quickgelu laion400m_e32 | roberta-ViT-B-32 /fsx/rom1504/open_clip/roberta_B_32/checkpoints/epoch_90.pt |
---|---|---|
cars | 0.792 | 0.832 |
country211 | 0.147 | 0.147 |
fer2013 | 0.427 | 0.421 |
fgvc_aircraft | 0.168 | 0.174 |
gtsrb | 0.420 | 0.409 |
imagenet-a | 0.217 | 0.212 |
imagenet-r | 0.734 | 0.722 |
imagenet1k | 0.629 | 0.617 |
imagenet_sketch | 0.493 | 0.491 |
imagenetv2 | 0.551 | 0.533 |
mnist | 0.374 | 0.663 |
objectnet | 0.439 | 0.451 |
renderedsst2 | 0.526 | 0.544 |
stl10 | 0.955 | 0.956 |
sun397 | 0.670 | 0.663 |
voc2007 | 0.757 | 0.780 |
vtab/caltech101 | 0.833 | 0.826 |
vtab/cifar10 | 0.908 | 0.932 |
vtab/cifar100 | 0.702 | 0.750 |
vtab/clevr_closest_object_distance | 0.159 | 0.201 |
vtab/clevr_count_all | 0.163 | 0.147 |
vtab/diabetic_retinopathy | 0.338 | 0.502 |
vtab/dmlab | 0.172 | 0.129 |
vtab/dsprites_label_orientation | 0.019 | 0.025 |
vtab/dsprites_label_x_position | 0.029 | 0.028 |
vtab/dtd | 0.543 | 0.591 |
vtab/eurosat | 0.516 | 0.521 |
vtab/flowers | 0.683 | 0.621 |
vtab/kitti_closest_vehicle_distance | 0.288 | 0.387 |
vtab/pcam | 0.546 | 0.498 |
vtab/pets | 0.868 | 0.868 |
vtab/resisc45 | 0.546 | 0.612 |
vtab/smallnorb_label_azimuth | 0.045 | 0.060 |
vtab/smallnorb_label_elevation | 0.097 | 0.102 |
vtab/svhn | 0.279 | 0.442 |
metric = "mean_per_class_recall"
df_metric = pd.pivot(df, index="model_fullname", columns="dataset", values=metric).T.dropna()
df_metric
dataset | ViT-B-32-quickgelu laion400m_e32 | roberta-ViT-B-32 /fsx/rom1504/open_clip/roberta_B_32/checkpoints/epoch_90.pt |
---|---|---|
cars | 0.793 | 0.830 |
country211 | 0.147 | 0.147 |
fer2013 | 0.399 | 0.401 |
fgvc_aircraft | 0.166 | 0.174 |
gtsrb | 0.393 | 0.383 |
imagenet-a | 0.235 | 0.242 |
imagenet-r | 0.721 | 0.708 |
imagenet1k | 0.629 | 0.617 |
imagenet_sketch | 0.494 | 0.491 |
imagenetv2 | 0.551 | 0.533 |
mnist | 0.371 | 0.659 |
objectnet | 0.427 | 0.440 |
renderedsst2 | 0.526 | 0.545 |
stl10 | 0.955 | 0.957 |
sun397 | 0.661 | 0.664 |
voc2007 | 0.791 | 0.809 |
vtab/caltech101 | 0.909 | 0.905 |
vtab/cifar10 | 0.908 | 0.933 |
vtab/cifar100 | 0.703 | 0.750 |
vtab/clevr_closest_object_distance | 0.167 | 0.167 |
vtab/clevr_count_all | 0.158 | 0.144 |
vtab/diabetic_retinopathy | 0.259 | 0.202 |
vtab/dmlab | 0.158 | 0.160 |
vtab/dsprites_label_orientation | 0.020 | 0.026 |
vtab/dsprites_label_x_position | 0.031 | 0.028 |
vtab/dtd | 0.547 | 0.593 |
vtab/eurosat | 0.526 | 0.534 |
vtab/flowers | 0.663 | 0.590 |
vtab/kitti_closest_vehicle_distance | 0.365 | 0.404 |
vtab/pcam | 0.546 | 0.498 |
vtab/pets | 0.866 | 0.867 |
vtab/resisc45 | 0.554 | 0.616 |
vtab/smallnorb_label_azimuth | 0.045 | 0.060 |
vtab/smallnorb_label_elevation | 0.097 | 0.102 |
vtab/svhn | 0.280 | 0.393 |
# Imagenet robustness results
metric = "acc1"
df_metric = pd.pivot(df, index="model_fullname", columns="dataset", values=metric).T.dropna()
df_metric[(df_metric.index.str.startswith("imagenet")) | (df_metric.index=="objectnet")]
dataset | ViT-B-32-quickgelu laion400m_e32 | roberta-ViT-B-32 /fsx/rom1504/open_clip/roberta_B_32/checkpoints/epoch_90.pt |
---|---|---|
imagenet-a | 0.217 | 0.212 |
imagenet-r | 0.734 | 0.722 |
imagenet1k | 0.629 | 0.617 |
imagenet_sketch | 0.493 | 0.491 |
imagenetv2 | 0.551 | 0.533 |
objectnet | 0.439 | 0.451 |
Here, following "Measuring Robustness to Natural Distribution Shifts in Image Classification" (https://arxiv.org/pdf/2007.00644.pdf, https://share.streamlit.io/modestyachts/imagenet-testbed-website/main/website.py), we show the deviation from the line fit of (x=imagenet1k accuracy, y=imagenetv2/imagenet-1/imagenet_sketch) which was used to measure robustnest improvements separately from accuracy improvements in imagenet1k, as the two are correlated.
In the plot below, points above the fitted line indicate better robustness than the trend predicts.
df_metric = pd.pivot(df, index="model_fullname", columns="dataset", values="acc1").T.dropna()
dataset = "imagenetv2"
line_fits_data = {
    # slopes and intercepts from https://share.streamlit.io/modestyachts/imagenet-testbed-website/main/website.py
    "imagenetv2": (1.112, -20.433),
    "imagenet-r": (1.549, -104.556),
    "imagenet_sketch": (0.931, -45.373)
}
x = np.linspace(0, 100, 100)
slope, intercept = line_fits_data[dataset]
y = x * slope + intercept
plt.xlim(55, 90)
plt.ylim(40, 90)
d = df_metric.T[["imagenet1k", dataset]] * 100
plt.scatter(d["imagenet1k"], d[dataset], color="green", label="evaluated models")
plt.plot(x, y, color="red", label="line fit")
plt.xlabel("imagenet1k top-1 accuracy (%)")
plt.ylabel(f"{dataset} top-1 accuracy (%)")
plt.legend()
[Figure: scatter of imagenetv2 vs imagenet1k top-1 accuracy (%), with the reference line fit]
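The plot above shows the deviation visually for one dataset; the minimal sketch below (reusing df_metric and line_fits_data defined above) computes it numerically for every shifted dataset, as the difference in accuracy points between each model's measured accuracy and the value the line fit predicts from its imagenet1k accuracy.
# Effective robustness sketch: deviation of each model's shifted-dataset accuracy
# from the value predicted by the line fit on imagenet1k accuracy.
# Positive values = more robust than the fit predicts.
acc = df_metric.T * 100  # rows: model_fullname, columns: datasets, in %
for shifted, (slope, intercept) in line_fits_data.items():
    predicted = acc["imagenet1k"] * slope + intercept
    deviation = (acc[shifted] - predicted).round(2)
    print(f"Deviation from the {shifted} line fit (accuracy points):")
    print(deviation)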
metric = "mean_per_class_recall"
pd.pivot(df, index="model_fullname", columns="dataset", values=metric).T.dropna()
model_fullname | ViT-B-32-quickgelu laion400m_e32 | roberta-ViT-B-32 /fsx/rom1504/open_clip/roberta_B_32/checkpoints/epoch_90.pt |
---|---|---|
dataset | ||
cars | 0.793 | 0.830 |
country211 | 0.147 | 0.147 |
fer2013 | 0.399 | 0.401 |
fgvc_aircraft | 0.166 | 0.174 |
gtsrb | 0.393 | 0.383 |
imagenet-a | 0.235 | 0.242 |
imagenet-r | 0.721 | 0.708 |
imagenet1k | 0.629 | 0.617 |
imagenet_sketch | 0.494 | 0.491 |
imagenetv2 | 0.551 | 0.533 |
mnist | 0.371 | 0.659 |
objectnet | 0.427 | 0.440 |
renderedsst2 | 0.526 | 0.545 |
stl10 | 0.955 | 0.957 |
sun397 | 0.661 | 0.664 |
voc2007 | 0.791 | 0.809 |
vtab/caltech101 | 0.909 | 0.905 |
vtab/cifar10 | 0.908 | 0.933 |
vtab/cifar100 | 0.703 | 0.750 |
vtab/clevr_closest_object_distance | 0.167 | 0.167 |
vtab/clevr_count_all | 0.158 | 0.144 |
vtab/diabetic_retinopathy | 0.259 | 0.202 |
vtab/dmlab | 0.158 | 0.160 |
vtab/dsprites_label_orientation | 0.020 | 0.026 |
vtab/dsprites_label_x_position | 0.031 | 0.028 |
vtab/dtd | 0.547 | 0.593 |
vtab/eurosat | 0.526 | 0.534 |
vtab/flowers | 0.663 | 0.590 |
vtab/kitti_closest_vehicle_distance | 0.365 | 0.404 |
vtab/pcam | 0.546 | 0.498 |
vtab/pets | 0.866 | 0.867 |
vtab/resisc45 | 0.554 | 0.616 |
vtab/smallnorb_label_azimuth | 0.045 | 0.060 |
vtab/smallnorb_label_elevation | 0.097 | 0.102 |
vtab/svhn | 0.280 | 0.393 |
# For multi-label classification tasks
metric = "mean_average_precision"
pd.pivot(df, index="model_fullname", columns="dataset", values=metric).T.dropna()
dataset | ViT-B-32-quickgelu laion400m_e32 | roberta-ViT-B-32 /fsx/rom1504/open_clip/roberta_B_32/checkpoints/epoch_90.pt |
---|---|---|
voc2007_multilabel | 0.762 | 0.766 |
metric = "image_retrieval_recall@5"
pd.pivot(df_retrieval, index="model_fullname", columns="dataset", values=metric).T.dropna()
dataset | ViT-B-32-quickgelu laion400m_e32 | roberta-ViT-B-32 /fsx/rom1504/open_clip/roberta_B_32/checkpoints/epoch_90.pt |
---|---|---|
flickr30k | 0.855 | 0.868 |
flickr8k | 0.579 | 0.595 |
mscoco_captions | 0.608 | 0.631 |
metric = "text_retrieval_recall@5"
pd.pivot(df_retrieval, index="model_fullname", columns="dataset", values=metric).T.dropna()
dataset | ViT-B-32-quickgelu laion400m_e32 | roberta-ViT-B-32 /fsx/rom1504/open_clip/roberta_B_32/checkpoints/epoch_90.pt |
---|---|---|
flickr30k | 0.941 | 0.948 |
flickr8k | 0.739 | 0.751 |
mscoco_captions | 0.768 | 0.778 |
See VTAB (https://arxiv.org/pdf/1910.04867.pdf, Section E) for a discussion of different aggregation strategies and how strongly they correlate. They find that all aggregation strategies have a high Kendall correlation with the simple mean of top-1 accuracy over datasets (a small check along these lines is sketched at the end of this section).
df.groupby("model_fullname").agg(['mean', 'std', 'median']).sort_values(by=("acc1", "mean"), ascending=False)
model_fullname | acc1 mean | acc1 std | acc1 median | acc5 mean | acc5 std | acc5 median | mean_per_class_recall mean | mean_per_class_recall std | mean_per_class_recall median | mean_average_precision mean | mean_average_precision std | mean_average_precision median |
---|---|---|---|---|---|---|---|---|---|---|---|---|
roberta-ViT-B-32 /fsx/rom1504/open_clip/roberta_B_32/checkpoints/epoch_90.pt | 0.482 | 0.273 | 0.502 | 0.768 | 0.255 | 0.880 | 0.474 | 0.280 | 0.498 | 0.766 | NaN | 0.766 |
ViT-B-32-quickgelu laion400m_e32 | 0.458 | 0.272 | 0.493 | 0.757 | 0.254 | 0.858 | 0.459 | 0.276 | 0.494 | 0.762 | NaN | 0.762 |
metric = "acc1"
df_metric = pd.pivot(df, index="model_fullname", columns="dataset", values=metric).T.dropna()
# Rank models within each dataset (1 = best), then average the ranks across datasets
df_metric.rank(axis=1, ascending=False).agg(["mean", "std"]).T.sort_values(by="mean", ascending=True)
model_fullname | mean rank | std |
---|---|---|
roberta-ViT-B-32 /fsx/rom1504/open_clip/roberta_B_32/checkpoints/epoch_90.pt | 1.429 | 0.502 |
ViT-B-32-quickgelu laion400m_e32 | 1.571 | 0.502 |
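As a quick check on the VTAB point above, the sketch below compares the model ordering given by mean top-1 accuracy with the ordering given by mean rank, using Kendall's tau (this assumes scipy is available; df_metric is reused from the previous cell).
# Agreement between aggregation strategies: Kendall's tau between the ranking of models
# by mean top-1 accuracy and the ranking by mean rank across datasets.
from scipy.stats import kendalltau
mean_acc_rank = df_metric.mean(axis=0).rank(ascending=False)      # rank by mean accuracy (1 = best)
mean_rank = df_metric.rank(axis=1, ascending=False).mean(axis=0)  # mean per-dataset rank (lower = better)
tau, p_value = kendalltau(mean_acc_rank, mean_rank)
print(f"Kendall tau between the two orderings: {tau:.3f} (p={p_value:.3f})")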