import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.options.display.float_format = '{:.3f}'.format
# Map each benchmark dataset to a category: natural, specialized, structured, or retrieval
dataset_type = {
    "imagenet1k": "natural",
    "imagenetv2": "natural",
    "imagenet-r": "natural",
    "imagenet_sketch": "specialized",
    "objectnet": "natural",
    "imagenet-a": "natural",
    "imagenet-o": "natural",
    "vtab/cifar10": "natural",
    "vtab/cifar100": "natural",
    "mnist": "specialized",
    "vtab/flowers": "natural",
    "cars": "natural",
    "vtab/svhn": "natural",
    "fer2013": "natural",
    "renderedsst2": "specialized",
    "vtab/pets": "natural",
    "vtab/caltech101": "natural",
    "voc2007_multilabel": "natural",
    "voc2007": "natural",
    "sun397": "natural",
    "fgvc_aircraft": "natural",
    "country211": "natural",
    "vtab/dtd": "natural",
    "gtsrb": "natural",
    "stl10": "natural",
    "vtab/diabetic_retinopathy": "specialized",
    "vtab/eurosat": "specialized",
    "vtab/resisc45": "specialized",
    "vtab/pcam": "specialized",
    "vtab/clevr_count_all": "structured",
    "vtab/clevr_closest_object_distance": "structured",
    "vtab/dsprites_label_orientation": "structured",
    "vtab/dsprites_label_x_position": "structured",
    "vtab/smallnorb_label_elevation": "structured",
    "vtab/smallnorb_label_azimuth": "structured",
    "vtab/dmlab": "structured",
    "vtab/kitti_closest_vehicle_distance": "structured",
    "mscoco_captions": "retrieval",
    "flickr8k": "retrieval",
    "flickr30k": "retrieval",
}
def extract_arch(model):
    # Keep the first three dash-separated components, e.g. "ViT-B-32-quickgelu" -> "ViT-B-32"
    vit, size, patch_size, *rest = model.split("-")
    return vit + "-" + size + "-" + patch_size
df = pd.read_csv("benchmark.csv")
vtab_plus = list(map(lambda s:s.strip(), open("datasets.txt").readlines()))
df = df[df.dataset.isin(vtab_plus)]
df.loc[:, "dataset_type"] = df.dataset.apply(lambda d:dataset_type[d])
df.loc[:, "model_arch"] = df.model.apply(extract_arch)
df["model_fullname"]=df["model_fullname"].str.replace("/fsx/rom1504/open_clip/good_models/", "openclip ")
df["model_fullname"]=df["model_fullname"].str.replace("/fsx/rwightman/", "openclip ")
df["pretrained"]=df["pretrained"].str.replace("/fsx/rom1504/open_clip/good_models/", "openclip ")
df["pretrained"]=df["pretrained"].str.replace("/fsx/rwightman/", "openclip ")
df_retrieval = df[df["dataset_type"] == "retrieval"]
df = df[df["dataset_type"] != "retrieval"]
df = df.drop(["image_retrieval_recall@5", "text_retrieval_recall@5"], axis=1)
dataset_type = {k:v for k,v in dataset_type.items() if v != "retrieval"}
fig = plt.figure(figsize=(12,8))
#order = df.sort_values(by="dataset_type").dataset.unique()
order = list(dataset_type.keys())
ax = sns.barplot(
    x="dataset", y="acc1",
    data=df,
    order=order,
    hue="model_fullname"
)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax
[Figure: bar plot, x = dataset, y = acc1, hue = model_fullname]
fig = plt.figure(figsize=(12,8))
order = list(dataset_type.keys())
d = df[df.model_arch=="ViT-B-32"]
ax = sns.barplot(
    x="dataset", y="acc1",
    data=d,
    order=order,
    hue="model_fullname"
)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax
[Figure: bar plot, x = dataset, y = acc1, hue = model_fullname, ViT-B-32 models only]
fig = plt.figure(figsize=(12,8))
order = list(dataset_type.keys())
ax = sns.barplot(
    x="dataset", y="acc1", data=df,
    order=order
)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax
[Figure: bar plot, x = dataset, y = acc1, averaged over all models]
fig = plt.figure(figsize=(12,8))
order = list(dataset_type.keys())
ax = sns.barplot(
    x="dataset", y="acc1",
    data=df,
    order=order,
    hue="model_arch"
)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax
[Figure: bar plot, x = dataset, y = acc1, hue = model_arch]
fig = plt.figure(figsize=(12,8))
order = list(dataset_type.keys())
d = df.copy()
ax = sns.barplot(
    x="dataset", y="acc1",
    data=d,
    order=order,
    hue="pretrained"
)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax
[Figure: bar plot, x = dataset, y = acc1, hue = pretrained]
fig = plt.figure(figsize=(12,8))
order = list(dataset_type.keys())
d = df.copy()
ax = sns.barplot(
    x="dataset", y="acc1",
    data=d,
    order=order,
    hue="pretrained",
    estimator=np.max,
    errorbar=None  # seaborn >= 0.12 name for the deprecated ci=None
)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax
[Figure: bar plot, x = dataset, y = acc1, hue = pretrained, best (max) accuracy per group]
metric = "acc1"
df_metric = pd.pivot(df, index="model_fullname", columns="dataset", values=metric).T.dropna()
df_metric
dataset | ViT-B-32-quickgelu laion400m_e32 | roberta-ViT-B-32 /fsx/rom1504/open_clip/roberta_B_32/checkpoints/epoch_90.pt |
---|---|---|
cars | 0.792 | 0.832 |
country211 | 0.147 | 0.147 |
fer2013 | 0.427 | 0.421 |
fgvc_aircraft | 0.168 | 0.174 |
gtsrb | 0.420 | 0.409 |
imagenet-a | 0.217 | 0.212 |
imagenet-r | 0.734 | 0.722 |
imagenet1k | 0.629 | 0.617 |
imagenet_sketch | 0.493 | 0.491 |
imagenetv2 | 0.551 | 0.533 |
mnist | 0.374 | 0.663 |
objectnet | 0.439 | 0.451 |
renderedsst2 | 0.526 | 0.544 |
stl10 | 0.955 | 0.956 |
sun397 | 0.670 | 0.663 |
voc2007 | 0.757 | 0.780 |
vtab/caltech101 | 0.833 | 0.826 |
vtab/cifar10 | 0.908 | 0.932 |
vtab/cifar100 | 0.702 | 0.750 |
vtab/clevr_closest_object_distance | 0.159 | 0.201 |
vtab/clevr_count_all | 0.163 | 0.147 |
vtab/diabetic_retinopathy | 0.338 | 0.502 |
vtab/dmlab | 0.172 | 0.129 |
vtab/dsprites_label_orientation | 0.019 | 0.025 |
vtab/dsprites_label_x_position | 0.029 | 0.028 |
vtab/dtd | 0.543 | 0.591 |
vtab/eurosat | 0.516 | 0.521 |
vtab/flowers | 0.683 | 0.621 |
vtab/kitti_closest_vehicle_distance | 0.288 | 0.387 |
vtab/pcam | 0.546 | 0.498 |
vtab/pets | 0.868 | 0.868 |
vtab/resisc45 | 0.546 | 0.612 |
vtab/smallnorb_label_azimuth | 0.045 | 0.060 |
vtab/smallnorb_label_elevation | 0.097 | 0.102 |
vtab/svhn | 0.279 | 0.442 |
metric = "mean_per_class_recall"
df_metric = pd.pivot(df, index="model_fullname", columns="dataset", values=metric).T.dropna()
df_metric
dataset | ViT-B-32-quickgelu laion400m_e32 | roberta-ViT-B-32 /fsx/rom1504/open_clip/roberta_B_32/checkpoints/epoch_90.pt |
---|---|---|
cars | 0.793 | 0.830 |
country211 | 0.147 | 0.147 |
fer2013 | 0.399 | 0.401 |
fgvc_aircraft | 0.166 | 0.174 |
gtsrb | 0.393 | 0.383 |
imagenet-a | 0.235 | 0.242 |
imagenet-r | 0.721 | 0.708 |
imagenet1k | 0.629 | 0.617 |
imagenet_sketch | 0.494 | 0.491 |
imagenetv2 | 0.551 | 0.533 |
mnist | 0.371 | 0.659 |
objectnet | 0.427 | 0.440 |
renderedsst2 | 0.526 | 0.545 |
stl10 | 0.955 | 0.957 |
sun397 | 0.661 | 0.664 |
voc2007 | 0.791 | 0.809 |
vtab/caltech101 | 0.909 | 0.905 |
vtab/cifar10 | 0.908 | 0.933 |
vtab/cifar100 | 0.703 | 0.750 |
vtab/clevr_closest_object_distance | 0.167 | 0.167 |
vtab/clevr_count_all | 0.158 | 0.144 |
vtab/diabetic_retinopathy | 0.259 | 0.202 |
vtab/dmlab | 0.158 | 0.160 |
vtab/dsprites_label_orientation | 0.020 | 0.026 |
vtab/dsprites_label_x_position | 0.031 | 0.028 |
vtab/dtd | 0.547 | 0.593 |
vtab/eurosat | 0.526 | 0.534 |
vtab/flowers | 0.663 | 0.590 |
vtab/kitti_closest_vehicle_distance | 0.365 | 0.404 |
vtab/pcam | 0.546 | 0.498 |
vtab/pets | 0.866 | 0.867 |
vtab/resisc45 | 0.554 | 0.616 |
vtab/smallnorb_label_azimuth | 0.045 | 0.060 |
vtab/smallnorb_label_elevation | 0.097 | 0.102 |
vtab/svhn | 0.280 | 0.393 |
# Imagenet robustness results
metric = "acc1"
df_metric = pd.pivot(df, index="model_fullname", columns="dataset", values=metric).T.dropna()
df_metric[(df_metric.index.str.startswith("imagenet")) | (df_metric.index=="objectnet")]
dataset | ViT-B-32-quickgelu laion400m_e32 | roberta-ViT-B-32 /fsx/rom1504/open_clip/roberta_B_32/checkpoints/epoch_90.pt |
---|---|---|
imagenet-a | 0.217 | 0.212 |
imagenet-r | 0.734 | 0.722 |
imagenet1k | 0.629 | 0.617 |
imagenet_sketch | 0.493 | 0.491 |
imagenetv2 | 0.551 | 0.533 |
objectnet | 0.439 | 0.451 |
Here, following "Measuring Robustness to Natural Distribution Shifts in Image Classification" (https://arxiv.org/pdf/2007.00644.pdf, https://share.streamlit.io/modestyachts/imagenet-testbed-website/main/website.py), we show the deviation from the line fit of (x=imagenet1k accuracy, y=imagenetv2/imagenet-1/imagenet_sketch) which was used to measure robustnest improvements separately from accuracy improvements in imagenet1k, as the two are correlated.
In the plot below, points above the fitted line indicate better robustness than the trend predicts.
df_metric = pd.pivot(df, index="model_fullname", columns="dataset", values="acc1").T.dropna()
dataset = "imagenetv2"
line_fits_data = {
    # slopes and intercepts from https://share.streamlit.io/modestyachts/imagenet-testbed-website/main/website.py
    "imagenetv2": (1.112, -20.433),
    "imagenet-r": (1.549, -104.556),
    "imagenet_sketch": (0.931, -45.373)
}
x = np.linspace(0, 100, 100)
slope, intercept = line_fits_data[dataset]
y = x * slope + intercept
plt.xlim(55, 90)
plt.ylim(40, 90)
d = df_metric.T[["imagenet1k", dataset]] * 100
plt.scatter(d["imagenet1k"], d[dataset], color="green", label="evaluated models")
plt.plot(x, y, color="red", label="line fit")
plt.xlabel("imagenet1k top-1 accuracy (%)")
plt.ylabel(f"{dataset} top-1 accuracy (%)")
plt.legend()
[Figure: scatter of imagenetv2 vs imagenet1k top-1 accuracy (%), with the reference line fit]
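The plot above shows the deviation visually for one dataset; the minimal sketch below (reusing df_metric and line_fits_data defined above) computes it numerically for every shifted dataset, as the difference in accuracy points between each model's measured accuracy and the value the line fit predicts from its imagenet1k accuracy.
# Effective robustness sketch: deviation of each model's shifted-dataset accuracy
# from the value predicted by the line fit on imagenet1k accuracy.
# Positive values = more robust than the fit predicts.
acc = df_metric.T * 100  # rows: model_fullname, columns: datasets, in %
for shifted, (slope, intercept) in line_fits_data.items():
    predicted = acc["imagenet1k"] * slope + intercept
    deviation = (acc[shifted] - predicted).round(2)
    print(f"Deviation from the {shifted} line fit (accuracy points):")
    print(deviation)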
metric = "mean_per_class_recall"
pd.pivot(df, index="model_fullname", columns="dataset", values=metric).T.dropna()
model_fullname | ViT-B-32-quickgelu laion400m_e32 | roberta-ViT-B-32 /fsx/rom1504/open_clip/roberta_B_32/checkpoints/epoch_90.pt |
---|---|---|
dataset | ||
cars | 0.793 | 0.830 |
country211 | 0.147 | 0.147 |
fer2013 | 0.399 | 0.401 |
fgvc_aircraft | 0.166 | 0.174 |
gtsrb | 0.393 | 0.383 |
imagenet-a | 0.235 | 0.242 |
imagenet-r | 0.721 | 0.708 |
imagenet1k | 0.629 | 0.617 |
imagenet_sketch | 0.494 | 0.491 |
imagenetv2 | 0.551 | 0.533 |
mnist | 0.371 | 0.659 |
objectnet | 0.427 | 0.440 |
renderedsst2 | 0.526 | 0.545 |
stl10 | 0.955 | 0.957 |
sun397 | 0.661 | 0.664 |
voc2007 | 0.791 | 0.809 |
vtab/caltech101 | 0.909 | 0.905 |
vtab/cifar10 | 0.908 | 0.933 |
vtab/cifar100 | 0.703 | 0.750 |
vtab/clevr_closest_object_distance | 0.167 | 0.167 |
vtab/clevr_count_all | 0.158 | 0.144 |
vtab/diabetic_retinopathy | 0.259 | 0.202 |
vtab/dmlab | 0.158 | 0.160 |
vtab/dsprites_label_orientation | 0.020 | 0.026 |
vtab/dsprites_label_x_position | 0.031 | 0.028 |
vtab/dtd | 0.547 | 0.593 |
vtab/eurosat | 0.526 | 0.534 |
vtab/flowers | 0.663 | 0.590 |
vtab/kitti_closest_vehicle_distance | 0.365 | 0.404 |
vtab/pcam | 0.546 | 0.498 |
vtab/pets | 0.866 | 0.867 |
vtab/resisc45 | 0.554 | 0.616 |
vtab/smallnorb_label_azimuth | 0.045 | 0.060 |
vtab/smallnorb_label_elevation | 0.097 | 0.102 |
vtab/svhn | 0.280 | 0.393 |
# For multi-label classification tasks
metric = "mean_average_precision"
pd.pivot(df, index="model_fullname", columns="dataset", values=metric).T.dropna()
dataset | ViT-B-32-quickgelu laion400m_e32 | roberta-ViT-B-32 /fsx/rom1504/open_clip/roberta_B_32/checkpoints/epoch_90.pt |
---|---|---|
voc2007_multilabel | 0.762 | 0.766 |
metric = "image_retrieval_recall@5"
pd.pivot(df_retrieval, index="model_fullname", columns="dataset", values=metric).T.dropna()
dataset | ViT-B-32-quickgelu laion400m_e32 | roberta-ViT-B-32 /fsx/rom1504/open_clip/roberta_B_32/checkpoints/epoch_90.pt |
---|---|---|
flickr30k | 0.855 | 0.868 |
flickr8k | 0.579 | 0.595 |
mscoco_captions | 0.608 | 0.631 |
metric = "text_retrieval_recall@5"
pd.pivot(df_retrieval, index="model_fullname", columns="dataset", values=metric).T.dropna()
dataset | ViT-B-32-quickgelu laion400m_e32 | roberta-ViT-B-32 /fsx/rom1504/open_clip/roberta_B_32/checkpoints/epoch_90.pt |
---|---|---|
flickr30k | 0.941 | 0.948 |
flickr8k | 0.739 | 0.751 |
mscoco_captions | 0.768 | 0.778 |
See VTAB (https://arxiv.org/pdf/1910.04867.pdf, Section E) for a discussion of different aggregation strategies and how strongly they correlate. They find that all aggregation strategies have a high Kendall correlation with the simple mean of top-1 accuracy over datasets (a small check along these lines is sketched at the end of this section).
df.groupby("model_fullname").agg(['mean', 'std', 'median']).sort_values(by=("acc1", "mean"), ascending=False)
model_fullname | acc1 mean | acc1 std | acc1 median | acc5 mean | acc5 std | acc5 median | mean_per_class_recall mean | mean_per_class_recall std | mean_per_class_recall median | mean_average_precision mean | mean_average_precision std | mean_average_precision median |
---|---|---|---|---|---|---|---|---|---|---|---|---|
roberta-ViT-B-32 /fsx/rom1504/open_clip/roberta_B_32/checkpoints/epoch_90.pt | 0.482 | 0.273 | 0.502 | 0.768 | 0.255 | 0.880 | 0.474 | 0.280 | 0.498 | 0.766 | NaN | 0.766 |
ViT-B-32-quickgelu laion400m_e32 | 0.458 | 0.272 | 0.493 | 0.757 | 0.254 | 0.858 | 0.459 | 0.276 | 0.494 | 0.762 | NaN | 0.762 |
metric = "acc1"
df_metric = pd.pivot(df, index="model_fullname", columns="dataset", values=metric).T.dropna()
# Rank models within each dataset (1 = best), then average the ranks across datasets
df_metric.rank(axis=1, ascending=False).agg(["mean", "std"]).T.sort_values(by="mean", ascending=True)
model_fullname | mean rank | std |
---|---|---|
roberta-ViT-B-32 /fsx/rom1504/open_clip/roberta_B_32/checkpoints/epoch_90.pt | 1.429 | 0.502 |
ViT-B-32-quickgelu laion400m_e32 | 1.571 | 0.502 |
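As a quick check on the VTAB point above, the sketch below compares the model ordering given by mean top-1 accuracy with the ordering given by mean rank, using Kendall's tau (this assumes scipy is available; df_metric is reused from the previous cell).
# Agreement between aggregation strategies: Kendall's tau between the ranking of models
# by mean top-1 accuracy and the ranking by mean rank across datasets.
from scipy.stats import kendalltau
mean_acc_rank = df_metric.mean(axis=0).rank(ascending=False)      # rank by mean accuracy (1 = best)
mean_rank = df_metric.rank(axis=1, ascending=False).mean(axis=0)  # mean per-dataset rank (lower = better)
tau, p_value = kendalltau(mean_acc_rank, mean_rank)
print(f"Kendall tau between the two orderings: {tau:.3f} (p={p_value:.3f})")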