Upload folder using huggingface_hub
This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
- .gitattributes +25 -0
- .gitignore +135 -0
- .gitmodules +7 -0
- CITATION.cff +8 -0
- Dockerfile +30 -0
- EfficientSAM/EdgeSAM/common.py +118 -0
- EfficientSAM/EdgeSAM/rep_vit.py +370 -0
- EfficientSAM/EdgeSAM/setup_edge_sam.py +90 -0
- EfficientSAM/FastSAM/tools.py +413 -0
- EfficientSAM/LightHQSAM/example_light_hqsam.png +3 -0
- EfficientSAM/LightHQSAM/grounded_light_hqsam_annotated_image.jpg +0 -0
- EfficientSAM/LightHQSAM/setup_light_hqsam.py +45 -0
- EfficientSAM/LightHQSAM/tiny_vit_sam.py +724 -0
- EfficientSAM/MobileSAM/setup_mobile_sam.py +44 -0
- EfficientSAM/MobileSAM/tiny_vit_sam.py +716 -0
- EfficientSAM/README.md +194 -0
- EfficientSAM/RepViTSAM/repvit.py +364 -0
- EfficientSAM/RepViTSAM/setup_repvit_sam.py +53 -0
- EfficientSAM/grounded_edge_sam.py +107 -0
- EfficientSAM/grounded_efficient_sam.py +118 -0
- EfficientSAM/grounded_fast_sam.py +141 -0
- EfficientSAM/grounded_light_hqsam.py +109 -0
- EfficientSAM/grounded_mobile_sam.py +145 -0
- EfficientSAM/grounded_repvit_sam.py +107 -0
- GroundingDINO/.asset/COCO.png +0 -0
- GroundingDINO/.asset/GD_GLIGEN.png +3 -0
- GroundingDINO/.asset/GD_SD.png +3 -0
- GroundingDINO/.asset/ODinW.png +0 -0
- GroundingDINO/.asset/arch.png +0 -0
- GroundingDINO/.asset/cats.png +0 -0
- GroundingDINO/.asset/hero_figure.png +3 -0
- GroundingDINO/LICENSE +201 -0
- GroundingDINO/README.md +163 -0
- GroundingDINO/demo/gradio_app.py +125 -0
- GroundingDINO/demo/inference_on_a_image.py +172 -0
- GroundingDINO/groundingdino/__init__.py +0 -0
- GroundingDINO/groundingdino/config/GroundingDINO_SwinB.py +43 -0
- GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py +43 -0
- GroundingDINO/groundingdino/datasets/__init__.py +0 -0
- GroundingDINO/groundingdino/datasets/transforms.py +311 -0
- GroundingDINO/groundingdino/models/GroundingDINO/__init__.py +15 -0
- GroundingDINO/groundingdino/models/GroundingDINO/backbone/__init__.py +1 -0
- GroundingDINO/groundingdino/models/GroundingDINO/backbone/backbone.py +221 -0
- GroundingDINO/groundingdino/models/GroundingDINO/backbone/position_encoding.py +186 -0
- GroundingDINO/groundingdino/models/GroundingDINO/backbone/swin_transformer.py +802 -0
- GroundingDINO/groundingdino/models/GroundingDINO/bertwarper.py +273 -0
- GroundingDINO/groundingdino/models/GroundingDINO/csrc/MsDeformAttn/ms_deform_attn.h +64 -0
- GroundingDINO/groundingdino/models/GroundingDINO/csrc/MsDeformAttn/ms_deform_attn_cpu.cpp +43 -0
- GroundingDINO/groundingdino/models/GroundingDINO/csrc/MsDeformAttn/ms_deform_attn_cpu.h +35 -0
- GroundingDINO/groundingdino/models/GroundingDINO/csrc/MsDeformAttn/ms_deform_attn_cuda.cu +156 -0
.gitattributes
CHANGED
@@ -33,3 +33,28 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+EfficientSAM/LightHQSAM/example_light_hqsam.png filter=lfs diff=lfs merge=lfs -text
+GroundingDINO/.asset/GD_GLIGEN.png filter=lfs diff=lfs merge=lfs -text
+GroundingDINO/.asset/GD_SD.png filter=lfs diff=lfs merge=lfs -text
+GroundingDINO/.asset/hero_figure.png filter=lfs diff=lfs merge=lfs -text
+VISAM/thirdparty/segment_anything/assets/masks1.png filter=lfs diff=lfs merge=lfs -text
+VISAM/thirdparty/segment_anything/assets/notebook2.png filter=lfs diff=lfs merge=lfs -text
+VISAM/visam.gif filter=lfs diff=lfs merge=lfs -text
+assets/acoustics/gsam_whisper_inpainting_demo.png filter=lfs diff=lfs merge=lfs -text
+assets/acoustics/gsam_whisper_inpainting_pipeline.png filter=lfs diff=lfs merge=lfs -text
+assets/demo9.jpg filter=lfs diff=lfs merge=lfs -text
+assets/gradio_demo.png filter=lfs diff=lfs merge=lfs -text
+assets/grounded_sam_demo3_demo4.png filter=lfs diff=lfs merge=lfs -text
+assets/grounded_sam_inpainting_demo.png filter=lfs diff=lfs merge=lfs -text
+assets/grounded_sam_new_demo_image.png filter=lfs diff=lfs merge=lfs -text
+assets/mask_3dbox.png filter=lfs diff=lfs merge=lfs -text
+assets/osx/grounded_sam_osx_demo.png filter=lfs diff=lfs merge=lfs -text
+assets/osx/grouned_sam_osx_demo.gif filter=lfs diff=lfs merge=lfs -text
+assets/ram_grounded_sam_new.png filter=lfs diff=lfs merge=lfs -text
+segment_anything/assets/masks1.png filter=lfs diff=lfs merge=lfs -text
+segment_anything/assets/notebook2.png filter=lfs diff=lfs merge=lfs -text
+voxelnext_3d_box/images/image_boxes1.png filter=lfs diff=lfs merge=lfs -text
+voxelnext_3d_box/images/image_boxes2.png filter=lfs diff=lfs merge=lfs -text
+voxelnext_3d_box/images/image_boxes3.png filter=lfs diff=lfs merge=lfs -text
+voxelnext_3d_box/images/mask_box.png filter=lfs diff=lfs merge=lfs -text
+voxelnext_3d_box/images/sam-voxelnext.png filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,135 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# checkpoint
*.pth
outputs/

.idea/
.gitmodules
ADDED
@@ -0,0 +1,7 @@

[submodule "grounded-sam-osx"]
    path = grounded-sam-osx
    url = https://github.com/linjing7/grounded-sam-osx.git
[submodule "VISAM"]
    path = VISAM
    url = https://github.com/BingfengYan/VISAM
CITATION.cff
ADDED
@@ -0,0 +1,8 @@
cff-version: 1.2.0
message: "If you use this software, please cite it as below."
authors:
- name: "Grounded-SAM Contributors"
title: "Grounded-Segment-Anything"
date-released: 2023-04-06
url: "https://github.com/IDEA-Research/Grounded-Segment-Anything"
license: Apache-2.0
Dockerfile
ADDED
@@ -0,0 +1,30 @@
FROM pytorch/pytorch:1.13.1-cuda11.6-cudnn8-devel

# Arguments to build Docker Image using CUDA
ARG USE_CUDA=0
ARG TORCH_ARCH=

ENV AM_I_DOCKER True
ENV BUILD_WITH_CUDA "${USE_CUDA}"
ENV TORCH_CUDA_ARCH_LIST "${TORCH_ARCH}"
ENV CUDA_HOME /usr/local/cuda-11.6/

RUN mkdir -p /home/appuser/Grounded-Segment-Anything
COPY . /home/appuser/Grounded-Segment-Anything/

RUN apt-get update && apt-get install --no-install-recommends wget ffmpeg=7:* \
    libsm6=2:* libxext6=2:* git=1:* nano=2.* \
    vim=2:* -y \
    && apt-get clean && apt-get autoremove && rm -rf /var/lib/apt/lists/*

WORKDIR /home/appuser/Grounded-Segment-Anything
RUN python -m pip install --no-cache-dir -e segment_anything

# When using build isolation, PyTorch with newer CUDA is installed and can't compile GroundingDINO
RUN python -m pip install --no-cache-dir wheel
RUN python -m pip install --no-cache-dir --no-build-isolation -e GroundingDINO

WORKDIR /home/appuser
RUN pip install --no-cache-dir diffusers[torch]==0.15.1 opencv-python==4.7.0.72 \
    pycocotools==2.0.6 matplotlib==3.5.3 \
    onnxruntime==1.14.1 onnx==1.13.1 ipykernel==6.16.2 scipy gradio openai
EfficientSAM/EdgeSAM/common.py
ADDED
@@ -0,0 +1,118 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import torch
import torch.nn as nn
import torch.nn.functional as F

from typing import Type


class MLPBlock(nn.Module):
    def __init__(
        self,
        embedding_dim: int,
        mlp_dim: int,
        act: Type[nn.Module] = nn.GELU,
    ) -> None:
        super().__init__()
        self.lin1 = nn.Linear(embedding_dim, mlp_dim)
        self.lin2 = nn.Linear(mlp_dim, embedding_dim)
        self.act = act()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.lin2(self.act(self.lin1(x)))


# From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa
# Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa
class LayerNorm2d(nn.Module):
    def __init__(self, num_channels: int, eps: float = 1e-6) -> None:
        super().__init__()
        self.weight = nn.Parameter(torch.ones(num_channels))
        self.bias = nn.Parameter(torch.zeros(num_channels))
        self.eps = eps

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        u = x.mean(1, keepdim=True)
        s = (x - u).pow(2).mean(1, keepdim=True)
        x = (x - u) / torch.sqrt(s + self.eps)
        x = self.weight[:, None, None] * x + self.bias[:, None, None]
        return x


def val2list(x: list or tuple or any, repeat_time=1) -> list:
    if isinstance(x, (list, tuple)):
        return list(x)
    return [x for _ in range(repeat_time)]


def val2tuple(x: list or tuple or any, min_len: int = 1, idx_repeat: int = -1) -> tuple:
    x = val2list(x)

    # repeat elements if necessary
    if len(x) > 0:
        x[idx_repeat:idx_repeat] = [x[idx_repeat] for _ in range(min_len - len(x))]

    return tuple(x)


def list_sum(x: list) -> any:
    return x[0] if len(x) == 1 else x[0] + list_sum(x[1:])


def resize(
    x: torch.Tensor,
    size: any or None = None,
    scale_factor=None,
    mode: str = "bicubic",
    align_corners: bool or None = False,
) -> torch.Tensor:
    if mode in ["bilinear", "bicubic"]:
        return F.interpolate(
            x,
            size=size,
            scale_factor=scale_factor,
            mode=mode,
            align_corners=align_corners,
        )
    elif mode in ["nearest", "area"]:
        return F.interpolate(x, size=size, scale_factor=scale_factor, mode=mode)
    else:
        raise NotImplementedError(f"resize(mode={mode}) not implemented.")


class UpSampleLayer(nn.Module):
    def __init__(
        self,
        mode="bicubic",
        size=None,
        factor=2,
        align_corners=False,
    ):
        super(UpSampleLayer, self).__init__()
        self.mode = mode
        self.size = val2list(size, 2) if size is not None else None
        self.factor = None if self.size is not None else factor
        self.align_corners = align_corners

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return resize(x, self.size, self.factor, self.mode, self.align_corners)


class OpSequential(nn.Module):
    def __init__(self, op_list):
        super(OpSequential, self).__init__()
        valid_op_list = []
        for op in op_list:
            if op is not None:
                valid_op_list.append(op)
        self.op_list = nn.ModuleList(valid_op_list)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        for op in self.op_list:
            x = op(x)
        return x
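Usage note (illustrative, not part of the uploaded files): a minimal sketch of the two helpers above that only assumes PyTorch is installed and that the EfficientSAM directory is the working directory so that EdgeSAM is importable, as rep_vit.py below implies. LayerNorm2d normalizes an NCHW tensor over its channel dimension, and UpSampleLayer wraps F.interpolate via resize.

import torch
from EdgeSAM.common import LayerNorm2d, UpSampleLayer  # import path assumed from rep_vit.py below

x = torch.randn(2, 256, 64, 64)              # NCHW feature map
norm = LayerNorm2d(256)                      # per-channel affine LayerNorm over dim 1
up = UpSampleLayer(mode="bicubic", factor=2)
y = up(norm(x))                              # bicubic upsample by 2 -> (2, 256, 128, 128)
print(y.shape)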
EfficientSAM/EdgeSAM/rep_vit.py
ADDED
@@ -0,0 +1,370 @@
import torch.nn as nn
from EdgeSAM.common import LayerNorm2d, UpSampleLayer, OpSequential

__all__ = ['rep_vit_m1', 'rep_vit_m2', 'rep_vit_m3', 'RepViT']

m1_cfgs = [
    # k, t, c, SE, HS, s
    [3, 2, 48, 1, 0, 1],
    [3, 2, 48, 0, 0, 1],
    [3, 2, 48, 0, 0, 1],
    [3, 2, 96, 0, 0, 2],
    [3, 2, 96, 1, 0, 1],
    [3, 2, 96, 0, 0, 1],
    [3, 2, 96, 0, 0, 1],
    [3, 2, 192, 0, 1, 2],
    [3, 2, 192, 1, 1, 1],
    [3, 2, 192, 0, 1, 1],
    [3, 2, 192, 1, 1, 1],
    [3, 2, 192, 0, 1, 1],
    [3, 2, 192, 1, 1, 1],
    [3, 2, 192, 0, 1, 1],
    [3, 2, 192, 1, 1, 1],
    [3, 2, 192, 0, 1, 1],
    [3, 2, 192, 1, 1, 1],
    [3, 2, 192, 0, 1, 1],
    [3, 2, 192, 1, 1, 1],
    [3, 2, 192, 0, 1, 1],
    [3, 2, 192, 1, 1, 1],
    [3, 2, 192, 0, 1, 1],
    [3, 2, 192, 0, 1, 1],
    [3, 2, 384, 0, 1, 2],
    [3, 2, 384, 1, 1, 1],
    [3, 2, 384, 0, 1, 1]
]

m2_cfgs = [
    # k, t, c, SE, HS, s
    [3, 2, 64, 1, 0, 1],
    [3, 2, 64, 0, 0, 1],
    [3, 2, 64, 0, 0, 1],
    [3, 2, 128, 0, 0, 2],
    [3, 2, 128, 1, 0, 1],
    [3, 2, 128, 0, 0, 1],
    [3, 2, 128, 0, 0, 1],
    [3, 2, 256, 0, 1, 2],
    [3, 2, 256, 1, 1, 1],
    [3, 2, 256, 0, 1, 1],
    [3, 2, 256, 1, 1, 1],
    [3, 2, 256, 0, 1, 1],
    [3, 2, 256, 1, 1, 1],
    [3, 2, 256, 0, 1, 1],
    [3, 2, 256, 1, 1, 1],
    [3, 2, 256, 0, 1, 1],
    [3, 2, 256, 1, 1, 1],
    [3, 2, 256, 0, 1, 1],
    [3, 2, 256, 1, 1, 1],
    [3, 2, 256, 0, 1, 1],
    [3, 2, 256, 0, 1, 1],
    [3, 2, 512, 0, 1, 2],
    [3, 2, 512, 1, 1, 1],
    [3, 2, 512, 0, 1, 1]
]

m3_cfgs = [
    # k, t, c, SE, HS, s
    [3, 2, 64, 1, 0, 1],
    [3, 2, 64, 0, 0, 1],
    [3, 2, 64, 1, 0, 1],
    [3, 2, 64, 0, 0, 1],
    [3, 2, 64, 0, 0, 1],
    [3, 2, 128, 0, 0, 2],
    [3, 2, 128, 1, 0, 1],
    [3, 2, 128, 0, 0, 1],
    [3, 2, 128, 1, 0, 1],
    [3, 2, 128, 0, 0, 1],
    [3, 2, 128, 0, 0, 1],
    [3, 2, 256, 0, 1, 2],
    [3, 2, 256, 1, 1, 1],
    [3, 2, 256, 0, 1, 1],
    [3, 2, 256, 1, 1, 1],
    [3, 2, 256, 0, 1, 1],
    [3, 2, 256, 1, 1, 1],
    [3, 2, 256, 0, 1, 1],
    [3, 2, 256, 1, 1, 1],
    [3, 2, 256, 0, 1, 1],
    [3, 2, 256, 1, 1, 1],
    [3, 2, 256, 0, 1, 1],
    [3, 2, 256, 1, 1, 1],
    [3, 2, 256, 0, 1, 1],
    [3, 2, 256, 1, 1, 1],
    [3, 2, 256, 0, 1, 1],
    [3, 2, 256, 1, 1, 1],
    [3, 2, 256, 0, 1, 1],
    [3, 2, 256, 0, 1, 1],
    [3, 2, 512, 0, 1, 2],
    [3, 2, 512, 1, 1, 1],
    [3, 2, 512, 0, 1, 1]
]


def _make_divisible(v, divisor, min_value=None):
    """
    This function is taken from the original tf repo.
    It ensures that all layers have a channel number that is divisible by 8
    It can be seen here:
    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
    :param v:
    :param divisor:
    :param min_value:
    :return:
    """
    if min_value is None:
        min_value = divisor
    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
    # Make sure that round down does not go down by more than 10%.
    if new_v < 0.9 * v:
        new_v += divisor
    return new_v


from timm.models.layers import SqueezeExcite

import torch


class Conv2d_BN(torch.nn.Sequential):
    def __init__(self, a, b, ks=1, stride=1, pad=0, dilation=1,
                 groups=1, bn_weight_init=1, resolution=-10000):
        super().__init__()
        self.add_module('c', torch.nn.Conv2d(
            a, b, ks, stride, pad, dilation, groups, bias=False))
        self.add_module('bn', torch.nn.BatchNorm2d(b))
        torch.nn.init.constant_(self.bn.weight, bn_weight_init)
        torch.nn.init.constant_(self.bn.bias, 0)

    @torch.no_grad()
    def fuse(self):
        c, bn = self._modules.values()
        w = bn.weight / (bn.running_var + bn.eps) ** 0.5
        w = c.weight * w[:, None, None, None]
        b = bn.bias - bn.running_mean * bn.weight / \
            (bn.running_var + bn.eps) ** 0.5
        m = torch.nn.Conv2d(w.size(1) * self.c.groups, w.size(
            0), w.shape[2:], stride=self.c.stride, padding=self.c.padding, dilation=self.c.dilation,
            groups=self.c.groups,
            device=c.weight.device)
        m.weight.data.copy_(w)
        m.bias.data.copy_(b)
        return m


class Residual(torch.nn.Module):
    def __init__(self, m, drop=0.):
        super().__init__()
        self.m = m
        self.drop = drop

    def forward(self, x):
        if self.training and self.drop > 0:
            return x + self.m(x) * torch.rand(x.size(0), 1, 1, 1,
                                              device=x.device).ge_(self.drop).div(1 - self.drop).detach()
        else:
            return x + self.m(x)

    @torch.no_grad()
    def fuse(self):
        if isinstance(self.m, Conv2d_BN):
            m = self.m.fuse()
            assert (m.groups == m.in_channels)
            identity = torch.ones(m.weight.shape[0], m.weight.shape[1], 1, 1)
            identity = torch.nn.functional.pad(identity, [1, 1, 1, 1])
            m.weight += identity.to(m.weight.device)
            return m
        elif isinstance(self.m, torch.nn.Conv2d):
            m = self.m
            assert (m.groups != m.in_channels)
            identity = torch.ones(m.weight.shape[0], m.weight.shape[1], 1, 1)
            identity = torch.nn.functional.pad(identity, [1, 1, 1, 1])
            m.weight += identity.to(m.weight.device)
            return m
        else:
            return self


class RepVGGDW(torch.nn.Module):
    def __init__(self, ed) -> None:
        super().__init__()
        self.conv = Conv2d_BN(ed, ed, 3, 1, 1, groups=ed)
        self.conv1 = Conv2d_BN(ed, ed, 1, 1, 0, groups=ed)
        self.dim = ed

    def forward(self, x):
        return self.conv(x) + self.conv1(x) + x

    @torch.no_grad()
    def fuse(self):
        conv = self.conv.fuse()
        conv1 = self.conv1.fuse()

        conv_w = conv.weight
        conv_b = conv.bias
        conv1_w = conv1.weight
        conv1_b = conv1.bias

        conv1_w = torch.nn.functional.pad(conv1_w, [1, 1, 1, 1])

        identity = torch.nn.functional.pad(torch.ones(conv1_w.shape[0], conv1_w.shape[1], 1, 1, device=conv1_w.device),
                                           [1, 1, 1, 1])

        final_conv_w = conv_w + conv1_w + identity
        final_conv_b = conv_b + conv1_b

        conv.weight.data.copy_(final_conv_w)
        conv.bias.data.copy_(final_conv_b)
        return conv


class RepViTBlock(nn.Module):
    def __init__(self, inp, hidden_dim, oup, kernel_size, stride, use_se, use_hs, skip_downsample=False):
        super(RepViTBlock, self).__init__()
        assert stride in [1, 2]

        self.identity = stride == 1 and inp == oup
        assert (hidden_dim == 2 * inp)

        if stride == 2:
            if skip_downsample:
                stride = 1
            self.token_mixer = nn.Sequential(
                Conv2d_BN(inp, inp, kernel_size, stride, (kernel_size - 1) // 2, groups=inp),
                SqueezeExcite(inp, 0.25) if use_se else nn.Identity(),
                Conv2d_BN(inp, oup, ks=1, stride=1, pad=0)
            )
            self.channel_mixer = Residual(nn.Sequential(
                # pw
                Conv2d_BN(oup, 2 * oup, 1, 1, 0),
                nn.GELU() if use_hs else nn.GELU(),
                # pw-linear
                Conv2d_BN(2 * oup, oup, 1, 1, 0, bn_weight_init=0),
            ))
        else:
            assert (self.identity)
            self.token_mixer = nn.Sequential(
                RepVGGDW(inp),
                SqueezeExcite(inp, 0.25) if use_se else nn.Identity(),
            )
            self.channel_mixer = Residual(nn.Sequential(
                # pw
                Conv2d_BN(inp, hidden_dim, 1, 1, 0),
                nn.GELU() if use_hs else nn.GELU(),
                # pw-linear
                Conv2d_BN(hidden_dim, oup, 1, 1, 0, bn_weight_init=0),
            ))

    def forward(self, x):
        return self.channel_mixer(self.token_mixer(x))


from timm.models.vision_transformer import trunc_normal_


class BN_Linear(torch.nn.Sequential):
    def __init__(self, a, b, bias=True, std=0.02):
        super().__init__()
        self.add_module('bn', torch.nn.BatchNorm1d(a))
        self.add_module('l', torch.nn.Linear(a, b, bias=bias))
        trunc_normal_(self.l.weight, std=std)
        if bias:
            torch.nn.init.constant_(self.l.bias, 0)

    @torch.no_grad()
    def fuse(self):
        bn, l = self._modules.values()
        w = bn.weight / (bn.running_var + bn.eps) ** 0.5
        b = bn.bias - self.bn.running_mean * \
            self.bn.weight / (bn.running_var + bn.eps) ** 0.5
        w = l.weight * w[None, :]
        if l.bias is None:
            b = b @ self.l.weight.T
        else:
            b = (l.weight @ b[:, None]).view(-1) + self.l.bias
        m = torch.nn.Linear(w.size(1), w.size(0), device=l.weight.device)
        m.weight.data.copy_(w)
        m.bias.data.copy_(b)
        return m


class RepViT(nn.Module):
    arch_settings = {
        'm1': m1_cfgs,
        'm2': m2_cfgs,
        'm3': m3_cfgs
    }

    def __init__(self, arch, img_size=1024, upsample_mode='bicubic'):
        super(RepViT, self).__init__()
        # setting of inverted residual blocks
        self.cfgs = self.arch_settings[arch]
        self.img_size = img_size

        # building first layer
        input_channel = self.cfgs[0][2]
        patch_embed = torch.nn.Sequential(Conv2d_BN(3, input_channel // 2, 3, 2, 1), torch.nn.GELU(),
                                          Conv2d_BN(input_channel // 2, input_channel, 3, 2, 1))
        layers = [patch_embed]
        # building inverted residual blocks
        block = RepViTBlock
        self.stage_idx = []
        prev_c = input_channel
        for idx, (k, t, c, use_se, use_hs, s) in enumerate(self.cfgs):
            output_channel = _make_divisible(c, 8)
            exp_size = _make_divisible(input_channel * t, 8)
            skip_downsample = False
            if c != prev_c:
                self.stage_idx.append(idx - 1)
                prev_c = c
            layers.append(block(input_channel, exp_size, output_channel, k, s, use_se, use_hs, skip_downsample))
            input_channel = output_channel
        self.stage_idx.append(idx)
        self.features = nn.ModuleList(layers)

        stage2_channels = _make_divisible(self.cfgs[self.stage_idx[2]][2], 8)
        stage3_channels = _make_divisible(self.cfgs[self.stage_idx[3]][2], 8)
        self.fuse_stage2 = nn.Conv2d(stage2_channels, 256, kernel_size=1, bias=False)
        self.fuse_stage3 = OpSequential([
            nn.Conv2d(stage3_channels, 256, kernel_size=1, bias=False),
            UpSampleLayer(factor=2, mode=upsample_mode),
        ])

        self.neck = nn.Sequential(
            nn.Conv2d(256, 256, kernel_size=1, bias=False),
            LayerNorm2d(256),
            nn.Conv2d(256, 256, kernel_size=3, padding=1, bias=False),
            LayerNorm2d(256),
        )

    def forward(self, x):
        counter = 0
        output_dict = dict()
        # patch_embed
        x = self.features[0](x)
        output_dict['stem'] = x
        # stages
        for idx, f in enumerate(self.features[1:]):
            x = f(x)
            if idx in self.stage_idx:
                output_dict[f'stage{counter}'] = x
                counter += 1

        x = self.fuse_stage2(output_dict['stage2']) + self.fuse_stage3(output_dict['stage3'])

        x = self.neck(x)
        # hack this place because we modified the predictor of SAM for HQ-SAM in
        # segment_anything/segment_anything/predictor.py line 91 to return intern features of the backbone
        # self.features, self.interm_features = self.model.image_encoder(input_image)
        return x, None


def rep_vit_m1(img_size=1024, **kwargs):
    return RepViT('m1', img_size, **kwargs)


def rep_vit_m2(img_size=1024, **kwargs):
    return RepViT('m2', img_size, **kwargs)


def rep_vit_m3(img_size=1024, **kwargs):
    return RepViT('m3', img_size, **kwargs)
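Usage note (illustrative, not part of the uploaded files): a hedged sketch of running the RepViT backbone above on its own. It assumes timm is installed and that the file is importable as EdgeSAM.rep_vit, matching the import used by setup_edge_sam.py below. For a 1024x1024 input, the neck is expected to produce a 256-channel, 64x64 embedding, with None returned in place of HQ-SAM interim features.

import torch
from EdgeSAM.rep_vit import rep_vit_m1  # import path assumed, mirrors setup_edge_sam.py

encoder = rep_vit_m1(img_size=1024).eval()
with torch.no_grad():
    feats, interm = encoder(torch.randn(1, 3, 1024, 1024))  # (x, None) as in RepViT.forward
print(feats.shape, interm)  # expected: torch.Size([1, 256, 64, 64]) None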
EfficientSAM/EdgeSAM/setup_edge_sam.py
ADDED
@@ -0,0 +1,90 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import torch

from functools import partial

from segment_anything.modeling import ImageEncoderViT, MaskDecoder, PromptEncoder, Sam, TwoWayTransformer
from EdgeSAM.rep_vit import RepViT


prompt_embed_dim = 256
image_size = 1024
vit_patch_size = 16
image_embedding_size = image_size // vit_patch_size


def build_edge_sam(checkpoint=None, upsample_mode="bicubic"):
    image_encoder = RepViT(
        arch="m1",
        img_size=image_size,
        upsample_mode=upsample_mode
    )
    return _build_sam(image_encoder, checkpoint)


sam_model_registry = {
    "default": build_edge_sam,
    "edge_sam": build_edge_sam,
}


def _build_sam_encoder(
    encoder_embed_dim,
    encoder_depth,
    encoder_num_heads,
    encoder_global_attn_indexes,
):
    image_encoder = ImageEncoderViT(
        depth=encoder_depth,
        embed_dim=encoder_embed_dim,
        img_size=image_size,
        mlp_ratio=4,
        norm_layer=partial(torch.nn.LayerNorm, eps=1e-6),
        num_heads=encoder_num_heads,
        patch_size=vit_patch_size,
        qkv_bias=True,
        use_rel_pos=True,
        global_attn_indexes=encoder_global_attn_indexes,
        window_size=14,
        out_chans=prompt_embed_dim,
    )
    return image_encoder


def _build_sam(
    image_encoder,
    checkpoint=None,
):
    sam = Sam(
        image_encoder=image_encoder,
        prompt_encoder=PromptEncoder(
            embed_dim=prompt_embed_dim,
            image_embedding_size=(image_embedding_size, image_embedding_size),
            input_image_size=(image_size, image_size),
            mask_in_chans=16,
        ),
        mask_decoder=MaskDecoder(
            num_multimask_outputs=3,
            transformer=TwoWayTransformer(
                depth=2,
                embedding_dim=prompt_embed_dim,
                mlp_dim=2048,
                num_heads=8,
            ),
            transformer_dim=prompt_embed_dim,
            iou_head_depth=3,
            iou_head_hidden_dim=256,
        ),
        pixel_mean=[123.675, 116.28, 103.53],
        pixel_std=[58.395, 57.12, 57.375],
    )
    sam.eval()
    if checkpoint is not None:
        with open(checkpoint, "rb") as f:
            state_dict = torch.load(f, map_location="cpu")
        sam.load_state_dict(state_dict)
    return sam
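Usage note (illustrative, not part of the uploaded files): a hedged sketch of building the EdgeSAM variant defined above and prompting it with a box. The checkpoint path and the box coordinates are placeholders, and the sketch assumes the bundled segment_anything package's SamPredictor accepts a plain xyxy box prompt as upstream SAM does; the grounded_edge_sam.py demo listed above is the authoritative reference for how this upload actually wires it up.

import cv2
import numpy as np
from EdgeSAM.setup_edge_sam import build_edge_sam
from segment_anything import SamPredictor  # bundled SAM package in this repo

sam = build_edge_sam(checkpoint="edge_sam.pth")  # checkpoint path is a placeholder assumption
predictor = SamPredictor(sam)

image = cv2.cvtColor(cv2.imread("assets/demo9.jpg"), cv2.COLOR_BGR2RGB)
predictor.set_image(image)
# Single xyxy box prompt (values are illustrative only)
masks, scores, _ = predictor.predict(box=np.array([100, 100, 400, 400]))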
EfficientSAM/FastSAM/tools.py
ADDED
@@ -0,0 +1,413 @@
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import cv2
import torch
import os
import clip


def convert_box_xywh_to_xyxy(box):
    x1 = box[0]
    y1 = box[1]
    x2 = box[0] + box[2]
    y2 = box[1] + box[3]
    return [x1, y1, x2, y2]


def segment_image(image, bbox):
    image_array = np.array(image)
    segmented_image_array = np.zeros_like(image_array)
    x1, y1, x2, y2 = bbox
    segmented_image_array[y1:y2, x1:x2] = image_array[y1:y2, x1:x2]
    segmented_image = Image.fromarray(segmented_image_array)
    black_image = Image.new("RGB", image.size, (255, 255, 255))
    # transparency_mask = np.zeros_like((), dtype=np.uint8)
    transparency_mask = np.zeros(
        (image_array.shape[0], image_array.shape[1]), dtype=np.uint8
    )
    transparency_mask[y1:y2, x1:x2] = 255
    transparency_mask_image = Image.fromarray(transparency_mask, mode="L")
    black_image.paste(segmented_image, mask=transparency_mask_image)
    return black_image


def format_results(result, filter=0):
    annotations = []
    n = len(result.masks.data)
    for i in range(n):
        annotation = {}
        mask = result.masks.data[i] == 1.0

        if torch.sum(mask) < filter:
            continue
        annotation["id"] = i
        annotation["segmentation"] = mask.cpu().numpy()
        annotation["bbox"] = result.boxes.data[i]
        annotation["score"] = result.boxes.conf[i]
        annotation["area"] = annotation["segmentation"].sum()
        annotations.append(annotation)
    return annotations


def filter_masks(annotations):  # filter out overlapping masks
    annotations.sort(key=lambda x: x["area"], reverse=True)
    to_remove = set()
    for i in range(0, len(annotations)):
        a = annotations[i]
        for j in range(i + 1, len(annotations)):
            b = annotations[j]
            if i != j and j not in to_remove:
                # check if
                if b["area"] < a["area"]:
                    if (a["segmentation"] & b["segmentation"]).sum() / b[
                        "segmentation"
                    ].sum() > 0.8:
                        to_remove.add(j)

    return [a for i, a in enumerate(annotations) if i not in to_remove], to_remove


def get_bbox_from_mask(mask):
    mask = mask.astype(np.uint8)
    contours, hierarchy = cv2.findContours(
        mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )
    x1, y1, w, h = cv2.boundingRect(contours[0])
    x2, y2 = x1 + w, y1 + h
    if len(contours) > 1:
        for b in contours:
            x_t, y_t, w_t, h_t = cv2.boundingRect(b)
            # merge multiple bboxes into one
            x1 = min(x1, x_t)
            y1 = min(y1, y_t)
            x2 = max(x2, x_t + w_t)
            y2 = max(y2, y_t + h_t)
        h = y2 - y1
        w = x2 - x1
    return [x1, y1, x2, y2]


def fast_process(
    annotations, args, mask_random_color, bbox=None, points=None, edges=False
):
    if isinstance(annotations[0], dict):
        annotations = [annotation["segmentation"] for annotation in annotations]
    result_name = os.path.basename(args.img_path)
    image = cv2.imread(args.img_path)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    original_h = image.shape[0]
    original_w = image.shape[1]
    plt.figure(figsize=(original_w / 100, original_h / 100))
    plt.imshow(image)
    if args.better_quality == True:
        if isinstance(annotations[0], torch.Tensor):
            annotations = np.array(annotations.cpu())
        for i, mask in enumerate(annotations):
            mask = cv2.morphologyEx(
                mask.astype(np.uint8), cv2.MORPH_CLOSE, np.ones((3, 3), np.uint8)
            )
            annotations[i] = cv2.morphologyEx(
                mask.astype(np.uint8), cv2.MORPH_OPEN, np.ones((8, 8), np.uint8)
            )
    if args.device == "cpu":
        annotations = np.array(annotations)
        fast_show_mask(
            annotations,
            plt.gca(),
            random_color=mask_random_color,
            bbox=bbox,
            points=points,
            pointlabel=args.point_label,
            retinamask=args.retina,
            target_height=original_h,
            target_width=original_w,
        )
    else:
        if isinstance(annotations[0], np.ndarray):
            annotations = torch.from_numpy(annotations)
        fast_show_mask_gpu(
            annotations,
            plt.gca(),
            random_color=args.randomcolor,
            bbox=bbox,
            points=points,
            pointlabel=args.point_label,
            retinamask=args.retina,
            target_height=original_h,
            target_width=original_w,
        )
    if isinstance(annotations, torch.Tensor):
        annotations = annotations.cpu().numpy()
    if args.withContours == True:
        contour_all = []
        temp = np.zeros((original_h, original_w, 1))
        for i, mask in enumerate(annotations):
            if type(mask) == dict:
                mask = mask["segmentation"]
            annotation = mask.astype(np.uint8)
            if args.retina == False:
                annotation = cv2.resize(
                    annotation,
                    (original_w, original_h),
                    interpolation=cv2.INTER_NEAREST,
                )
            contours, hierarchy = cv2.findContours(
                annotation, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE
            )
            for contour in contours:
                contour_all.append(contour)
        cv2.drawContours(temp, contour_all, -1, (255, 255, 255), 2)
        color = np.array([0 / 255, 0 / 255, 255 / 255, 0.8])
        contour_mask = temp / 255 * color.reshape(1, 1, -1)
        plt.imshow(contour_mask)

    save_path = args.output
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    plt.axis("off")
    fig = plt.gcf()
    plt.draw()
    buf = fig.canvas.tostring_rgb()
    cols, rows = fig.canvas.get_width_height()
    img_array = np.fromstring(buf, dtype=np.uint8).reshape(rows, cols, 3)
    return img_array
    # cv2.imwrite(os.path.join(save_path, result_name), cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR))


# CPU post process
def fast_show_mask(
    annotation,
    ax,
    random_color=False,
    bbox=None,
    points=None,
    pointlabel=None,
    retinamask=True,
    target_height=960,
    target_width=960,
):
    msak_sum = annotation.shape[0]
    height = annotation.shape[1]
    weight = annotation.shape[2]
    # sort the annotations by area
    areas = np.sum(annotation, axis=(1, 2))
    sorted_indices = np.argsort(areas)
    annotation = annotation[sorted_indices]

    index = (annotation != 0).argmax(axis=0)
    if random_color == True:
        color = np.random.random((msak_sum, 1, 1, 3))
    else:
        color = np.ones((msak_sum, 1, 1, 3)) * np.array(
            [30 / 255, 144 / 255, 255 / 255]
        )
    transparency = np.ones((msak_sum, 1, 1, 1)) * 0.6
    visual = np.concatenate([color, transparency], axis=-1)
    mask_image = np.expand_dims(annotation, -1) * visual

    show = np.zeros((height, weight, 4))
    h_indices, w_indices = np.meshgrid(
        np.arange(height), np.arange(weight), indexing="ij"
    )
    indices = (index[h_indices, w_indices], h_indices, w_indices, slice(None))
    # update the values of show with vectorized indexing
    show[h_indices, w_indices, :] = mask_image[indices]
    if bbox is not None:
        x1, y1, x2, y2 = bbox
        ax.add_patch(
            plt.Rectangle(
                (x1, y1), x2 - x1, y2 - y1, fill=False, edgecolor="b", linewidth=1
            )
        )
    # draw point
    if points is not None:
        plt.scatter(
            [point[0] for i, point in enumerate(points) if pointlabel[i] == 1],
            [point[1] for i, point in enumerate(points) if pointlabel[i] == 1],
            s=20,
            c="y",
        )
        plt.scatter(
            [point[0] for i, point in enumerate(points) if pointlabel[i] == 0],
            [point[1] for i, point in enumerate(points) if pointlabel[i] == 0],
            s=20,
            c="m",
        )

    if retinamask == False:
        show = cv2.resize(
            show, (target_width, target_height), interpolation=cv2.INTER_NEAREST
        )
    ax.imshow(show)


def fast_show_mask_gpu(
    annotation,
    ax,
    random_color=False,
    bbox=None,
    points=None,
    pointlabel=None,
    retinamask=True,
    target_height=960,
    target_width=960,
):
    msak_sum = annotation.shape[0]
    height = annotation.shape[1]
    weight = annotation.shape[2]
    areas = torch.sum(annotation, dim=(1, 2))
    sorted_indices = torch.argsort(areas, descending=False)
    annotation = annotation[sorted_indices]
    # find the index of the first non-zero mask at each position
    index = (annotation != 0).to(torch.long).argmax(dim=0)
    if random_color == True:
        color = torch.rand((msak_sum, 1, 1, 3)).to(annotation.device)
    else:
        color = torch.ones((msak_sum, 1, 1, 3)).to(annotation.device) * torch.tensor(
            [30 / 255, 144 / 255, 255 / 255]
        ).to(annotation.device)
    transparency = torch.ones((msak_sum, 1, 1, 1)).to(annotation.device) * 0.6
    visual = torch.cat([color, transparency], dim=-1)
    mask_image = torch.unsqueeze(annotation, -1) * visual
    # gather by index: index picks which mask in the batch to show at each pixel, flattening mask_image into one image
    show = torch.zeros((height, weight, 4)).to(annotation.device)
    h_indices, w_indices = torch.meshgrid(
        torch.arange(height), torch.arange(weight), indexing="ij"
    )
    indices = (index[h_indices, w_indices], h_indices, w_indices, slice(None))
    # update the values of show with vectorized indexing
    show[h_indices, w_indices, :] = mask_image[indices]
    show_cpu = show.cpu().numpy()
    if bbox is not None:
        x1, y1, x2, y2 = bbox
        ax.add_patch(
            plt.Rectangle(
                (x1, y1), x2 - x1, y2 - y1, fill=False, edgecolor="b", linewidth=1
            )
        )
    # draw point
    if points is not None:
        plt.scatter(
            [point[0] for i, point in enumerate(points) if pointlabel[i] == 1],
            [point[1] for i, point in enumerate(points) if pointlabel[i] == 1],
            s=20,
            c="y",
        )
        plt.scatter(
            [point[0] for i, point in enumerate(points) if pointlabel[i] == 0],
            [point[1] for i, point in enumerate(points) if pointlabel[i] == 0],
            s=20,
            c="m",
        )
    if retinamask == False:
        show_cpu = cv2.resize(
            show_cpu, (target_width, target_height), interpolation=cv2.INTER_NEAREST
        )
    ax.imshow(show_cpu)


# clip
@torch.no_grad()
def retriev(
    model, preprocess, elements, search_text: str, device
) -> int:
    preprocessed_images = [preprocess(image).to(device) for image in elements]
    tokenized_text = clip.tokenize([search_text]).to(device)
    stacked_images = torch.stack(preprocessed_images)
    image_features = model.encode_image(stacked_images)
    text_features = model.encode_text(tokenized_text)
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)
    probs = 100.0 * image_features @ text_features.T
    return probs[:, 0].softmax(dim=0)


def crop_image(annotations, image_path):
    image = Image.open(image_path)
    ori_w, ori_h = image.size
    mask_h, mask_w = annotations[0]["segmentation"].shape
    if ori_w != mask_w or ori_h != mask_h:
        image = image.resize((mask_w, mask_h))
    cropped_boxes = []
    cropped_images = []
    not_crop = []
    filter_id = []
    # annotations, _ = filter_masks(annotations)
    # filter_id = list(_)
    for _, mask in enumerate(annotations):
        if np.sum(mask["segmentation"]) <= 100:
            filter_id.append(_)
            continue
        bbox = get_bbox_from_mask(mask["segmentation"])  # the bbox of the mask
        cropped_boxes.append(segment_image(image, bbox))  # save the cropped image
        # cropped_boxes.append(segment_image(image,mask["segmentation"]))
        cropped_images.append(bbox)  # save the bbox of the cropped image

    return cropped_boxes, cropped_images, not_crop, filter_id, annotations


def box_prompt(masks, bbox, target_height, target_width):
    h = masks.shape[1]
    w = masks.shape[2]
    if h != target_height or w != target_width:
        bbox = [
            int(bbox[0] * w / target_width),
            int(bbox[1] * h / target_height),
            int(bbox[2] * w / target_width),
            int(bbox[3] * h / target_height),
        ]
    bbox[0] = round(bbox[0]) if round(bbox[0]) > 0 else 0
    bbox[1] = round(bbox[1]) if round(bbox[1]) > 0 else 0
    bbox[2] = round(bbox[2]) if round(bbox[2]) < w else w
    bbox[3] = round(bbox[3]) if round(bbox[3]) < h else h

    # IoUs = torch.zeros(len(masks), dtype=torch.float32)
    bbox_area = (bbox[3] - bbox[1]) * (bbox[2] - bbox[0])

    masks_area = torch.sum(masks[:, bbox[1] : bbox[3], bbox[0] : bbox[2]], dim=(1, 2))
    orig_masks_area = torch.sum(masks, dim=(1, 2))

    union = bbox_area + orig_masks_area - masks_area
    IoUs = masks_area / union
    max_iou_index = torch.argmax(IoUs)

    return masks[max_iou_index].cpu().numpy(), max_iou_index


def point_prompt(masks, points, pointlabel, target_height, target_width):  # numpy processing
    h = masks[0]["segmentation"].shape[0]
    w = masks[0]["segmentation"].shape[1]
    if h != target_height or w != target_width:
        points = [
            [int(point[0] * w / target_width), int(point[1] * h / target_height)]
            for point in points
        ]
    onemask = np.zeros((h, w))
    for i, annotation in enumerate(masks):
        if type(annotation) == dict:
            mask = annotation["segmentation"]
        else:
            mask = annotation
        for i, point in enumerate(points):
            if mask[point[1], point[0]] == 1 and pointlabel[i] == 1:
                onemask += mask
            if mask[point[1], point[0]] == 1 and pointlabel[i] == 0:
                onemask -= mask
    onemask = onemask >= 1
    return onemask, 0


def text_prompt(annotations, args):
    cropped_boxes, cropped_images, not_crop, filter_id, annotaions = crop_image(
        annotations, args.img_path
    )
    clip_model, preprocess = clip.load("ViT-B/32", device=args.device)
    scores = retriev(
        clip_model, preprocess, cropped_boxes, args.text_prompt, device=args.device
    )
    max_idx = scores.argsort()
    max_idx = max_idx[-1]
    max_idx += sum(np.array(filter_id) <= int(max_idx))
    return annotaions[max_idx]["segmentation"], max_idx
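Usage note (illustrative, not part of the uploaded files): a hedged sketch of the box_prompt helper above. It expects a stack of candidate masks of shape (N, H, W) and an xyxy box in the target resolution, and returns the mask with the highest IoU against the box; the tensor below is synthetic, and the FastSAM.tools import path is an assumption (the file also imports clip at module level, so the clip package must be installed).

import torch
from FastSAM.tools import convert_box_xywh_to_xyxy, box_prompt  # import path assumed

masks = torch.rand(8, 256, 256) > 0.5             # synthetic (N, H, W) candidate masks
box = convert_box_xywh_to_xyxy([64, 64, 96, 96])  # xywh -> [64, 64, 160, 160]
best_mask, idx = box_prompt(masks, box, target_height=256, target_width=256)
print(best_mask.shape, int(idx))                  # (256, 256) and the winning mask index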
EfficientSAM/LightHQSAM/example_light_hqsam.png
ADDED
Git LFS Details
EfficientSAM/LightHQSAM/grounded_light_hqsam_annotated_image.jpg
ADDED
EfficientSAM/LightHQSAM/setup_light_hqsam.py
ADDED
@@ -0,0 +1,45 @@
from LightHQSAM.tiny_vit_sam import TinyViT
from segment_anything.modeling import MaskDecoderHQ, PromptEncoder, Sam, TwoWayTransformer

def setup_model():
    prompt_embed_dim = 256
    image_size = 1024
    vit_patch_size = 16
    image_embedding_size = image_size // vit_patch_size
    mobile_sam = Sam(
        image_encoder=TinyViT(img_size=1024, in_chans=3, num_classes=1000,
                              embed_dims=[64, 128, 160, 320],
                              depths=[2, 2, 6, 2],
                              num_heads=[2, 4, 5, 10],
                              window_sizes=[7, 7, 14, 7],
                              mlp_ratio=4.,
                              drop_rate=0.,
                              drop_path_rate=0.0,
                              use_checkpoint=False,
                              mbconv_expand_ratio=4.0,
                              local_conv_size=3,
                              layer_lr_decay=0.8
                              ),
        prompt_encoder=PromptEncoder(
            embed_dim=prompt_embed_dim,
            image_embedding_size=(image_embedding_size, image_embedding_size),
            input_image_size=(image_size, image_size),
            mask_in_chans=16,
        ),
        mask_decoder=MaskDecoderHQ(
            num_multimask_outputs=3,
            transformer=TwoWayTransformer(
                depth=2,
                embedding_dim=prompt_embed_dim,
                mlp_dim=2048,
                num_heads=8,
            ),
            transformer_dim=prompt_embed_dim,
            iou_head_depth=3,
            iou_head_hidden_dim=256,
            vit_dim=160,
        ),
        pixel_mean=[123.675, 116.28, 103.53],
        pixel_std=[58.395, 57.12, 57.375],
    )
    return mobile_sam
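Usage note (illustrative, not part of the uploaded files): a hedged sketch of wiring the Light HQ-SAM model above into a predictor. The checkpoint filename is an assumption, the import path assumes the EfficientSAM directory is on the path (matching the file's own LightHQSAM import), and grounded_light_hqsam.py in this upload is the authoritative reference for the intended setup.

import torch
from LightHQSAM.setup_light_hqsam import setup_model  # import path assumed
from segment_anything import SamPredictor             # bundled SAM package in this repo

light_hqsam = setup_model()
checkpoint = torch.load("sam_hq_vit_tiny.pth", map_location="cpu")  # checkpoint name is an assumption
light_hqsam.load_state_dict(checkpoint, strict=True)

device = "cuda" if torch.cuda.is_available() else "cpu"
predictor = SamPredictor(light_hqsam.to(device))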
EfficientSAM/LightHQSAM/tiny_vit_sam.py
ADDED
@@ -0,0 +1,724 @@
# --------------------------------------------------------
# TinyViT Model Architecture
# Copyright (c) 2022 Microsoft
# Adapted from LeViT and Swin Transformer
#   LeViT: (https://github.com/facebookresearch/levit)
#   Swin: (https://github.com/microsoft/swin-transformer)
# Build the TinyViT Model
# --------------------------------------------------------

import itertools
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
from timm.models.layers import DropPath as TimmDropPath,\
    to_2tuple, trunc_normal_
from timm.models.registry import register_model
from typing import Tuple


class Conv2d_BN(torch.nn.Sequential):
    def __init__(self, a, b, ks=1, stride=1, pad=0, dilation=1,
                 groups=1, bn_weight_init=1):
        super().__init__()
        self.add_module('c', torch.nn.Conv2d(
            a, b, ks, stride, pad, dilation, groups, bias=False))
        bn = torch.nn.BatchNorm2d(b)
        torch.nn.init.constant_(bn.weight, bn_weight_init)
        torch.nn.init.constant_(bn.bias, 0)
        self.add_module('bn', bn)

    @torch.no_grad()
    def fuse(self):
        c, bn = self._modules.values()
        w = bn.weight / (bn.running_var + bn.eps)**0.5
        w = c.weight * w[:, None, None, None]
        b = bn.bias - bn.running_mean * bn.weight / \
            (bn.running_var + bn.eps)**0.5
        m = torch.nn.Conv2d(w.size(1) * self.c.groups, w.size(
            0), w.shape[2:], stride=self.c.stride, padding=self.c.padding, dilation=self.c.dilation, groups=self.c.groups)
        m.weight.data.copy_(w)
        m.bias.data.copy_(b)
        return m


class DropPath(TimmDropPath):
    def __init__(self, drop_prob=None):
        super().__init__(drop_prob=drop_prob)
        self.drop_prob = drop_prob

    def __repr__(self):
        msg = super().__repr__()
        msg += f'(drop_prob={self.drop_prob})'
        return msg


class PatchEmbed(nn.Module):
    def __init__(self, in_chans, embed_dim, resolution, activation):
        super().__init__()
        img_size: Tuple[int, int] = to_2tuple(resolution)
        self.patches_resolution = (img_size[0] // 4, img_size[1] // 4)
        self.num_patches = self.patches_resolution[0] * \
            self.patches_resolution[1]
        self.in_chans = in_chans
        self.embed_dim = embed_dim
        n = embed_dim
        self.seq = nn.Sequential(
            Conv2d_BN(in_chans, n // 2, 3, 2, 1),
            activation(),
            Conv2d_BN(n // 2, n, 3, 2, 1),
        )

    def forward(self, x):
        return self.seq(x)


class MBConv(nn.Module):
    def __init__(self, in_chans, out_chans, expand_ratio,
                 activation, drop_path):
        super().__init__()
        self.in_chans = in_chans
        self.hidden_chans = int(in_chans * expand_ratio)
        self.out_chans = out_chans

        self.conv1 = Conv2d_BN(in_chans, self.hidden_chans, ks=1)
        self.act1 = activation()

        self.conv2 = Conv2d_BN(self.hidden_chans, self.hidden_chans,
                               ks=3, stride=1, pad=1, groups=self.hidden_chans)
        self.act2 = activation()

        self.conv3 = Conv2d_BN(
            self.hidden_chans, out_chans, ks=1, bn_weight_init=0.0)
        self.act3 = activation()

        self.drop_path = DropPath(
            drop_path) if drop_path > 0. else nn.Identity()

    def forward(self, x):
        shortcut = x

        x = self.conv1(x)
        x = self.act1(x)

        x = self.conv2(x)
        x = self.act2(x)

        x = self.conv3(x)

        x = self.drop_path(x)

        x += shortcut
        x = self.act3(x)

        return x


class PatchMerging(nn.Module):
    def __init__(self, input_resolution, dim, out_dim, activation):
        super().__init__()

        self.input_resolution = input_resolution
|
123 |
+
self.dim = dim
|
124 |
+
self.out_dim = out_dim
|
125 |
+
self.act = activation()
|
126 |
+
self.conv1 = Conv2d_BN(dim, out_dim, 1, 1, 0)
|
127 |
+
stride_c=2
|
128 |
+
if(out_dim==320 or out_dim==448 or out_dim==576):
|
129 |
+
stride_c=1
|
130 |
+
self.conv2 = Conv2d_BN(out_dim, out_dim, 3, stride_c, 1, groups=out_dim)
|
131 |
+
self.conv3 = Conv2d_BN(out_dim, out_dim, 1, 1, 0)
|
132 |
+
|
133 |
+
def forward(self, x):
|
134 |
+
if x.ndim == 3:
|
135 |
+
H, W = self.input_resolution
|
136 |
+
B = len(x)
|
137 |
+
# (B, C, H, W)
|
138 |
+
x = x.view(B, H, W, -1).permute(0, 3, 1, 2)
|
139 |
+
|
140 |
+
x = self.conv1(x)
|
141 |
+
x = self.act(x)
|
142 |
+
|
143 |
+
x = self.conv2(x)
|
144 |
+
x = self.act(x)
|
145 |
+
x = self.conv3(x)
|
146 |
+
x = x.flatten(2).transpose(1, 2)
|
147 |
+
return x
|
148 |
+
|
149 |
+
|
150 |
+
class ConvLayer(nn.Module):
|
151 |
+
def __init__(self, dim, input_resolution, depth,
|
152 |
+
activation,
|
153 |
+
drop_path=0., downsample=None, use_checkpoint=False,
|
154 |
+
out_dim=None,
|
155 |
+
conv_expand_ratio=4.,
|
156 |
+
):
|
157 |
+
|
158 |
+
super().__init__()
|
159 |
+
self.dim = dim
|
160 |
+
self.input_resolution = input_resolution
|
161 |
+
self.depth = depth
|
162 |
+
self.use_checkpoint = use_checkpoint
|
163 |
+
|
164 |
+
# build blocks
|
165 |
+
self.blocks = nn.ModuleList([
|
166 |
+
MBConv(dim, dim, conv_expand_ratio, activation,
|
167 |
+
drop_path[i] if isinstance(drop_path, list) else drop_path,
|
168 |
+
)
|
169 |
+
for i in range(depth)])
|
170 |
+
|
171 |
+
# patch merging layer
|
172 |
+
if downsample is not None:
|
173 |
+
self.downsample = downsample(
|
174 |
+
input_resolution, dim=dim, out_dim=out_dim, activation=activation)
|
175 |
+
else:
|
176 |
+
self.downsample = None
|
177 |
+
|
178 |
+
def forward(self, x):
|
179 |
+
for blk in self.blocks:
|
180 |
+
if self.use_checkpoint:
|
181 |
+
x = checkpoint.checkpoint(blk, x)
|
182 |
+
else:
|
183 |
+
x = blk(x)
|
184 |
+
if self.downsample is not None:
|
185 |
+
x = self.downsample(x)
|
186 |
+
return x
|
187 |
+
|
188 |
+
|
189 |
+
class Mlp(nn.Module):
|
190 |
+
def __init__(self, in_features, hidden_features=None,
|
191 |
+
out_features=None, act_layer=nn.GELU, drop=0.):
|
192 |
+
super().__init__()
|
193 |
+
out_features = out_features or in_features
|
194 |
+
hidden_features = hidden_features or in_features
|
195 |
+
self.norm = nn.LayerNorm(in_features)
|
196 |
+
self.fc1 = nn.Linear(in_features, hidden_features)
|
197 |
+
self.fc2 = nn.Linear(hidden_features, out_features)
|
198 |
+
self.act = act_layer()
|
199 |
+
self.drop = nn.Dropout(drop)
|
200 |
+
|
201 |
+
def forward(self, x):
|
202 |
+
x = self.norm(x)
|
203 |
+
|
204 |
+
x = self.fc1(x)
|
205 |
+
x = self.act(x)
|
206 |
+
x = self.drop(x)
|
207 |
+
x = self.fc2(x)
|
208 |
+
x = self.drop(x)
|
209 |
+
return x
|
210 |
+
|
211 |
+
|
212 |
+
class Attention(torch.nn.Module):
|
213 |
+
def __init__(self, dim, key_dim, num_heads=8,
|
214 |
+
attn_ratio=4,
|
215 |
+
resolution=(14, 14),
|
216 |
+
):
|
217 |
+
super().__init__()
|
218 |
+
# (h, w)
|
219 |
+
assert isinstance(resolution, tuple) and len(resolution) == 2
|
220 |
+
self.num_heads = num_heads
|
221 |
+
self.scale = key_dim ** -0.5
|
222 |
+
self.key_dim = key_dim
|
223 |
+
self.nh_kd = nh_kd = key_dim * num_heads
|
224 |
+
self.d = int(attn_ratio * key_dim)
|
225 |
+
self.dh = int(attn_ratio * key_dim) * num_heads
|
226 |
+
self.attn_ratio = attn_ratio
|
227 |
+
h = self.dh + nh_kd * 2
|
228 |
+
|
229 |
+
self.norm = nn.LayerNorm(dim)
|
230 |
+
self.qkv = nn.Linear(dim, h)
|
231 |
+
self.proj = nn.Linear(self.dh, dim)
|
232 |
+
|
233 |
+
points = list(itertools.product(
|
234 |
+
range(resolution[0]), range(resolution[1])))
|
235 |
+
N = len(points)
|
236 |
+
attention_offsets = {}
|
237 |
+
idxs = []
|
238 |
+
for p1 in points:
|
239 |
+
for p2 in points:
|
240 |
+
offset = (abs(p1[0] - p2[0]), abs(p1[1] - p2[1]))
|
241 |
+
if offset not in attention_offsets:
|
242 |
+
attention_offsets[offset] = len(attention_offsets)
|
243 |
+
idxs.append(attention_offsets[offset])
|
244 |
+
self.attention_biases = torch.nn.Parameter(
|
245 |
+
torch.zeros(num_heads, len(attention_offsets)))
|
246 |
+
self.register_buffer('attention_bias_idxs',
|
247 |
+
torch.LongTensor(idxs).view(N, N),
|
248 |
+
persistent=False)
|
249 |
+
|
250 |
+
@torch.no_grad()
|
251 |
+
def train(self, mode=True):
|
252 |
+
super().train(mode)
|
253 |
+
if mode and hasattr(self, 'ab'):
|
254 |
+
del self.ab
|
255 |
+
else:
|
256 |
+
self.register_buffer('ab',
|
257 |
+
self.attention_biases[:, self.attention_bias_idxs],
|
258 |
+
persistent=False)
|
259 |
+
|
260 |
+
def forward(self, x): # x (B,N,C)
|
261 |
+
B, N, _ = x.shape
|
262 |
+
|
263 |
+
# Normalization
|
264 |
+
x = self.norm(x)
|
265 |
+
|
266 |
+
qkv = self.qkv(x)
|
267 |
+
# (B, N, num_heads, d)
|
268 |
+
q, k, v = qkv.view(B, N, self.num_heads, -
|
269 |
+
1).split([self.key_dim, self.key_dim, self.d], dim=3)
|
270 |
+
# (B, num_heads, N, d)
|
271 |
+
q = q.permute(0, 2, 1, 3)
|
272 |
+
k = k.permute(0, 2, 1, 3)
|
273 |
+
v = v.permute(0, 2, 1, 3)
|
274 |
+
|
275 |
+
attn = (
|
276 |
+
(q @ k.transpose(-2, -1)) * self.scale
|
277 |
+
+
|
278 |
+
(self.attention_biases[:, self.attention_bias_idxs]
|
279 |
+
if self.training else self.ab)
|
280 |
+
)
|
281 |
+
attn = attn.softmax(dim=-1)
|
282 |
+
x = (attn @ v).transpose(1, 2).reshape(B, N, self.dh)
|
283 |
+
x = self.proj(x)
|
284 |
+
return x
|
285 |
+
|
286 |
+
|
287 |
+
class TinyViTBlock(nn.Module):
|
288 |
+
r""" TinyViT Block.
|
289 |
+
|
290 |
+
Args:
|
291 |
+
dim (int): Number of input channels.
|
292 |
+
input_resolution (tuple[int, int]): Input resolution.
|
293 |
+
num_heads (int): Number of attention heads.
|
294 |
+
window_size (int): Window size.
|
295 |
+
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
|
296 |
+
drop (float, optional): Dropout rate. Default: 0.0
|
297 |
+
drop_path (float, optional): Stochastic depth rate. Default: 0.0
|
298 |
+
local_conv_size (int): the kernel size of the convolution between
|
299 |
+
Attention and MLP. Default: 3
|
300 |
+
activation: the activation function. Default: nn.GELU
|
301 |
+
"""
|
302 |
+
|
303 |
+
def __init__(self, dim, input_resolution, num_heads, window_size=7,
|
304 |
+
mlp_ratio=4., drop=0., drop_path=0.,
|
305 |
+
local_conv_size=3,
|
306 |
+
activation=nn.GELU,
|
307 |
+
):
|
308 |
+
super().__init__()
|
309 |
+
self.dim = dim
|
310 |
+
self.input_resolution = input_resolution
|
311 |
+
self.num_heads = num_heads
|
312 |
+
assert window_size > 0, 'window_size must be greater than 0'
|
313 |
+
self.window_size = window_size
|
314 |
+
self.mlp_ratio = mlp_ratio
|
315 |
+
|
316 |
+
self.drop_path = DropPath(
|
317 |
+
drop_path) if drop_path > 0. else nn.Identity()
|
318 |
+
|
319 |
+
assert dim % num_heads == 0, 'dim must be divisible by num_heads'
|
320 |
+
head_dim = dim // num_heads
|
321 |
+
|
322 |
+
window_resolution = (window_size, window_size)
|
323 |
+
self.attn = Attention(dim, head_dim, num_heads,
|
324 |
+
attn_ratio=1, resolution=window_resolution)
|
325 |
+
|
326 |
+
mlp_hidden_dim = int(dim * mlp_ratio)
|
327 |
+
mlp_activation = activation
|
328 |
+
self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim,
|
329 |
+
act_layer=mlp_activation, drop=drop)
|
330 |
+
|
331 |
+
pad = local_conv_size // 2
|
332 |
+
self.local_conv = Conv2d_BN(
|
333 |
+
dim, dim, ks=local_conv_size, stride=1, pad=pad, groups=dim)
|
334 |
+
|
335 |
+
def forward(self, x):
|
336 |
+
H, W = self.input_resolution
|
337 |
+
B, L, C = x.shape
|
338 |
+
assert L == H * W, "input feature has wrong size"
|
339 |
+
res_x = x
|
340 |
+
if H == self.window_size and W == self.window_size:
|
341 |
+
x = self.attn(x)
|
342 |
+
else:
|
343 |
+
x = x.view(B, H, W, C)
|
344 |
+
pad_b = (self.window_size - H %
|
345 |
+
self.window_size) % self.window_size
|
346 |
+
pad_r = (self.window_size - W %
|
347 |
+
self.window_size) % self.window_size
|
348 |
+
padding = pad_b > 0 or pad_r > 0
|
349 |
+
|
350 |
+
if padding:
|
351 |
+
x = F.pad(x, (0, 0, 0, pad_r, 0, pad_b))
|
352 |
+
|
353 |
+
pH, pW = H + pad_b, W + pad_r
|
354 |
+
nH = pH // self.window_size
|
355 |
+
nW = pW // self.window_size
|
356 |
+
# window partition
|
357 |
+
x = x.view(B, nH, self.window_size, nW, self.window_size, C).transpose(2, 3).reshape(
|
358 |
+
B * nH * nW, self.window_size * self.window_size, C)
|
359 |
+
x = self.attn(x)
|
360 |
+
# window reverse
|
361 |
+
x = x.view(B, nH, nW, self.window_size, self.window_size,
|
362 |
+
C).transpose(2, 3).reshape(B, pH, pW, C)
|
363 |
+
|
364 |
+
if padding:
|
365 |
+
x = x[:, :H, :W].contiguous()
|
366 |
+
|
367 |
+
x = x.view(B, L, C)
|
368 |
+
|
369 |
+
x = res_x + self.drop_path(x)
|
370 |
+
|
371 |
+
x = x.transpose(1, 2).reshape(B, C, H, W)
|
372 |
+
x = self.local_conv(x)
|
373 |
+
x = x.view(B, C, L).transpose(1, 2)
|
374 |
+
|
375 |
+
x = x + self.drop_path(self.mlp(x))
|
376 |
+
return x
|
377 |
+
|
378 |
+
def extra_repr(self) -> str:
|
379 |
+
return f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " \
|
380 |
+
f"window_size={self.window_size}, mlp_ratio={self.mlp_ratio}"
|
381 |
+
|
382 |
+
|
383 |
+
class BasicLayer(nn.Module):
|
384 |
+
""" A basic TinyViT layer for one stage.
|
385 |
+
|
386 |
+
Args:
|
387 |
+
dim (int): Number of input channels.
|
388 |
+
input_resolution (tuple[int]): Input resolution.
|
389 |
+
depth (int): Number of blocks.
|
390 |
+
num_heads (int): Number of attention heads.
|
391 |
+
window_size (int): Local window size.
|
392 |
+
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
|
393 |
+
drop (float, optional): Dropout rate. Default: 0.0
|
394 |
+
drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
|
395 |
+
downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
|
396 |
+
use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
|
397 |
+
local_conv_size: the kernel size of the depthwise convolution between attention and MLP. Default: 3
|
398 |
+
activation: the activation function. Default: nn.GELU
|
399 |
+
out_dim: the output dimension of the layer. Default: dim
|
400 |
+
"""
|
401 |
+
|
402 |
+
def __init__(self, dim, input_resolution, depth, num_heads, window_size,
|
403 |
+
mlp_ratio=4., drop=0.,
|
404 |
+
drop_path=0., downsample=None, use_checkpoint=False,
|
405 |
+
local_conv_size=3,
|
406 |
+
activation=nn.GELU,
|
407 |
+
out_dim=None,
|
408 |
+
):
|
409 |
+
|
410 |
+
super().__init__()
|
411 |
+
self.dim = dim
|
412 |
+
self.input_resolution = input_resolution
|
413 |
+
self.depth = depth
|
414 |
+
self.use_checkpoint = use_checkpoint
|
415 |
+
|
416 |
+
# build blocks
|
417 |
+
self.blocks = nn.ModuleList([
|
418 |
+
TinyViTBlock(dim=dim, input_resolution=input_resolution,
|
419 |
+
num_heads=num_heads, window_size=window_size,
|
420 |
+
mlp_ratio=mlp_ratio,
|
421 |
+
drop=drop,
|
422 |
+
drop_path=drop_path[i] if isinstance(
|
423 |
+
drop_path, list) else drop_path,
|
424 |
+
local_conv_size=local_conv_size,
|
425 |
+
activation=activation,
|
426 |
+
)
|
427 |
+
for i in range(depth)])
|
428 |
+
|
429 |
+
# patch merging layer
|
430 |
+
if downsample is not None:
|
431 |
+
self.downsample = downsample(
|
432 |
+
input_resolution, dim=dim, out_dim=out_dim, activation=activation)
|
433 |
+
else:
|
434 |
+
self.downsample = None
|
435 |
+
|
436 |
+
def forward(self, x):
|
437 |
+
for blk in self.blocks:
|
438 |
+
if self.use_checkpoint:
|
439 |
+
x = checkpoint.checkpoint(blk, x)
|
440 |
+
else:
|
441 |
+
x = blk(x)
|
442 |
+
if self.downsample is not None:
|
443 |
+
x = self.downsample(x)
|
444 |
+
return x
|
445 |
+
|
446 |
+
def extra_repr(self) -> str:
|
447 |
+
return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}"
|
448 |
+
|
449 |
+
class LayerNorm2d(nn.Module):
|
450 |
+
def __init__(self, num_channels: int, eps: float = 1e-6) -> None:
|
451 |
+
super().__init__()
|
452 |
+
self.weight = nn.Parameter(torch.ones(num_channels))
|
453 |
+
self.bias = nn.Parameter(torch.zeros(num_channels))
|
454 |
+
self.eps = eps
|
455 |
+
|
456 |
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
457 |
+
u = x.mean(1, keepdim=True)
|
458 |
+
s = (x - u).pow(2).mean(1, keepdim=True)
|
459 |
+
x = (x - u) / torch.sqrt(s + self.eps)
|
460 |
+
x = self.weight[:, None, None] * x + self.bias[:, None, None]
|
461 |
+
return x
|
462 |
+
class TinyViT(nn.Module):
|
463 |
+
def __init__(self, img_size=224, in_chans=3, num_classes=1000,
|
464 |
+
embed_dims=[96, 192, 384, 768], depths=[2, 2, 6, 2],
|
465 |
+
num_heads=[3, 6, 12, 24],
|
466 |
+
window_sizes=[7, 7, 14, 7],
|
467 |
+
mlp_ratio=4.,
|
468 |
+
drop_rate=0.,
|
469 |
+
drop_path_rate=0.1,
|
470 |
+
use_checkpoint=False,
|
471 |
+
mbconv_expand_ratio=4.0,
|
472 |
+
local_conv_size=3,
|
473 |
+
layer_lr_decay=1.0,
|
474 |
+
):
|
475 |
+
super().__init__()
|
476 |
+
self.img_size=img_size
|
477 |
+
self.num_classes = num_classes
|
478 |
+
self.depths = depths
|
479 |
+
self.num_layers = len(depths)
|
480 |
+
self.mlp_ratio = mlp_ratio
|
481 |
+
|
482 |
+
activation = nn.GELU
|
483 |
+
|
484 |
+
self.patch_embed = PatchEmbed(in_chans=in_chans,
|
485 |
+
embed_dim=embed_dims[0],
|
486 |
+
resolution=img_size,
|
487 |
+
activation=activation)
|
488 |
+
|
489 |
+
patches_resolution = self.patch_embed.patches_resolution
|
490 |
+
self.patches_resolution = patches_resolution
|
491 |
+
|
492 |
+
# stochastic depth
|
493 |
+
dpr = [x.item() for x in torch.linspace(0, drop_path_rate,
|
494 |
+
sum(depths))] # stochastic depth decay rule
|
495 |
+
|
496 |
+
# build layers
|
497 |
+
self.layers = nn.ModuleList()
|
498 |
+
for i_layer in range(self.num_layers):
|
499 |
+
kwargs = dict(dim=embed_dims[i_layer],
|
500 |
+
input_resolution=(patches_resolution[0] // (2 ** (i_layer-1 if i_layer == 3 else i_layer)),
|
501 |
+
patches_resolution[1] // (2 ** (i_layer-1 if i_layer == 3 else i_layer))),
|
502 |
+
# input_resolution=(patches_resolution[0] // (2 ** i_layer),
|
503 |
+
# patches_resolution[1] // (2 ** i_layer)),
|
504 |
+
depth=depths[i_layer],
|
505 |
+
drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
|
506 |
+
downsample=PatchMerging if (
|
507 |
+
i_layer < self.num_layers - 1) else None,
|
508 |
+
use_checkpoint=use_checkpoint,
|
509 |
+
out_dim=embed_dims[min(
|
510 |
+
i_layer + 1, len(embed_dims) - 1)],
|
511 |
+
activation=activation,
|
512 |
+
)
|
513 |
+
if i_layer == 0:
|
514 |
+
layer = ConvLayer(
|
515 |
+
conv_expand_ratio=mbconv_expand_ratio,
|
516 |
+
**kwargs,
|
517 |
+
)
|
518 |
+
else:
|
519 |
+
layer = BasicLayer(
|
520 |
+
num_heads=num_heads[i_layer],
|
521 |
+
window_size=window_sizes[i_layer],
|
522 |
+
mlp_ratio=self.mlp_ratio,
|
523 |
+
drop=drop_rate,
|
524 |
+
local_conv_size=local_conv_size,
|
525 |
+
**kwargs)
|
526 |
+
self.layers.append(layer)
|
527 |
+
|
528 |
+
# Classifier head
|
529 |
+
self.norm_head = nn.LayerNorm(embed_dims[-1])
|
530 |
+
self.head = nn.Linear(
|
531 |
+
embed_dims[-1], num_classes) if num_classes > 0 else torch.nn.Identity()
|
532 |
+
|
533 |
+
# init weights
|
534 |
+
self.apply(self._init_weights)
|
535 |
+
self.set_layer_lr_decay(layer_lr_decay)
|
536 |
+
self.neck = nn.Sequential(
|
537 |
+
nn.Conv2d(
|
538 |
+
embed_dims[-1],
|
539 |
+
256,
|
540 |
+
kernel_size=1,
|
541 |
+
bias=False,
|
542 |
+
),
|
543 |
+
LayerNorm2d(256),
|
544 |
+
nn.Conv2d(
|
545 |
+
256,
|
546 |
+
256,
|
547 |
+
kernel_size=3,
|
548 |
+
padding=1,
|
549 |
+
bias=False,
|
550 |
+
),
|
551 |
+
LayerNorm2d(256),
|
552 |
+
)
|
553 |
+
def set_layer_lr_decay(self, layer_lr_decay):
|
554 |
+
decay_rate = layer_lr_decay
|
555 |
+
|
556 |
+
# layers -> blocks (depth)
|
557 |
+
depth = sum(self.depths)
|
558 |
+
lr_scales = [decay_rate ** (depth - i - 1) for i in range(depth)]
|
559 |
+
#print("LR SCALES:", lr_scales)
|
560 |
+
|
561 |
+
def _set_lr_scale(m, scale):
|
562 |
+
for p in m.parameters():
|
563 |
+
p.lr_scale = scale
|
564 |
+
|
565 |
+
self.patch_embed.apply(lambda x: _set_lr_scale(x, lr_scales[0]))
|
566 |
+
i = 0
|
567 |
+
for layer in self.layers:
|
568 |
+
for block in layer.blocks:
|
569 |
+
block.apply(lambda x: _set_lr_scale(x, lr_scales[i]))
|
570 |
+
i += 1
|
571 |
+
if layer.downsample is not None:
|
572 |
+
layer.downsample.apply(
|
573 |
+
lambda x: _set_lr_scale(x, lr_scales[i - 1]))
|
574 |
+
assert i == depth
|
575 |
+
for m in [self.norm_head, self.head]:
|
576 |
+
m.apply(lambda x: _set_lr_scale(x, lr_scales[-1]))
|
577 |
+
|
578 |
+
for k, p in self.named_parameters():
|
579 |
+
p.param_name = k
|
580 |
+
|
581 |
+
def _check_lr_scale(m):
|
582 |
+
for p in m.parameters():
|
583 |
+
assert hasattr(p, 'lr_scale'), p.param_name
|
584 |
+
|
585 |
+
self.apply(_check_lr_scale)
|
586 |
+
|
587 |
+
def _init_weights(self, m):
|
588 |
+
if isinstance(m, nn.Linear):
|
589 |
+
trunc_normal_(m.weight, std=.02)
|
590 |
+
if isinstance(m, nn.Linear) and m.bias is not None:
|
591 |
+
nn.init.constant_(m.bias, 0)
|
592 |
+
elif isinstance(m, nn.LayerNorm):
|
593 |
+
nn.init.constant_(m.bias, 0)
|
594 |
+
nn.init.constant_(m.weight, 1.0)
|
595 |
+
|
596 |
+
@torch.jit.ignore
|
597 |
+
def no_weight_decay_keywords(self):
|
598 |
+
return {'attention_biases'}
|
599 |
+
|
600 |
+
def forward_features(self, x):
|
601 |
+
# x: (N, C, H, W)
|
602 |
+
x = self.patch_embed(x)
|
603 |
+
|
604 |
+
x = self.layers[0](x)
|
605 |
+
start_i = 1
|
606 |
+
|
607 |
+
interm_embeddings=[]
|
608 |
+
for i in range(start_i, len(self.layers)):
|
609 |
+
layer = self.layers[i]
|
610 |
+
x = layer(x)
|
611 |
+
# print('x shape:', x.shape, '---i:', i)
|
612 |
+
if i == 1:
|
613 |
+
interm_embeddings.append(x.view(x.shape[0], 64, 64, -1))
|
614 |
+
|
615 |
+
B,_,C=x.size()
|
616 |
+
x = x.view(B, 64, 64, C)
|
617 |
+
x=x.permute(0, 3, 1, 2)
|
618 |
+
x=self.neck(x)
|
619 |
+
return x, interm_embeddings
|
620 |
+
|
621 |
+
def forward(self, x):
|
622 |
+
x, interm_embeddings = self.forward_features(x)
|
623 |
+
#x = self.norm_head(x)
|
624 |
+
#x = self.head(x)
|
625 |
+
# print('come to here is correct'* 3)
|
626 |
+
return x, interm_embeddings
|
627 |
+
|
628 |
+
|
629 |
+
_checkpoint_url_format = \
|
630 |
+
'https://github.com/wkcn/TinyViT-model-zoo/releases/download/checkpoints/{}.pth'
|
631 |
+
_provided_checkpoints = {
|
632 |
+
'tiny_vit_5m_224': 'tiny_vit_5m_22kto1k_distill',
|
633 |
+
'tiny_vit_11m_224': 'tiny_vit_11m_22kto1k_distill',
|
634 |
+
'tiny_vit_21m_224': 'tiny_vit_21m_22kto1k_distill',
|
635 |
+
'tiny_vit_21m_384': 'tiny_vit_21m_22kto1k_384_distill',
|
636 |
+
'tiny_vit_21m_512': 'tiny_vit_21m_22kto1k_512_distill',
|
637 |
+
}
|
638 |
+
|
639 |
+
|
640 |
+
def register_tiny_vit_model(fn):
|
641 |
+
'''Register a TinyViT model
|
642 |
+
It is a wrapper of `register_model` with loading the pretrained checkpoint.
|
643 |
+
'''
|
644 |
+
def fn_wrapper(pretrained=False, **kwargs):
|
645 |
+
model = fn()
|
646 |
+
if pretrained:
|
647 |
+
model_name = fn.__name__
|
648 |
+
assert model_name in _provided_checkpoints, \
|
649 |
+
f'Sorry that the checkpoint `{model_name}` is not provided yet.'
|
650 |
+
url = _checkpoint_url_format.format(
|
651 |
+
_provided_checkpoints[model_name])
|
652 |
+
checkpoint = torch.hub.load_state_dict_from_url(
|
653 |
+
url=url,
|
654 |
+
map_location='cpu', check_hash=False,
|
655 |
+
)
|
656 |
+
model.load_state_dict(checkpoint['model'])
|
657 |
+
|
658 |
+
return model
|
659 |
+
|
660 |
+
# rename the name of fn_wrapper
|
661 |
+
fn_wrapper.__name__ = fn.__name__
|
662 |
+
return register_model(fn_wrapper)
|
663 |
+
|
664 |
+
|
665 |
+
@register_tiny_vit_model
|
666 |
+
def tiny_vit_5m_224(pretrained=False, num_classes=1000, drop_path_rate=0.0):
|
667 |
+
return TinyViT(
|
668 |
+
num_classes=num_classes,
|
669 |
+
embed_dims=[64, 128, 160, 320],
|
670 |
+
depths=[2, 2, 6, 2],
|
671 |
+
num_heads=[2, 4, 5, 10],
|
672 |
+
window_sizes=[7, 7, 14, 7],
|
673 |
+
drop_path_rate=drop_path_rate,
|
674 |
+
)
|
675 |
+
|
676 |
+
|
677 |
+
@register_tiny_vit_model
|
678 |
+
def tiny_vit_11m_224(pretrained=False, num_classes=1000, drop_path_rate=0.1):
|
679 |
+
return TinyViT(
|
680 |
+
num_classes=num_classes,
|
681 |
+
embed_dims=[64, 128, 256, 448],
|
682 |
+
depths=[2, 2, 6, 2],
|
683 |
+
num_heads=[2, 4, 8, 14],
|
684 |
+
window_sizes=[7, 7, 14, 7],
|
685 |
+
drop_path_rate=drop_path_rate,
|
686 |
+
)
|
687 |
+
|
688 |
+
|
689 |
+
@register_tiny_vit_model
|
690 |
+
def tiny_vit_21m_224(pretrained=False, num_classes=1000, drop_path_rate=0.2):
|
691 |
+
return TinyViT(
|
692 |
+
num_classes=num_classes,
|
693 |
+
embed_dims=[96, 192, 384, 576],
|
694 |
+
depths=[2, 2, 6, 2],
|
695 |
+
num_heads=[3, 6, 12, 18],
|
696 |
+
window_sizes=[7, 7, 14, 7],
|
697 |
+
drop_path_rate=drop_path_rate,
|
698 |
+
)
|
699 |
+
|
700 |
+
|
701 |
+
@register_tiny_vit_model
|
702 |
+
def tiny_vit_21m_384(pretrained=False, num_classes=1000, drop_path_rate=0.1):
|
703 |
+
return TinyViT(
|
704 |
+
img_size=384,
|
705 |
+
num_classes=num_classes,
|
706 |
+
embed_dims=[96, 192, 384, 576],
|
707 |
+
depths=[2, 2, 6, 2],
|
708 |
+
num_heads=[3, 6, 12, 18],
|
709 |
+
window_sizes=[12, 12, 24, 12],
|
710 |
+
drop_path_rate=drop_path_rate,
|
711 |
+
)
|
712 |
+
|
713 |
+
|
714 |
+
@register_tiny_vit_model
|
715 |
+
def tiny_vit_21m_512(pretrained=False, num_classes=1000, drop_path_rate=0.1):
|
716 |
+
return TinyViT(
|
717 |
+
img_size=512,
|
718 |
+
num_classes=num_classes,
|
719 |
+
embed_dims=[96, 192, 384, 576],
|
720 |
+
depths=[2, 2, 6, 2],
|
721 |
+
num_heads=[3, 6, 12, 18],
|
722 |
+
window_sizes=[16, 16, 32, 16],
|
723 |
+
drop_path_rate=drop_path_rate,
|
724 |
+
)
|
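The file above defines the TinyViT backbone that the efficient SAM variants in this repo use as their image encoder. For orientation, here is a minimal sketch of instantiating it with the TinyViT-5M configuration that setup_mobile_sam.py (next file) passes in; the import path and the random input are illustrative assumptions, not part of the diff.

import torch
from LightHQSAM.tiny_vit_sam import TinyViT  # path assumed; adjust to wherever this file lives

# TinyViT-5M configuration, matching the arguments used by the setup_*_sam helpers
encoder = TinyViT(
    img_size=1024, in_chans=3, num_classes=1000,
    embed_dims=[64, 128, 160, 320],
    depths=[2, 2, 6, 2],
    num_heads=[2, 4, 5, 10],
    window_sizes=[7, 7, 14, 7],
)

encoder.eval()  # eval mode: DropPath becomes a no-op and the attention biases are cached as `ab`
with torch.no_grad():
    feats, interm = encoder(torch.randn(1, 3, 1024, 1024))
print(feats.shape, len(interm))  # torch.Size([1, 256, 64, 64]) 1

The forward pass ends with the SAM-style neck, so the first output is the (B, 256, 64, 64) image embedding expected by the SAM mask decoder; the second is the list of intermediate features kept for HQ-SAM.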
EfficientSAM/MobileSAM/setup_mobile_sam.py
ADDED
@@ -0,0 +1,44 @@
from MobileSAM.tiny_vit_sam import TinyViT
from segment_anything.modeling import MaskDecoder, PromptEncoder, Sam, TwoWayTransformer


def setup_model():
    prompt_embed_dim = 256
    image_size = 1024
    vit_patch_size = 16
    image_embedding_size = image_size // vit_patch_size
    mobile_sam = Sam(
        image_encoder=TinyViT(
            img_size=1024, in_chans=3, num_classes=1000,
            embed_dims=[64, 128, 160, 320],
            depths=[2, 2, 6, 2],
            num_heads=[2, 4, 5, 10],
            window_sizes=[7, 7, 14, 7],
            mlp_ratio=4.,
            drop_rate=0.,
            drop_path_rate=0.0,
            use_checkpoint=False,
            mbconv_expand_ratio=4.0,
            local_conv_size=3,
            layer_lr_decay=0.8,
        ),
        prompt_encoder=PromptEncoder(
            embed_dim=prompt_embed_dim,
            image_embedding_size=(image_embedding_size, image_embedding_size),
            input_image_size=(image_size, image_size),
            mask_in_chans=16,
        ),
        mask_decoder=MaskDecoder(
            num_multimask_outputs=3,
            transformer=TwoWayTransformer(
                depth=2,
                embedding_dim=prompt_embed_dim,
                mlp_dim=2048,
                num_heads=8,
            ),
            transformer_dim=prompt_embed_dim,
            iou_head_depth=3,
            iou_head_hidden_dim=256,
        ),
        pixel_mean=[123.675, 116.28, 103.53],
        pixel_std=[58.395, 57.12, 57.375],
    )
    return mobile_sam
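For context, a minimal sketch of how this setup_model() helper is typically consumed, mirroring the grounded_mobile_sam.py demo in this repo: build the model, load the released MobileSAM weights, and wrap it in a SamPredictor. The checkpoint path and device selection below are assumptions for illustration.

import torch
from MobileSAM.setup_mobile_sam import setup_model
from segment_anything import SamPredictor

MOBILE_SAM_CHECKPOINT = "./mobile_sam.pt"  # assumed local path to the MobileSAM weights
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# build the TinyViT-based SAM and load the pretrained state dict
mobile_sam = setup_model()
mobile_sam.load_state_dict(torch.load(MOBILE_SAM_CHECKPOINT, map_location="cpu"), strict=True)
mobile_sam.to(device=DEVICE).eval()

# the predictor can then be prompted with the boxes produced by Grounding-DINO
predictor = SamPredictor(mobile_sam)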
EfficientSAM/MobileSAM/tiny_vit_sam.py
ADDED
@@ -0,0 +1,716 @@
1 |
+
# --------------------------------------------------------
|
2 |
+
# TinyViT Model Architecture
|
3 |
+
# Copyright (c) 2022 Microsoft
|
4 |
+
# Adapted from LeViT and Swin Transformer
|
5 |
+
# LeViT: (https://github.com/facebookresearch/levit)
|
6 |
+
# Swin: (https://github.com/microsoft/swin-transformer)
|
7 |
+
# Build the TinyViT Model
|
8 |
+
# --------------------------------------------------------
|
9 |
+
|
10 |
+
import itertools
|
11 |
+
import torch
|
12 |
+
import torch.nn as nn
|
13 |
+
import torch.nn.functional as F
|
14 |
+
import torch.utils.checkpoint as checkpoint
|
15 |
+
from timm.models.layers import DropPath as TimmDropPath,\
|
16 |
+
to_2tuple, trunc_normal_
|
17 |
+
from timm.models.registry import register_model
|
18 |
+
from typing import Tuple
|
19 |
+
|
20 |
+
|
21 |
+
class Conv2d_BN(torch.nn.Sequential):
|
22 |
+
def __init__(self, a, b, ks=1, stride=1, pad=0, dilation=1,
|
23 |
+
groups=1, bn_weight_init=1):
|
24 |
+
super().__init__()
|
25 |
+
self.add_module('c', torch.nn.Conv2d(
|
26 |
+
a, b, ks, stride, pad, dilation, groups, bias=False))
|
27 |
+
bn = torch.nn.BatchNorm2d(b)
|
28 |
+
torch.nn.init.constant_(bn.weight, bn_weight_init)
|
29 |
+
torch.nn.init.constant_(bn.bias, 0)
|
30 |
+
self.add_module('bn', bn)
|
31 |
+
|
32 |
+
@torch.no_grad()
|
33 |
+
def fuse(self):
|
34 |
+
c, bn = self._modules.values()
|
35 |
+
w = bn.weight / (bn.running_var + bn.eps)**0.5
|
36 |
+
w = c.weight * w[:, None, None, None]
|
37 |
+
b = bn.bias - bn.running_mean * bn.weight / \
|
38 |
+
(bn.running_var + bn.eps)**0.5
|
39 |
+
m = torch.nn.Conv2d(w.size(1) * self.c.groups, w.size(
|
40 |
+
0), w.shape[2:], stride=self.c.stride, padding=self.c.padding, dilation=self.c.dilation, groups=self.c.groups)
|
41 |
+
m.weight.data.copy_(w)
|
42 |
+
m.bias.data.copy_(b)
|
43 |
+
return m
|
44 |
+
|
45 |
+
|
46 |
+
class DropPath(TimmDropPath):
|
47 |
+
def __init__(self, drop_prob=None):
|
48 |
+
super().__init__(drop_prob=drop_prob)
|
49 |
+
self.drop_prob = drop_prob
|
50 |
+
|
51 |
+
def __repr__(self):
|
52 |
+
msg = super().__repr__()
|
53 |
+
msg += f'(drop_prob={self.drop_prob})'
|
54 |
+
return msg
|
55 |
+
|
56 |
+
|
57 |
+
class PatchEmbed(nn.Module):
|
58 |
+
def __init__(self, in_chans, embed_dim, resolution, activation):
|
59 |
+
super().__init__()
|
60 |
+
img_size: Tuple[int, int] = to_2tuple(resolution)
|
61 |
+
self.patches_resolution = (img_size[0] // 4, img_size[1] // 4)
|
62 |
+
self.num_patches = self.patches_resolution[0] * \
|
63 |
+
self.patches_resolution[1]
|
64 |
+
self.in_chans = in_chans
|
65 |
+
self.embed_dim = embed_dim
|
66 |
+
n = embed_dim
|
67 |
+
self.seq = nn.Sequential(
|
68 |
+
Conv2d_BN(in_chans, n // 2, 3, 2, 1),
|
69 |
+
activation(),
|
70 |
+
Conv2d_BN(n // 2, n, 3, 2, 1),
|
71 |
+
)
|
72 |
+
|
73 |
+
def forward(self, x):
|
74 |
+
return self.seq(x)
|
75 |
+
|
76 |
+
|
77 |
+
class MBConv(nn.Module):
|
78 |
+
def __init__(self, in_chans, out_chans, expand_ratio,
|
79 |
+
activation, drop_path):
|
80 |
+
super().__init__()
|
81 |
+
self.in_chans = in_chans
|
82 |
+
self.hidden_chans = int(in_chans * expand_ratio)
|
83 |
+
self.out_chans = out_chans
|
84 |
+
|
85 |
+
self.conv1 = Conv2d_BN(in_chans, self.hidden_chans, ks=1)
|
86 |
+
self.act1 = activation()
|
87 |
+
|
88 |
+
self.conv2 = Conv2d_BN(self.hidden_chans, self.hidden_chans,
|
89 |
+
ks=3, stride=1, pad=1, groups=self.hidden_chans)
|
90 |
+
self.act2 = activation()
|
91 |
+
|
92 |
+
self.conv3 = Conv2d_BN(
|
93 |
+
self.hidden_chans, out_chans, ks=1, bn_weight_init=0.0)
|
94 |
+
self.act3 = activation()
|
95 |
+
|
96 |
+
self.drop_path = DropPath(
|
97 |
+
drop_path) if drop_path > 0. else nn.Identity()
|
98 |
+
|
99 |
+
def forward(self, x):
|
100 |
+
shortcut = x
|
101 |
+
|
102 |
+
x = self.conv1(x)
|
103 |
+
x = self.act1(x)
|
104 |
+
|
105 |
+
x = self.conv2(x)
|
106 |
+
x = self.act2(x)
|
107 |
+
|
108 |
+
x = self.conv3(x)
|
109 |
+
|
110 |
+
x = self.drop_path(x)
|
111 |
+
|
112 |
+
x += shortcut
|
113 |
+
x = self.act3(x)
|
114 |
+
|
115 |
+
return x
|
116 |
+
|
117 |
+
|
118 |
+
class PatchMerging(nn.Module):
|
119 |
+
def __init__(self, input_resolution, dim, out_dim, activation):
|
120 |
+
super().__init__()
|
121 |
+
|
122 |
+
self.input_resolution = input_resolution
|
123 |
+
self.dim = dim
|
124 |
+
self.out_dim = out_dim
|
125 |
+
self.act = activation()
|
126 |
+
self.conv1 = Conv2d_BN(dim, out_dim, 1, 1, 0)
|
127 |
+
stride_c=2
|
128 |
+
if(out_dim==320 or out_dim==448 or out_dim==576):
|
129 |
+
stride_c=1
|
130 |
+
self.conv2 = Conv2d_BN(out_dim, out_dim, 3, stride_c, 1, groups=out_dim)
|
131 |
+
self.conv3 = Conv2d_BN(out_dim, out_dim, 1, 1, 0)
|
132 |
+
|
133 |
+
def forward(self, x):
|
134 |
+
if x.ndim == 3:
|
135 |
+
H, W = self.input_resolution
|
136 |
+
B = len(x)
|
137 |
+
# (B, C, H, W)
|
138 |
+
x = x.view(B, H, W, -1).permute(0, 3, 1, 2)
|
139 |
+
|
140 |
+
x = self.conv1(x)
|
141 |
+
x = self.act(x)
|
142 |
+
|
143 |
+
x = self.conv2(x)
|
144 |
+
x = self.act(x)
|
145 |
+
x = self.conv3(x)
|
146 |
+
x = x.flatten(2).transpose(1, 2)
|
147 |
+
return x
|
148 |
+
|
149 |
+
|
150 |
+
class ConvLayer(nn.Module):
|
151 |
+
def __init__(self, dim, input_resolution, depth,
|
152 |
+
activation,
|
153 |
+
drop_path=0., downsample=None, use_checkpoint=False,
|
154 |
+
out_dim=None,
|
155 |
+
conv_expand_ratio=4.,
|
156 |
+
):
|
157 |
+
|
158 |
+
super().__init__()
|
159 |
+
self.dim = dim
|
160 |
+
self.input_resolution = input_resolution
|
161 |
+
self.depth = depth
|
162 |
+
self.use_checkpoint = use_checkpoint
|
163 |
+
|
164 |
+
# build blocks
|
165 |
+
self.blocks = nn.ModuleList([
|
166 |
+
MBConv(dim, dim, conv_expand_ratio, activation,
|
167 |
+
drop_path[i] if isinstance(drop_path, list) else drop_path,
|
168 |
+
)
|
169 |
+
for i in range(depth)])
|
170 |
+
|
171 |
+
# patch merging layer
|
172 |
+
if downsample is not None:
|
173 |
+
self.downsample = downsample(
|
174 |
+
input_resolution, dim=dim, out_dim=out_dim, activation=activation)
|
175 |
+
else:
|
176 |
+
self.downsample = None
|
177 |
+
|
178 |
+
def forward(self, x):
|
179 |
+
for blk in self.blocks:
|
180 |
+
if self.use_checkpoint:
|
181 |
+
x = checkpoint.checkpoint(blk, x)
|
182 |
+
else:
|
183 |
+
x = blk(x)
|
184 |
+
if self.downsample is not None:
|
185 |
+
x = self.downsample(x)
|
186 |
+
return x
|
187 |
+
|
188 |
+
|
189 |
+
class Mlp(nn.Module):
|
190 |
+
def __init__(self, in_features, hidden_features=None,
|
191 |
+
out_features=None, act_layer=nn.GELU, drop=0.):
|
192 |
+
super().__init__()
|
193 |
+
out_features = out_features or in_features
|
194 |
+
hidden_features = hidden_features or in_features
|
195 |
+
self.norm = nn.LayerNorm(in_features)
|
196 |
+
self.fc1 = nn.Linear(in_features, hidden_features)
|
197 |
+
self.fc2 = nn.Linear(hidden_features, out_features)
|
198 |
+
self.act = act_layer()
|
199 |
+
self.drop = nn.Dropout(drop)
|
200 |
+
|
201 |
+
def forward(self, x):
|
202 |
+
x = self.norm(x)
|
203 |
+
|
204 |
+
x = self.fc1(x)
|
205 |
+
x = self.act(x)
|
206 |
+
x = self.drop(x)
|
207 |
+
x = self.fc2(x)
|
208 |
+
x = self.drop(x)
|
209 |
+
return x
|
210 |
+
|
211 |
+
|
212 |
+
class Attention(torch.nn.Module):
|
213 |
+
def __init__(self, dim, key_dim, num_heads=8,
|
214 |
+
attn_ratio=4,
|
215 |
+
resolution=(14, 14),
|
216 |
+
):
|
217 |
+
super().__init__()
|
218 |
+
# (h, w)
|
219 |
+
assert isinstance(resolution, tuple) and len(resolution) == 2
|
220 |
+
self.num_heads = num_heads
|
221 |
+
self.scale = key_dim ** -0.5
|
222 |
+
self.key_dim = key_dim
|
223 |
+
self.nh_kd = nh_kd = key_dim * num_heads
|
224 |
+
self.d = int(attn_ratio * key_dim)
|
225 |
+
self.dh = int(attn_ratio * key_dim) * num_heads
|
226 |
+
self.attn_ratio = attn_ratio
|
227 |
+
h = self.dh + nh_kd * 2
|
228 |
+
|
229 |
+
self.norm = nn.LayerNorm(dim)
|
230 |
+
self.qkv = nn.Linear(dim, h)
|
231 |
+
self.proj = nn.Linear(self.dh, dim)
|
232 |
+
|
233 |
+
points = list(itertools.product(
|
234 |
+
range(resolution[0]), range(resolution[1])))
|
235 |
+
N = len(points)
|
236 |
+
attention_offsets = {}
|
237 |
+
idxs = []
|
238 |
+
for p1 in points:
|
239 |
+
for p2 in points:
|
240 |
+
offset = (abs(p1[0] - p2[0]), abs(p1[1] - p2[1]))
|
241 |
+
if offset not in attention_offsets:
|
242 |
+
attention_offsets[offset] = len(attention_offsets)
|
243 |
+
idxs.append(attention_offsets[offset])
|
244 |
+
self.attention_biases = torch.nn.Parameter(
|
245 |
+
torch.zeros(num_heads, len(attention_offsets)))
|
246 |
+
self.register_buffer('attention_bias_idxs',
|
247 |
+
torch.LongTensor(idxs).view(N, N),
|
248 |
+
persistent=False)
|
249 |
+
|
250 |
+
@torch.no_grad()
|
251 |
+
def train(self, mode=True):
|
252 |
+
super().train(mode)
|
253 |
+
if mode and hasattr(self, 'ab'):
|
254 |
+
del self.ab
|
255 |
+
else:
|
256 |
+
self.ab = self.attention_biases[:, self.attention_bias_idxs]
|
257 |
+
|
258 |
+
def forward(self, x): # x (B,N,C)
|
259 |
+
B, N, _ = x.shape
|
260 |
+
|
261 |
+
# Normalization
|
262 |
+
x = self.norm(x)
|
263 |
+
|
264 |
+
qkv = self.qkv(x)
|
265 |
+
# (B, N, num_heads, d)
|
266 |
+
q, k, v = qkv.view(B, N, self.num_heads, -
|
267 |
+
1).split([self.key_dim, self.key_dim, self.d], dim=3)
|
268 |
+
# (B, num_heads, N, d)
|
269 |
+
q = q.permute(0, 2, 1, 3)
|
270 |
+
k = k.permute(0, 2, 1, 3)
|
271 |
+
v = v.permute(0, 2, 1, 3)
|
272 |
+
|
273 |
+
attn = (
|
274 |
+
(q @ k.transpose(-2, -1)) * self.scale
|
275 |
+
+
|
276 |
+
(self.attention_biases[:, self.attention_bias_idxs]
|
277 |
+
if self.training else self.ab)
|
278 |
+
)
|
279 |
+
attn = attn.softmax(dim=-1)
|
280 |
+
x = (attn @ v).transpose(1, 2).reshape(B, N, self.dh)
|
281 |
+
x = self.proj(x)
|
282 |
+
return x
|
283 |
+
|
284 |
+
|
285 |
+
class TinyViTBlock(nn.Module):
|
286 |
+
r""" TinyViT Block.
|
287 |
+
|
288 |
+
Args:
|
289 |
+
dim (int): Number of input channels.
|
290 |
+
input_resolution (tuple[int, int]): Input resolution.
|
291 |
+
num_heads (int): Number of attention heads.
|
292 |
+
window_size (int): Window size.
|
293 |
+
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
|
294 |
+
drop (float, optional): Dropout rate. Default: 0.0
|
295 |
+
drop_path (float, optional): Stochastic depth rate. Default: 0.0
|
296 |
+
local_conv_size (int): the kernel size of the convolution between
|
297 |
+
Attention and MLP. Default: 3
|
298 |
+
activation: the activation function. Default: nn.GELU
|
299 |
+
"""
|
300 |
+
|
301 |
+
def __init__(self, dim, input_resolution, num_heads, window_size=7,
|
302 |
+
mlp_ratio=4., drop=0., drop_path=0.,
|
303 |
+
local_conv_size=3,
|
304 |
+
activation=nn.GELU,
|
305 |
+
):
|
306 |
+
super().__init__()
|
307 |
+
self.dim = dim
|
308 |
+
self.input_resolution = input_resolution
|
309 |
+
self.num_heads = num_heads
|
310 |
+
assert window_size > 0, 'window_size must be greater than 0'
|
311 |
+
self.window_size = window_size
|
312 |
+
self.mlp_ratio = mlp_ratio
|
313 |
+
|
314 |
+
self.drop_path = DropPath(
|
315 |
+
drop_path) if drop_path > 0. else nn.Identity()
|
316 |
+
|
317 |
+
assert dim % num_heads == 0, 'dim must be divisible by num_heads'
|
318 |
+
head_dim = dim // num_heads
|
319 |
+
|
320 |
+
window_resolution = (window_size, window_size)
|
321 |
+
self.attn = Attention(dim, head_dim, num_heads,
|
322 |
+
attn_ratio=1, resolution=window_resolution)
|
323 |
+
|
324 |
+
mlp_hidden_dim = int(dim * mlp_ratio)
|
325 |
+
mlp_activation = activation
|
326 |
+
self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim,
|
327 |
+
act_layer=mlp_activation, drop=drop)
|
328 |
+
|
329 |
+
pad = local_conv_size // 2
|
330 |
+
self.local_conv = Conv2d_BN(
|
331 |
+
dim, dim, ks=local_conv_size, stride=1, pad=pad, groups=dim)
|
332 |
+
|
333 |
+
def forward(self, x):
|
334 |
+
H, W = self.input_resolution
|
335 |
+
B, L, C = x.shape
|
336 |
+
assert L == H * W, "input feature has wrong size"
|
337 |
+
res_x = x
|
338 |
+
if H == self.window_size and W == self.window_size:
|
339 |
+
x = self.attn(x)
|
340 |
+
else:
|
341 |
+
x = x.view(B, H, W, C)
|
342 |
+
pad_b = (self.window_size - H %
|
343 |
+
self.window_size) % self.window_size
|
344 |
+
pad_r = (self.window_size - W %
|
345 |
+
self.window_size) % self.window_size
|
346 |
+
padding = pad_b > 0 or pad_r > 0
|
347 |
+
|
348 |
+
if padding:
|
349 |
+
x = F.pad(x, (0, 0, 0, pad_r, 0, pad_b))
|
350 |
+
|
351 |
+
pH, pW = H + pad_b, W + pad_r
|
352 |
+
nH = pH // self.window_size
|
353 |
+
nW = pW // self.window_size
|
354 |
+
# window partition
|
355 |
+
x = x.view(B, nH, self.window_size, nW, self.window_size, C).transpose(2, 3).reshape(
|
356 |
+
B * nH * nW, self.window_size * self.window_size, C)
|
357 |
+
x = self.attn(x)
|
358 |
+
# window reverse
|
359 |
+
x = x.view(B, nH, nW, self.window_size, self.window_size,
|
360 |
+
C).transpose(2, 3).reshape(B, pH, pW, C)
|
361 |
+
|
362 |
+
if padding:
|
363 |
+
x = x[:, :H, :W].contiguous()
|
364 |
+
|
365 |
+
x = x.view(B, L, C)
|
366 |
+
|
367 |
+
x = res_x + self.drop_path(x)
|
368 |
+
|
369 |
+
x = x.transpose(1, 2).reshape(B, C, H, W)
|
370 |
+
x = self.local_conv(x)
|
371 |
+
x = x.view(B, C, L).transpose(1, 2)
|
372 |
+
|
373 |
+
x = x + self.drop_path(self.mlp(x))
|
374 |
+
return x
|
375 |
+
|
376 |
+
def extra_repr(self) -> str:
|
377 |
+
return f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " \
|
378 |
+
f"window_size={self.window_size}, mlp_ratio={self.mlp_ratio}"
|
379 |
+
|
380 |
+
|
381 |
+
class BasicLayer(nn.Module):
|
382 |
+
""" A basic TinyViT layer for one stage.
|
383 |
+
|
384 |
+
Args:
|
385 |
+
dim (int): Number of input channels.
|
386 |
+
input_resolution (tuple[int]): Input resolution.
|
387 |
+
depth (int): Number of blocks.
|
388 |
+
num_heads (int): Number of attention heads.
|
389 |
+
window_size (int): Local window size.
|
390 |
+
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
|
391 |
+
drop (float, optional): Dropout rate. Default: 0.0
|
392 |
+
drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
|
393 |
+
downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
|
394 |
+
use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
|
395 |
+
local_conv_size: the kernel size of the depthwise convolution between attention and MLP. Default: 3
|
396 |
+
activation: the activation function. Default: nn.GELU
|
397 |
+
out_dim: the output dimension of the layer. Default: dim
|
398 |
+
"""
|
399 |
+
|
400 |
+
def __init__(self, dim, input_resolution, depth, num_heads, window_size,
|
401 |
+
mlp_ratio=4., drop=0.,
|
402 |
+
drop_path=0., downsample=None, use_checkpoint=False,
|
403 |
+
local_conv_size=3,
|
404 |
+
activation=nn.GELU,
|
405 |
+
out_dim=None,
|
406 |
+
):
|
407 |
+
|
408 |
+
super().__init__()
|
409 |
+
self.dim = dim
|
410 |
+
self.input_resolution = input_resolution
|
411 |
+
self.depth = depth
|
412 |
+
self.use_checkpoint = use_checkpoint
|
413 |
+
|
414 |
+
# build blocks
|
415 |
+
self.blocks = nn.ModuleList([
|
416 |
+
TinyViTBlock(dim=dim, input_resolution=input_resolution,
|
417 |
+
num_heads=num_heads, window_size=window_size,
|
418 |
+
mlp_ratio=mlp_ratio,
|
419 |
+
drop=drop,
|
420 |
+
drop_path=drop_path[i] if isinstance(
|
421 |
+
drop_path, list) else drop_path,
|
422 |
+
local_conv_size=local_conv_size,
|
423 |
+
activation=activation,
|
424 |
+
)
|
425 |
+
for i in range(depth)])
|
426 |
+
|
427 |
+
# patch merging layer
|
428 |
+
if downsample is not None:
|
429 |
+
self.downsample = downsample(
|
430 |
+
input_resolution, dim=dim, out_dim=out_dim, activation=activation)
|
431 |
+
else:
|
432 |
+
self.downsample = None
|
433 |
+
|
434 |
+
def forward(self, x):
|
435 |
+
for blk in self.blocks:
|
436 |
+
if self.use_checkpoint:
|
437 |
+
x = checkpoint.checkpoint(blk, x)
|
438 |
+
else:
|
439 |
+
x = blk(x)
|
440 |
+
if self.downsample is not None:
|
441 |
+
x = self.downsample(x)
|
442 |
+
return x
|
443 |
+
|
444 |
+
def extra_repr(self) -> str:
|
445 |
+
return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}"
|
446 |
+
|
447 |
+
class LayerNorm2d(nn.Module):
|
448 |
+
def __init__(self, num_channels: int, eps: float = 1e-6) -> None:
|
449 |
+
super().__init__()
|
450 |
+
self.weight = nn.Parameter(torch.ones(num_channels))
|
451 |
+
self.bias = nn.Parameter(torch.zeros(num_channels))
|
452 |
+
self.eps = eps
|
453 |
+
|
454 |
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
455 |
+
u = x.mean(1, keepdim=True)
|
456 |
+
s = (x - u).pow(2).mean(1, keepdim=True)
|
457 |
+
x = (x - u) / torch.sqrt(s + self.eps)
|
458 |
+
x = self.weight[:, None, None] * x + self.bias[:, None, None]
|
459 |
+
return x
|
460 |
+
class TinyViT(nn.Module):
|
461 |
+
def __init__(self, img_size=224, in_chans=3, num_classes=1000,
|
462 |
+
embed_dims=[96, 192, 384, 768], depths=[2, 2, 6, 2],
|
463 |
+
num_heads=[3, 6, 12, 24],
|
464 |
+
window_sizes=[7, 7, 14, 7],
|
465 |
+
mlp_ratio=4.,
|
466 |
+
drop_rate=0.,
|
467 |
+
drop_path_rate=0.1,
|
468 |
+
use_checkpoint=False,
|
469 |
+
mbconv_expand_ratio=4.0,
|
470 |
+
local_conv_size=3,
|
471 |
+
layer_lr_decay=1.0,
|
472 |
+
):
|
473 |
+
super().__init__()
|
474 |
+
self.img_size=img_size
|
475 |
+
self.num_classes = num_classes
|
476 |
+
self.depths = depths
|
477 |
+
self.num_layers = len(depths)
|
478 |
+
self.mlp_ratio = mlp_ratio
|
479 |
+
|
480 |
+
activation = nn.GELU
|
481 |
+
|
482 |
+
self.patch_embed = PatchEmbed(in_chans=in_chans,
|
483 |
+
embed_dim=embed_dims[0],
|
484 |
+
resolution=img_size,
|
485 |
+
activation=activation)
|
486 |
+
|
487 |
+
patches_resolution = self.patch_embed.patches_resolution
|
488 |
+
self.patches_resolution = patches_resolution
|
489 |
+
|
490 |
+
# stochastic depth
|
491 |
+
dpr = [x.item() for x in torch.linspace(0, drop_path_rate,
|
492 |
+
sum(depths))] # stochastic depth decay rule
|
493 |
+
|
494 |
+
# build layers
|
495 |
+
self.layers = nn.ModuleList()
|
496 |
+
for i_layer in range(self.num_layers):
|
497 |
+
kwargs = dict(dim=embed_dims[i_layer],
|
498 |
+
input_resolution=(patches_resolution[0] // (2 ** (i_layer-1 if i_layer == 3 else i_layer)),
|
499 |
+
patches_resolution[1] // (2 ** (i_layer-1 if i_layer == 3 else i_layer))),
|
500 |
+
# input_resolution=(patches_resolution[0] // (2 ** i_layer),
|
501 |
+
# patches_resolution[1] // (2 ** i_layer)),
|
502 |
+
depth=depths[i_layer],
|
503 |
+
drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
|
504 |
+
downsample=PatchMerging if (
|
505 |
+
i_layer < self.num_layers - 1) else None,
|
506 |
+
use_checkpoint=use_checkpoint,
|
507 |
+
out_dim=embed_dims[min(
|
508 |
+
i_layer + 1, len(embed_dims) - 1)],
|
509 |
+
activation=activation,
|
510 |
+
)
|
511 |
+
if i_layer == 0:
|
512 |
+
layer = ConvLayer(
|
513 |
+
conv_expand_ratio=mbconv_expand_ratio,
|
514 |
+
**kwargs,
|
515 |
+
)
|
516 |
+
else:
|
517 |
+
layer = BasicLayer(
|
518 |
+
num_heads=num_heads[i_layer],
|
519 |
+
window_size=window_sizes[i_layer],
|
520 |
+
mlp_ratio=self.mlp_ratio,
|
521 |
+
drop=drop_rate,
|
522 |
+
local_conv_size=local_conv_size,
|
523 |
+
**kwargs)
|
524 |
+
self.layers.append(layer)
|
525 |
+
|
526 |
+
# Classifier head
|
527 |
+
self.norm_head = nn.LayerNorm(embed_dims[-1])
|
528 |
+
self.head = nn.Linear(
|
529 |
+
embed_dims[-1], num_classes) if num_classes > 0 else torch.nn.Identity()
|
530 |
+
|
531 |
+
# init weights
|
532 |
+
self.apply(self._init_weights)
|
533 |
+
self.set_layer_lr_decay(layer_lr_decay)
|
534 |
+
self.neck = nn.Sequential(
|
535 |
+
nn.Conv2d(
|
536 |
+
embed_dims[-1],
|
537 |
+
256,
|
538 |
+
kernel_size=1,
|
539 |
+
bias=False,
|
540 |
+
),
|
541 |
+
LayerNorm2d(256),
|
542 |
+
nn.Conv2d(
|
543 |
+
256,
|
544 |
+
256,
|
545 |
+
kernel_size=3,
|
546 |
+
padding=1,
|
547 |
+
bias=False,
|
548 |
+
),
|
549 |
+
LayerNorm2d(256),
|
550 |
+
)
|
551 |
+
def set_layer_lr_decay(self, layer_lr_decay):
|
552 |
+
decay_rate = layer_lr_decay
|
553 |
+
|
554 |
+
# layers -> blocks (depth)
|
555 |
+
depth = sum(self.depths)
|
556 |
+
lr_scales = [decay_rate ** (depth - i - 1) for i in range(depth)]
|
557 |
+
print("LR SCALES:", lr_scales)
|
558 |
+
|
559 |
+
def _set_lr_scale(m, scale):
|
560 |
+
for p in m.parameters():
|
561 |
+
p.lr_scale = scale
|
562 |
+
|
563 |
+
self.patch_embed.apply(lambda x: _set_lr_scale(x, lr_scales[0]))
|
564 |
+
i = 0
|
565 |
+
for layer in self.layers:
|
566 |
+
for block in layer.blocks:
|
567 |
+
block.apply(lambda x: _set_lr_scale(x, lr_scales[i]))
|
568 |
+
i += 1
|
569 |
+
if layer.downsample is not None:
|
570 |
+
layer.downsample.apply(
|
571 |
+
lambda x: _set_lr_scale(x, lr_scales[i - 1]))
|
572 |
+
assert i == depth
|
573 |
+
for m in [self.norm_head, self.head]:
|
574 |
+
m.apply(lambda x: _set_lr_scale(x, lr_scales[-1]))
|
575 |
+
|
576 |
+
for k, p in self.named_parameters():
|
577 |
+
p.param_name = k
|
578 |
+
|
579 |
+
def _check_lr_scale(m):
|
580 |
+
for p in m.parameters():
|
581 |
+
assert hasattr(p, 'lr_scale'), p.param_name
|
582 |
+
|
583 |
+
self.apply(_check_lr_scale)
|
584 |
+
|
585 |
+
def _init_weights(self, m):
|
586 |
+
if isinstance(m, nn.Linear):
|
587 |
+
trunc_normal_(m.weight, std=.02)
|
588 |
+
if isinstance(m, nn.Linear) and m.bias is not None:
|
589 |
+
nn.init.constant_(m.bias, 0)
|
590 |
+
elif isinstance(m, nn.LayerNorm):
|
591 |
+
nn.init.constant_(m.bias, 0)
|
592 |
+
nn.init.constant_(m.weight, 1.0)
|
593 |
+
|
594 |
+
@torch.jit.ignore
|
595 |
+
def no_weight_decay_keywords(self):
|
596 |
+
return {'attention_biases'}
|
597 |
+
|
598 |
+
def forward_features(self, x):
|
599 |
+
# x: (N, C, H, W)
|
600 |
+
x = self.patch_embed(x)
|
601 |
+
|
602 |
+
x = self.layers[0](x)
|
603 |
+
start_i = 1
|
604 |
+
|
605 |
+
for i in range(start_i, len(self.layers)):
|
606 |
+
layer = self.layers[i]
|
607 |
+
x = layer(x)
|
608 |
+
B,_,C=x.size()
|
609 |
+
x = x.view(B, 64, 64, C)
|
610 |
+
x=x.permute(0, 3, 1, 2)
|
611 |
+
x=self.neck(x)
|
612 |
+
return x
|
613 |
+
|
614 |
+
def forward(self, x):
|
615 |
+
x = self.forward_features(x)
|
616 |
+
|
617 |
+
# We have made some hack changes here to make it compatible with SAM-HQ
|
618 |
+
return x, None
|
619 |
+
|
620 |
+
|
621 |
+
_checkpoint_url_format = \
|
622 |
+
'https://github.com/wkcn/TinyViT-model-zoo/releases/download/checkpoints/{}.pth'
|
623 |
+
_provided_checkpoints = {
|
624 |
+
'tiny_vit_5m_224': 'tiny_vit_5m_22kto1k_distill',
|
625 |
+
'tiny_vit_11m_224': 'tiny_vit_11m_22kto1k_distill',
|
626 |
+
'tiny_vit_21m_224': 'tiny_vit_21m_22kto1k_distill',
|
627 |
+
'tiny_vit_21m_384': 'tiny_vit_21m_22kto1k_384_distill',
|
628 |
+
'tiny_vit_21m_512': 'tiny_vit_21m_22kto1k_512_distill',
|
629 |
+
}
|
630 |
+
|
631 |
+
|
632 |
+
def register_tiny_vit_model(fn):
|
633 |
+
'''Register a TinyViT model
|
634 |
+
It is a wrapper of `register_model` with loading the pretrained checkpoint.
|
635 |
+
'''
|
636 |
+
def fn_wrapper(pretrained=False, **kwargs):
|
637 |
+
model = fn()
|
638 |
+
if pretrained:
|
639 |
+
model_name = fn.__name__
|
640 |
+
assert model_name in _provided_checkpoints, \
|
641 |
+
f'Sorry that the checkpoint `{model_name}` is not provided yet.'
|
642 |
+
url = _checkpoint_url_format.format(
|
643 |
+
_provided_checkpoints[model_name])
|
644 |
+
checkpoint = torch.hub.load_state_dict_from_url(
|
645 |
+
url=url,
|
646 |
+
map_location='cpu', check_hash=False,
|
647 |
+
)
|
648 |
+
model.load_state_dict(checkpoint['model'])
|
649 |
+
|
650 |
+
return model
|
651 |
+
|
652 |
+
# rename the name of fn_wrapper
|
653 |
+
fn_wrapper.__name__ = fn.__name__
|
654 |
+
return register_model(fn_wrapper)
|
655 |
+
|
656 |
+
|
657 |
+
@register_tiny_vit_model
|
658 |
+
def tiny_vit_5m_224(pretrained=False, num_classes=1000, drop_path_rate=0.0):
|
659 |
+
return TinyViT(
|
660 |
+
num_classes=num_classes,
|
661 |
+
embed_dims=[64, 128, 160, 320],
|
662 |
+
depths=[2, 2, 6, 2],
|
663 |
+
num_heads=[2, 4, 5, 10],
|
664 |
+
window_sizes=[7, 7, 14, 7],
|
665 |
+
drop_path_rate=drop_path_rate,
|
666 |
+
)
|
667 |
+
|
668 |
+
|
669 |
+
@register_tiny_vit_model
|
670 |
+
def tiny_vit_11m_224(pretrained=False, num_classes=1000, drop_path_rate=0.1):
|
671 |
+
return TinyViT(
|
672 |
+
num_classes=num_classes,
|
673 |
+
embed_dims=[64, 128, 256, 448],
|
674 |
+
depths=[2, 2, 6, 2],
|
675 |
+
num_heads=[2, 4, 8, 14],
|
676 |
+
window_sizes=[7, 7, 14, 7],
|
677 |
+
drop_path_rate=drop_path_rate,
|
678 |
+
)
|
679 |
+
|
680 |
+
|
681 |
+
@register_tiny_vit_model
|
682 |
+
def tiny_vit_21m_224(pretrained=False, num_classes=1000, drop_path_rate=0.2):
|
683 |
+
return TinyViT(
|
684 |
+
num_classes=num_classes,
|
685 |
+
embed_dims=[96, 192, 384, 576],
|
686 |
+
depths=[2, 2, 6, 2],
|
687 |
+
num_heads=[3, 6, 12, 18],
|
688 |
+
window_sizes=[7, 7, 14, 7],
|
689 |
+
drop_path_rate=drop_path_rate,
|
690 |
+
)
|
691 |
+
|
692 |
+
|
693 |
+
@register_tiny_vit_model
|
694 |
+
def tiny_vit_21m_384(pretrained=False, num_classes=1000, drop_path_rate=0.1):
|
695 |
+
return TinyViT(
|
696 |
+
img_size=384,
|
697 |
+
num_classes=num_classes,
|
698 |
+
embed_dims=[96, 192, 384, 576],
|
699 |
+
depths=[2, 2, 6, 2],
|
700 |
+
num_heads=[3, 6, 12, 18],
|
701 |
+
window_sizes=[12, 12, 24, 12],
|
702 |
+
drop_path_rate=drop_path_rate,
|
703 |
+
)
|
704 |
+
|
705 |
+
|
706 |
+
@register_tiny_vit_model
|
707 |
+
def tiny_vit_21m_512(pretrained=False, num_classes=1000, drop_path_rate=0.1):
|
708 |
+
return TinyViT(
|
709 |
+
img_size=512,
|
710 |
+
num_classes=num_classes,
|
711 |
+
embed_dims=[96, 192, 384, 576],
|
712 |
+
depths=[2, 2, 6, 2],
|
713 |
+
num_heads=[3, 6, 12, 18],
|
714 |
+
window_sizes=[16, 16, 32, 16],
|
715 |
+
drop_path_rate=drop_path_rate,
|
716 |
+
)
|
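Both TinyViT copies above ship a Conv2d_BN.fuse() helper that folds the BatchNorm statistics into the preceding convolution for inference-only deployment, even though nothing in this diff calls it. A minimal sketch of that fusion, with the import path assumed; in eval mode the fused nn.Conv2d reproduces the original output:

import torch
from MobileSAM.tiny_vit_sam import Conv2d_BN  # path assumed from this repo's layout

block = Conv2d_BN(16, 32, ks=3, stride=1, pad=1).eval()  # conv (bias=False) followed by BatchNorm2d

x = torch.randn(1, 16, 64, 64)
with torch.no_grad():
    y_ref = block(x)
    fused = block.fuse()   # plain nn.Conv2d with the BN scale/shift folded into weight and bias
    y_fused = fused(x)

print(torch.allclose(y_ref, y_fused, atol=1e-5))  # True: fusion preserves eval-mode outputs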
EfficientSAM/README.md
ADDED
@@ -0,0 +1,194 @@
1 |
+
## Efficient Grounded-SAM
|
2 |
+
|
3 |
+
We're going to combine [Grounding-DINO](https://github.com/IDEA-Research/GroundingDINO) with efficient SAM variants for faster annotating.
|
4 |
+
|
5 |
+
<!-- Combining [Grounding-DINO](https://github.com/IDEA-Research/GroundingDINO) and [Fast-SAM](https://github.com/CASIA-IVA-Lab/FastSAM) for faster zero-shot detect and segment anything. -->
|
6 |
+
|
7 |
+
|
8 |
+
### Table of Contents
|
9 |
+
- [Installation](#installation)
|
10 |
+
- [Efficient SAM Series](#efficient-sams)
|
11 |
+
- [Run Grounded-FastSAM Demo](#run-grounded-fastsam-demo)
|
12 |
+
- [Run Grounded-MobileSAM Demo](#run-grounded-mobilesam-demo)
|
13 |
+
- [Run Grounded-LightHQSAM Demo](#run-grounded-light-hqsam-demo)
|
14 |
+
- [Run Grounded-Efficient-SAM Demo](#run-grounded-efficient-sam-demo)
|
15 |
+
- [Run Grounded-Edge-SAM Demo](#run-grounded-edge-sam-demo)
|
16 |
+
- [Run Grounded-RepViT-SAM Demo](#run-grounded-repvit-sam-demo)
|
17 |
+
|
18 |
+
|
19 |
+
### Installation
|
20 |
+
|
21 |
+
- Install [Grounded-SAM](https://github.com/IDEA-Research/Grounded-Segment-Anything#installation)
|
22 |
+
|
23 |
+
- Install [Fast-SAM](https://github.com/CASIA-IVA-Lab/FastSAM#installation)
|
24 |
+
|
25 |
+
- Note that we use the same demo image across the demos in order to compare the inference results of the different Efficient-SAM variants.
|
26 |
+
|
27 |
+
### Efficient SAMs
|
28 |
+
Here's the list of Efficient SAM variants:
|
29 |
+
|
30 |
+
<div align="center">
|
31 |
+
|
32 |
+
| Title | Intro | Description | Links |
|
33 |
+
|:----:|:----:|:----:|:----:|
|
34 |
+
| [FastSAM](https://arxiv.org/pdf/2306.12156.pdf) | ![](https://github.com/CASIA-IVA-Lab/FastSAM/blob/main/assets/Overview.png) | The Fast Segment Anything Model (FastSAM) is a CNN-based Segment Anything Model trained on only 2% of the SA-1B dataset published by the SAM authors. FastSAM achieves performance comparable to SAM at 50× higher run-time speed. | [[Github](https://github.com/CASIA-IVA-Lab/FastSAM)] [[Demo](https://huggingface.co/spaces/An-619/FastSAM)] |
|
35 |
+
| [MobileSAM](https://arxiv.org/pdf/2306.14289.pdf) | ![](https://github.com/ChaoningZhang/MobileSAM/blob/master/assets/model_diagram.jpg?raw=true) | MobileSAM performs on par with the original SAM (at least visually) and keeps exactly the same pipeline as the original SAM except for a change on the image encoder. Specifically, we replace the original heavyweight ViT-H encoder (632M) with a much smaller Tiny-ViT (5M). On a single GPU, MobileSAM runs around 12ms per image: 8ms on the image encoder and 4ms on the mask decoder. | [[Github](https://github.com/ChaoningZhang/MobileSAM)] |
|
36 |
+
| [Light-HQSAM](https://arxiv.org/pdf/2306.01567.pdf) | ![](https://github.com/SysCV/sam-hq/blob/main/figs/sam-hf-framework.png?raw=true) | Light HQ-SAM is based on the tiny vit image encoder provided by MobileSAM. We design a learnable High-Quality Output Token, which is injected into SAM's mask decoder and is responsible for predicting the high-quality mask. Instead of only applying it on mask-decoder features, we first fuse them with ViT features for improved mask details. Refer to [Light HQ-SAM vs. MobileSAM](https://github.com/SysCV/sam-hq#light-hq-sam-vs-mobilesam-on-coco) for more details. | [[Github](https://github.com/SysCV/sam-hq)] |
|
37 |
+
| [Efficient-SAM](https://github.com/yformer/EfficientSAM) | ![](https://yformer.github.io/efficient-sam/EfficientSAM_files/overview.png) | Segment Anything Model (SAM) has emerged as a powerful tool for numerous vision applications. However, the huge computation cost of the SAM model has limited its use in wider real-world applications. To address this limitation, we propose EfficientSAMs, light-weight SAM models that exhibit decent performance with largely reduced complexity. Our idea is based on leveraging masked image pretraining, SAMI, which learns to reconstruct features from the SAM image encoder for effective visual representation learning. Further, we take the SAMI-pretrained light-weight image encoders and mask decoder to build EfficientSAMs, and finetune the models on SA-1B for the segment anything task. Refer to [EfficientSAM arXiv](https://arxiv.org/pdf/2312.00863.pdf) for more details. | [[Github](https://github.com/yformer/EfficientSAM)] |
|
38 |
+
| [Edge-SAM](https://github.com/chongzhou96/EdgeSAM) | ![](https://www.mmlab-ntu.com/project/edgesam/img/arch.png) | EdgeSAM involves distilling the original ViT-based SAM image encoder into a purely CNN-based architecture, better suited for edge devices. We carefully benchmark various distillation strategies and demonstrate that task-agnostic encoder distillation fails to capture the full knowledge embodied in SAM. Refer to [Edge-SAM arXiv](https://arxiv.org/abs/2312.06660) for more details. | [[Github](https://github.com/chongzhou96/EdgeSAM)] |
|
39 |
+
| [RepViT-SAM](https://github.com/THU-MIG/RepViT/tree/main/sam) | ![](https://jameslahm.github.io/repvit-sam/static/images/edge.png) | Recently, RepViT achieves the state-of-the-art performance and latency trade-off on mobile devices by incorporating efficient architectural designs of ViTs into CNNs. Here, to achieve real-time segmenting anything on mobile devices, following MobileSAM, we replace the heavyweight image encoder in SAM with RepViT model, ending up with the RepViT-SAM model. Extensive experiments show that RepViT-SAM can enjoy significantly better zero-shot transfer capability than MobileSAM, along with nearly 10× faster inference speed. Refer to [RepViT-SAM arXiv](https://arxiv.org/pdf/2312.05760.pdf) for more details. | [[Github](https://github.com/THU-MIG/RepViT)] |
|
40 |
+
|
41 |
+
</div>
|
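
All of the demo scripts below share the same flow: detect boxes with Grounding-DINO from a text prompt, filter them with NMS, prompt the chosen efficient SAM variant with the surviving boxes, and draw the results. The snippet below is a minimal sketch of that shared flow; the image path, class list, and the `build_my_efficient_sam()` builder are placeholders, so substitute the builder and checkpoint of whichever variant you want to try.

```python
import cv2
import numpy as np
import torch
import torchvision
import supervision as sv

from groundingdino.util.inference import Model
from segment_anything import SamPredictor

# 1. Detect boxes with Grounding-DINO from a text prompt
grounding_dino_model = Model(
    model_config_path="GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py",
    model_checkpoint_path="./groundingdino_swint_ogc.pth",
)
image = cv2.imread("./assets/demo2.jpg")  # placeholder image
detections = grounding_dino_model.predict_with_classes(
    image=image, classes=["dog"], box_threshold=0.25, text_threshold=0.25
)

# 2. Remove duplicate boxes with NMS
keep = torchvision.ops.nms(
    torch.from_numpy(detections.xyxy),
    torch.from_numpy(detections.confidence),
    0.8,
).numpy().tolist()
detections.xyxy = detections.xyxy[keep]
detections.confidence = detections.confidence[keep]
detections.class_id = detections.class_id[keep]

# 3. Prompt the chosen efficient SAM variant with the detected boxes
sam_predictor = SamPredictor(build_my_efficient_sam())  # placeholder builder
sam_predictor.set_image(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
masks = [
    sam_predictor.predict(box=box, multimask_output=False)[0][0]
    for box in detections.xyxy
]
detections.mask = np.array(masks)

# 4. Visualize boxes and masks with supervision
annotated = sv.MaskAnnotator().annotate(scene=image.copy(), detections=detections)
annotated = sv.BoxAnnotator().annotate(scene=annotated, detections=detections)
cv2.imwrite("annotated.jpg", annotated)
```

Each demo script in `EfficientSAM/` is essentially this flow with the specific builder and checkpoint for its SAM variant.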
42 |
+
|
43 |
+
|
44 |
+
### Run Grounded-FastSAM Demo
|
45 |
+
|
46 |
+
- Firstly, download the pretrained Fast-SAM weight [here](https://github.com/CASIA-IVA-Lab/FastSAM#model-checkpoints)
|
47 |
+
|
48 |
+
- Run the demo with the following script:
|
49 |
+
|
50 |
+
```bash
|
51 |
+
cd Grounded-Segment-Anything
|
52 |
+
|
53 |
+
python EfficientSAM/grounded_fast_sam.py --model_path "./FastSAM-x.pt" --img_path "assets/demo4.jpg" --text "the black dog." --output "./output/"
|
54 |
+
```
|
55 |
+
|
56 |
+
- And the results will be saved in `./output/` as:
|
57 |
+
|
58 |
+
<div style="text-align: center">
|
59 |
+
|
60 |
+
| Input | Text | Output |
|
61 |
+
|:---:|:---:|:---:|
|
62 |
+
|![](/assets/demo4.jpg) | "The black dog." | ![](https://github.com/IDEA-Research/detrex-storage/blob/main/assets/grounded_sam/fast_sam/demo4_0_caption_the%20black%20dog.jpg?raw=true) |
|
63 |
+
|
64 |
+
</div>
|
65 |
+
|
66 |
+
|
67 |
+
**Note**: Due to FastSAM's post-processing, only one box can be annotated at a time. If there are multiple box prompts, we simply save multiple annotated images to `./output` for now; this will be improved in a future release.
|
68 |
+
|
69 |
+
|
70 |
+
### Run Grounded-MobileSAM Demo
|
71 |
+
|
72 |
+
- Firstly, download the pretrained MobileSAM weight [here](https://github.com/ChaoningZhang/MobileSAM/tree/master/weights)
|
73 |
+
|
74 |
+
- Run the demo with the following script:
|
75 |
+
|
76 |
+
```bash
|
77 |
+
cd Grounded-Segment-Anything
|
78 |
+
|
79 |
+
python EfficientSAM/grounded_mobile_sam.py --MOBILE_SAM_CHECKPOINT_PATH "./EfficientSAM/mobile_sam.pt" --SOURCE_IMAGE_PATH "./assets/demo2.jpg" --CAPTION "the running dog"
|
80 |
+
```
|
81 |
+
|
82 |
+
- And the result will be saved as `./grounded_mobile_sam_annotated_image.jpg`:
|
83 |
+
|
84 |
+
<div style="text-align: center">
|
85 |
+
|
86 |
+
| Input | Text | Output |
|
87 |
+
|:---:|:---:|:---:|
|
88 |
+
|![](/assets/demo2.jpg) | "the running dog" | ![](https://github.com/IDEA-Research/detrex-storage/blob/main/assets/grounded_sam/mobile_sam/grounded_mobile_sam_annotated_image.jpg?raw=true) |
|
89 |
+
|
90 |
+
</div>
|
91 |
+
|
92 |
+
|
93 |
+
### Run Grounded-Light-HQSAM Demo
|
94 |
+
|
95 |
+
- Firstly, download the pretrained Light-HQSAM weight [here](https://github.com/SysCV/sam-hq#model-checkpoints)
|
96 |
+
|
97 |
+
- Run the demo with the following script:
|
98 |
+
|
99 |
+
```bash
|
100 |
+
cd Grounded-Segment-Anything
|
101 |
+
|
102 |
+
python EfficientSAM/grounded_light_hqsam.py
|
103 |
+
```
|
104 |
+
|
105 |
+
- And the result will be saved as `./EfficientSAM/LightHQSAM/grounded_light_hqsam_annotated_image.jpg`:
|
106 |
+
|
107 |
+
<div style="text-align: center">
|
108 |
+
|
109 |
+
| Input | Text | Output |
|
110 |
+
|:---:|:---:|:---:|
|
111 |
+
|![](/EfficientSAM/LightHQSAM/example_light_hqsam.png) | "bench" | ![](/EfficientSAM/LightHQSAM/grounded_light_hqsam_annotated_image.jpg) |
|
112 |
+
|
113 |
+
</div>
|
114 |
+
|
115 |
+
|
116 |
+
### Run Grounded-Efficient-SAM Demo
|
117 |
+
|
118 |
+
- Download the pretrained EfficientSAM checkpoint from [here](https://github.com/yformer/EfficientSAM#model) and put it under `Grounded-Segment-Anything/EfficientSAM`
|
119 |
+
|
120 |
+
- Run the demo with the following script:
|
121 |
+
|
122 |
+
```bash
|
123 |
+
cd Grounded-Segment-Anything
|
124 |
+
|
125 |
+
python EfficientSAM/grounded_efficient_sam.py
|
126 |
+
```
|
127 |
+
|
128 |
+
- And the result will be saved as `./EfficientSAM/grounded_efficient_sam_annotated_image.jpg`:
|
129 |
+
|
130 |
+
<div style="text-align: center">
|
131 |
+
|
132 |
+
| Input | Text | Output |
|
133 |
+
|:---:|:---:|:---:|
|
134 |
+
|![](/EfficientSAM/LightHQSAM/example_light_hqsam.png) | "bench" | ![](https://github.com/IDEA-Research/detrex-storage/blob/main/assets/grounded_sam/efficient_sam/grounded_efficient_sam_annotated_image.jpg?raw=true) |
|
135 |
+
|
136 |
+
</div>
|
137 |
+
|
138 |
+
|
139 |
+
### Run Grounded-Edge-SAM Demo
|
140 |
+
|
141 |
+
- Download the pretrained [Edge-SAM](https://github.com/chongzhou96/EdgeSAM) checkpoint following the [official instructions](https://github.com/chongzhou96/EdgeSAM?tab=readme-ov-file#usage-):
|
142 |
+
|
143 |
+
```bash
|
144 |
+
cd Grounded-Segment-Anything
|
145 |
+
wget -P EfficientSAM/ https://huggingface.co/spaces/chongzhou/EdgeSAM/resolve/main/weights/edge_sam.pth
|
146 |
+
wget -P EfficientSAM/ https://huggingface.co/spaces/chongzhou/EdgeSAM/resolve/main/weights/edge_sam_3x.pth
|
147 |
+
```
|
148 |
+
|
149 |
+
- Run the demo with the following script:
|
150 |
+
|
151 |
+
```bash
|
152 |
+
cd Grounded-Segment-Anything
|
153 |
+
|
154 |
+
python EfficientSAM/grounded_edge_sam.py
|
155 |
+
```
|
156 |
+
|
157 |
+
- And the result will be saved as `./EfficientSAM/grounded_edge_sam_annotated_image.jpg`:
|
158 |
+
|
159 |
+
<div style="text-align: center">
|
160 |
+
|
161 |
+
| Input | Text | Output |
|
162 |
+
|:---:|:---:|:---:|
|
163 |
+
|![](/EfficientSAM/LightHQSAM/example_light_hqsam.png) | "bench" | ![](https://github.com/IDEA-Research/detrex-storage/blob/main/assets/grounded_sam/edge_sam/grounded_edge_sam_annotated_image.jpg?raw=true) |
|
164 |
+
|
165 |
+
</div>
|
166 |
+
|
167 |
+
### Run Grounded-RepViT-SAM Demo
|
168 |
+
|
169 |
+
- Download the pretrained [RepViT-SAM](https://github.com/THU-MIG/RepViT) checkpoint following the [official instructions](https://github.com/THU-MIG/RepViT/tree/main/sam#installation):
|
170 |
+
|
171 |
+
```bash
|
172 |
+
cd Grounded-Segment-Anything
|
173 |
+
wget -P EfficientSAM/ https://github.com/THU-MIG/RepViT/releases/download/v1.0/repvit_sam.pt
|
174 |
+
```
|
175 |
+
|
176 |
+
- Run the demo with the following script:
|
177 |
+
|
178 |
+
```bash
|
179 |
+
cd Grounded-Segment-Anything
|
180 |
+
|
181 |
+
python EfficientSAM/grounded_repvit_sam.py
|
182 |
+
```
|
183 |
+
|
184 |
+
- And the result will be saved as `./EfficientSAM/grounded_repvit_sam_annotated_image.jpg`:
|
185 |
+
|
186 |
+
<div style="text-align: center">
|
187 |
+
|
188 |
+
| Input | Text | Output |
|
189 |
+
|:---:|:---:|:---:|
|
190 |
+
|![](/EfficientSAM/LightHQSAM/example_light_hqsam.png) | "bench" | ![](https://github.com/IDEA-Research/detrex-storage/blob/main/assets/grounded_sam/repvit_sam/grounded_repvit_sam_annotated_image.jpg?raw=true) |
|
191 |
+
|
192 |
+
</div>
|
193 |
+
|
194 |
+
|
EfficientSAM/RepViTSAM/repvit.py
ADDED
@@ -0,0 +1,364 @@
1 |
+
import torch.nn as nn
|
2 |
+
|
3 |
+
|
4 |
+
__all__ = ['repvit_m1']
|
5 |
+
|
6 |
+
|
7 |
+
def _make_divisible(v, divisor, min_value=None):
|
8 |
+
"""
|
9 |
+
This function is taken from the original tf repo.
|
10 |
+
It ensures that all layers have a channel number that is divisible by 8
|
11 |
+
It can be seen here:
|
12 |
+
https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
|
13 |
+
:param v:
|
14 |
+
:param divisor:
|
15 |
+
:param min_value:
|
16 |
+
:return:
|
17 |
+
"""
|
18 |
+
if min_value is None:
|
19 |
+
min_value = divisor
|
20 |
+
new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
|
21 |
+
# Make sure that round down does not go down by more than 10%.
|
22 |
+
if new_v < 0.9 * v:
|
23 |
+
new_v += divisor
|
24 |
+
return new_v
|
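# e.g. _make_divisible(30, 8) == 32 and _make_divisible(100, 8) == 104:
# channel counts are rounded to a multiple of the divisor without dropping
# more than 10% of the requested width.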
25 |
+
|
26 |
+
from timm.models.layers import SqueezeExcite
|
27 |
+
|
28 |
+
import torch
|
29 |
+
|
30 |
+
# From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa
|
31 |
+
# Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa
|
32 |
+
class LayerNorm2d(nn.Module):
|
33 |
+
def __init__(self, num_channels: int, eps: float = 1e-6) -> None:
|
34 |
+
super().__init__()
|
35 |
+
self.weight = nn.Parameter(torch.ones(num_channels))
|
36 |
+
self.bias = nn.Parameter(torch.zeros(num_channels))
|
37 |
+
self.eps = eps
|
38 |
+
|
39 |
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
40 |
+
u = x.mean(1, keepdim=True)
|
41 |
+
s = (x - u).pow(2).mean(1, keepdim=True)
|
42 |
+
x = (x - u) / torch.sqrt(s + self.eps)
|
43 |
+
x = self.weight[:, None, None] * x + self.bias[:, None, None]
|
44 |
+
return x
|
45 |
+
|
46 |
+
class Conv2d_BN(torch.nn.Sequential):
|
47 |
+
def __init__(self, a, b, ks=1, stride=1, pad=0, dilation=1,
|
48 |
+
groups=1, bn_weight_init=1, resolution=-10000):
|
49 |
+
super().__init__()
|
50 |
+
self.add_module('c', torch.nn.Conv2d(
|
51 |
+
a, b, ks, stride, pad, dilation, groups, bias=False))
|
52 |
+
self.add_module('bn', torch.nn.BatchNorm2d(b))
|
53 |
+
torch.nn.init.constant_(self.bn.weight, bn_weight_init)
|
54 |
+
torch.nn.init.constant_(self.bn.bias, 0)
|
55 |
+
|
56 |
+
@torch.no_grad()
|
57 |
+
def fuse(self):
|
58 |
+
c, bn = self._modules.values()
|
59 |
+
w = bn.weight / (bn.running_var + bn.eps)**0.5
|
60 |
+
w = c.weight * w[:, None, None, None]
|
61 |
+
b = bn.bias - bn.running_mean * bn.weight / \
|
62 |
+
(bn.running_var + bn.eps)**0.5
|
63 |
+
m = torch.nn.Conv2d(w.size(1) * self.c.groups, w.size(
|
64 |
+
0), w.shape[2:], stride=self.c.stride, padding=self.c.padding, dilation=self.c.dilation, groups=self.c.groups,
|
65 |
+
device=c.weight.device)
|
66 |
+
m.weight.data.copy_(w)
|
67 |
+
m.bias.data.copy_(b)
|
68 |
+
return m
|
69 |
+
|
70 |
+
class Residual(torch.nn.Module):
|
71 |
+
def __init__(self, m, drop=0.):
|
72 |
+
super().__init__()
|
73 |
+
self.m = m
|
74 |
+
self.drop = drop
|
75 |
+
|
76 |
+
def forward(self, x):
|
77 |
+
if self.training and self.drop > 0:
|
78 |
+
return x + self.m(x) * torch.rand(x.size(0), 1, 1, 1,
|
79 |
+
device=x.device).ge_(self.drop).div(1 - self.drop).detach()
|
80 |
+
else:
|
81 |
+
return x + self.m(x)
|
82 |
+
|
83 |
+
@torch.no_grad()
|
84 |
+
def fuse(self):
|
85 |
+
if isinstance(self.m, Conv2d_BN):
|
86 |
+
m = self.m.fuse()
|
87 |
+
assert(m.groups == m.in_channels)
|
88 |
+
identity = torch.ones(m.weight.shape[0], m.weight.shape[1], 1, 1)
|
89 |
+
identity = torch.nn.functional.pad(identity, [1,1,1,1])
|
90 |
+
m.weight += identity.to(m.weight.device)
|
91 |
+
return m
|
92 |
+
elif isinstance(self.m, torch.nn.Conv2d):
|
93 |
+
m = self.m
|
94 |
+
assert(m.groups != m.in_channels)
|
95 |
+
identity = torch.ones(m.weight.shape[0], m.weight.shape[1], 1, 1)
|
96 |
+
identity = torch.nn.functional.pad(identity, [1,1,1,1])
|
97 |
+
m.weight += identity.to(m.weight.device)
|
98 |
+
return m
|
99 |
+
else:
|
100 |
+
return self
|
101 |
+
|
102 |
+
|
103 |
+
class RepVGGDW(torch.nn.Module):
|
104 |
+
def __init__(self, ed) -> None:
|
105 |
+
super().__init__()
|
106 |
+
self.conv = Conv2d_BN(ed, ed, 3, 1, 1, groups=ed)
|
107 |
+
self.conv1 = torch.nn.Conv2d(ed, ed, 1, 1, 0, groups=ed)
|
108 |
+
self.dim = ed
|
109 |
+
self.bn = torch.nn.BatchNorm2d(ed)
|
110 |
+
|
111 |
+
def forward(self, x):
|
112 |
+
return self.bn((self.conv(x) + self.conv1(x)) + x)
|
113 |
+
|
114 |
+
@torch.no_grad()
|
115 |
+
def fuse(self):
|
116 |
+
conv = self.conv.fuse()
|
117 |
+
conv1 = self.conv1
|
118 |
+
|
119 |
+
conv_w = conv.weight
|
120 |
+
conv_b = conv.bias
|
121 |
+
conv1_w = conv1.weight
|
122 |
+
conv1_b = conv1.bias
|
123 |
+
|
124 |
+
conv1_w = torch.nn.functional.pad(conv1_w, [1,1,1,1])
|
125 |
+
|
126 |
+
identity = torch.nn.functional.pad(torch.ones(conv1_w.shape[0], conv1_w.shape[1], 1, 1, device=conv1_w.device), [1,1,1,1])
|
127 |
+
|
128 |
+
final_conv_w = conv_w + conv1_w + identity
|
129 |
+
final_conv_b = conv_b + conv1_b
|
130 |
+
|
131 |
+
conv.weight.data.copy_(final_conv_w)
|
132 |
+
conv.bias.data.copy_(final_conv_b)
|
133 |
+
|
134 |
+
bn = self.bn
|
135 |
+
w = bn.weight / (bn.running_var + bn.eps)**0.5
|
136 |
+
w = conv.weight * w[:, None, None, None]
|
137 |
+
b = bn.bias + (conv.bias - bn.running_mean) * bn.weight / \
|
138 |
+
(bn.running_var + bn.eps)**0.5
|
139 |
+
conv.weight.data.copy_(w)
|
140 |
+
conv.bias.data.copy_(b)
|
141 |
+
return conv
|
142 |
+
|
143 |
+
|
144 |
+
class RepViTBlock(nn.Module):
|
145 |
+
def __init__(self, inp, hidden_dim, oup, kernel_size, stride, use_se, use_hs):
|
146 |
+
super(RepViTBlock, self).__init__()
|
147 |
+
assert stride in [1, 2]
|
148 |
+
|
149 |
+
self.identity = stride == 1 and inp == oup
|
150 |
+
assert(hidden_dim == 2 * inp)
|
151 |
+
|
152 |
+
if stride == 2:
|
153 |
+
self.token_mixer = nn.Sequential(
|
154 |
+
Conv2d_BN(inp, inp, kernel_size, stride if inp != 320 else 1, (kernel_size - 1) // 2, groups=inp),
|
155 |
+
SqueezeExcite(inp, 0.25) if use_se else nn.Identity(),
|
156 |
+
Conv2d_BN(inp, oup, ks=1, stride=1, pad=0)
|
157 |
+
)
|
158 |
+
self.channel_mixer = Residual(nn.Sequential(
|
159 |
+
# pw
|
160 |
+
Conv2d_BN(oup, 2 * oup, 1, 1, 0),
|
161 |
+
nn.GELU() if use_hs else nn.GELU(),
|
162 |
+
# pw-linear
|
163 |
+
Conv2d_BN(2 * oup, oup, 1, 1, 0, bn_weight_init=0),
|
164 |
+
))
|
165 |
+
else:
|
166 |
+
# assert(self.identity)
|
167 |
+
self.token_mixer = nn.Sequential(
|
168 |
+
RepVGGDW(inp),
|
169 |
+
SqueezeExcite(inp, 0.25) if use_se else nn.Identity(),
|
170 |
+
)
|
171 |
+
if self.identity:
|
172 |
+
self.channel_mixer = Residual(nn.Sequential(
|
173 |
+
# pw
|
174 |
+
Conv2d_BN(inp, hidden_dim, 1, 1, 0),
|
175 |
+
nn.GELU() if use_hs else nn.GELU(),
|
176 |
+
# pw-linear
|
177 |
+
Conv2d_BN(hidden_dim, oup, 1, 1, 0, bn_weight_init=0),
|
178 |
+
))
|
179 |
+
else:
|
180 |
+
self.channel_mixer = nn.Sequential(
|
181 |
+
# pw
|
182 |
+
Conv2d_BN(inp, hidden_dim, 1, 1, 0),
|
183 |
+
nn.GELU() if use_hs else nn.GELU(),
|
184 |
+
# pw-linear
|
185 |
+
Conv2d_BN(hidden_dim, oup, 1, 1, 0, bn_weight_init=0),
|
186 |
+
)
|
187 |
+
|
188 |
+
def forward(self, x):
|
189 |
+
return self.channel_mixer(self.token_mixer(x))
|
190 |
+
|
191 |
+
from timm.models.vision_transformer import trunc_normal_
|
192 |
+
class BN_Linear(torch.nn.Sequential):
|
193 |
+
def __init__(self, a, b, bias=True, std=0.02):
|
194 |
+
super().__init__()
|
195 |
+
self.add_module('bn', torch.nn.BatchNorm1d(a))
|
196 |
+
self.add_module('l', torch.nn.Linear(a, b, bias=bias))
|
197 |
+
trunc_normal_(self.l.weight, std=std)
|
198 |
+
if bias:
|
199 |
+
torch.nn.init.constant_(self.l.bias, 0)
|
200 |
+
|
201 |
+
@torch.no_grad()
|
202 |
+
def fuse(self):
|
203 |
+
bn, l = self._modules.values()
|
204 |
+
w = bn.weight / (bn.running_var + bn.eps)**0.5
|
205 |
+
b = bn.bias - self.bn.running_mean * \
|
206 |
+
self.bn.weight / (bn.running_var + bn.eps)**0.5
|
207 |
+
w = l.weight * w[None, :]
|
208 |
+
if l.bias is None:
|
209 |
+
b = b @ self.l.weight.T
|
210 |
+
else:
|
211 |
+
b = (l.weight @ b[:, None]).view(-1) + self.l.bias
|
212 |
+
m = torch.nn.Linear(w.size(1), w.size(0), device=l.weight.device)
|
213 |
+
m.weight.data.copy_(w)
|
214 |
+
m.bias.data.copy_(b)
|
215 |
+
return m
|
216 |
+
|
217 |
+
class Classfier(nn.Module):
|
218 |
+
def __init__(self, dim, num_classes, distillation=True):
|
219 |
+
super().__init__()
|
220 |
+
self.classifier = BN_Linear(dim, num_classes) if num_classes > 0 else torch.nn.Identity()
|
221 |
+
self.distillation = distillation
|
222 |
+
if distillation:
|
223 |
+
self.classifier_dist = BN_Linear(dim, num_classes) if num_classes > 0 else torch.nn.Identity()
|
224 |
+
|
225 |
+
def forward(self, x):
|
226 |
+
if self.distillation:
|
227 |
+
x = self.classifier(x), self.classifier_dist(x)
|
228 |
+
if not self.training:
|
229 |
+
x = (x[0] + x[1]) / 2
|
230 |
+
else:
|
231 |
+
x = self.classifier(x)
|
232 |
+
return x
|
233 |
+
|
234 |
+
@torch.no_grad()
|
235 |
+
def fuse(self):
|
236 |
+
classifier = self.classifier.fuse()
|
237 |
+
if self.distillation:
|
238 |
+
classifier_dist = self.classifier_dist.fuse()
|
239 |
+
classifier.weight += classifier_dist.weight
|
240 |
+
classifier.bias += classifier_dist.bias
|
241 |
+
classifier.weight /= 2
|
242 |
+
classifier.bias /= 2
|
243 |
+
return classifier
|
244 |
+
else:
|
245 |
+
return classifier
|
246 |
+
|
247 |
+
class RepViT(nn.Module):
|
248 |
+
def __init__(self, cfgs, num_classes=1000, distillation=False, img_size=1024):
|
249 |
+
super(RepViT, self).__init__()
|
250 |
+
# setting of inverted residual blocks
|
251 |
+
self.cfgs = cfgs
|
252 |
+
|
253 |
+
self.img_size = img_size
|
254 |
+
|
255 |
+
# building first layer
|
256 |
+
input_channel = self.cfgs[0][2]
|
257 |
+
patch_embed = torch.nn.Sequential(Conv2d_BN(3, input_channel // 2, 3, 2, 1), torch.nn.GELU(),
|
258 |
+
Conv2d_BN(input_channel // 2, input_channel, 3, 2, 1))
|
259 |
+
layers = [patch_embed]
|
260 |
+
# building inverted residual blocks
|
261 |
+
block = RepViTBlock
|
262 |
+
for k, t, c, use_se, use_hs, s in self.cfgs:
|
263 |
+
output_channel = _make_divisible(c, 8)
|
264 |
+
exp_size = _make_divisible(input_channel * t, 8)
|
265 |
+
layers.append(block(input_channel, exp_size, output_channel, k, s, use_se, use_hs))
|
266 |
+
input_channel = output_channel
|
267 |
+
self.features = nn.ModuleList(layers)
|
268 |
+
# self.classifier = Classfier(output_channel, num_classes, distillation)
|
269 |
+
|
270 |
+
self.neck = nn.Sequential(
|
271 |
+
nn.Conv2d(
|
272 |
+
output_channel,
|
273 |
+
256,
|
274 |
+
kernel_size=1,
|
275 |
+
bias=False,
|
276 |
+
),
|
277 |
+
LayerNorm2d(256),
|
278 |
+
nn.Conv2d(
|
279 |
+
256,
|
280 |
+
256,
|
281 |
+
kernel_size=3,
|
282 |
+
padding=1,
|
283 |
+
bias=False,
|
284 |
+
),
|
285 |
+
LayerNorm2d(256),
|
286 |
+
)
|
287 |
+
|
288 |
+
def forward(self, x):
|
289 |
+
# x = self.features(x)
|
290 |
+
for f in self.features:
|
291 |
+
x = f(x)
|
292 |
+
# x = torch.nn.functional.adaptive_avg_pool2d(x, 1).flatten(1)
|
293 |
+
x = self.neck(x)
|
294 |
+
return x, None
|
295 |
+
|
296 |
+
from timm.models import register_model
|
297 |
+
|
298 |
+
@register_model
|
299 |
+
def repvit(pretrained=False, num_classes = 1000, distillation=False, **kwargs):
|
300 |
+
"""
|
301 |
+
Constructs a RepViT model (used here as the RepViT-SAM image encoder)
|
302 |
+
"""
|
303 |
+
cfgs = [
|
304 |
+
# k, t, c, SE, HS, s
|
305 |
+
[3, 2, 80, 1, 0, 1],
|
306 |
+
[3, 2, 80, 0, 0, 1],
|
307 |
+
[3, 2, 80, 1, 0, 1],
|
308 |
+
[3, 2, 80, 0, 0, 1],
|
309 |
+
[3, 2, 80, 1, 0, 1],
|
310 |
+
[3, 2, 80, 0, 0, 1],
|
311 |
+
[3, 2, 80, 0, 0, 1],
|
312 |
+
[3, 2, 160, 0, 0, 2],
|
313 |
+
[3, 2, 160, 1, 0, 1],
|
314 |
+
[3, 2, 160, 0, 0, 1],
|
315 |
+
[3, 2, 160, 1, 0, 1],
|
316 |
+
[3, 2, 160, 0, 0, 1],
|
317 |
+
[3, 2, 160, 1, 0, 1],
|
318 |
+
[3, 2, 160, 0, 0, 1],
|
319 |
+
[3, 2, 160, 0, 0, 1],
|
320 |
+
[3, 2, 320, 0, 1, 2],
|
321 |
+
[3, 2, 320, 1, 1, 1],
|
322 |
+
[3, 2, 320, 0, 1, 1],
|
323 |
+
[3, 2, 320, 1, 1, 1],
|
324 |
+
[3, 2, 320, 0, 1, 1],
|
325 |
+
[3, 2, 320, 1, 1, 1],
|
326 |
+
[3, 2, 320, 0, 1, 1],
|
327 |
+
[3, 2, 320, 1, 1, 1],
|
328 |
+
[3, 2, 320, 0, 1, 1],
|
329 |
+
[3, 2, 320, 1, 1, 1],
|
330 |
+
[3, 2, 320, 0, 1, 1],
|
331 |
+
[3, 2, 320, 1, 1, 1],
|
332 |
+
[3, 2, 320, 0, 1, 1],
|
333 |
+
[3, 2, 320, 1, 1, 1],
|
334 |
+
[3, 2, 320, 0, 1, 1],
|
335 |
+
[3, 2, 320, 1, 1, 1],
|
336 |
+
[3, 2, 320, 0, 1, 1],
|
337 |
+
[3, 2, 320, 1, 1, 1],
|
338 |
+
[3, 2, 320, 0, 1, 1],
|
339 |
+
[3, 2, 320, 1, 1, 1],
|
340 |
+
[3, 2, 320, 0, 1, 1],
|
341 |
+
[3, 2, 320, 1, 1, 1],
|
342 |
+
[3, 2, 320, 0, 1, 1],
|
343 |
+
[3, 2, 320, 1, 1, 1],
|
344 |
+
[3, 2, 320, 0, 1, 1],
|
345 |
+
[3, 2, 320, 1, 1, 1],
|
346 |
+
[3, 2, 320, 0, 1, 1],
|
347 |
+
[3, 2, 320, 1, 1, 1],
|
348 |
+
[3, 2, 320, 0, 1, 1],
|
349 |
+
[3, 2, 320, 1, 1, 1],
|
350 |
+
[3, 2, 320, 0, 1, 1],
|
351 |
+
[3, 2, 320, 1, 1, 1],
|
352 |
+
[3, 2, 320, 0, 1, 1],
|
353 |
+
[3, 2, 320, 1, 1, 1],
|
354 |
+
[3, 2, 320, 0, 1, 1],
|
355 |
+
# [3, 2, 320, 1, 1, 1],
|
356 |
+
# [3, 2, 320, 0, 1, 1],
|
357 |
+
[3, 2, 320, 0, 1, 1],
|
358 |
+
[3, 2, 640, 0, 1, 2],
|
359 |
+
[3, 2, 640, 1, 1, 1],
|
360 |
+
[3, 2, 640, 0, 1, 1],
|
361 |
+
# [3, 2, 640, 1, 1, 1],
|
362 |
+
# [3, 2, 640, 0, 1, 1]
|
363 |
+
]
|
364 |
+
return RepViT(cfgs, num_classes=num_classes, distillation=distillation)
|
EfficientSAM/RepViTSAM/setup_repvit_sam.py
ADDED
@@ -0,0 +1,53 @@
1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2 |
+
# All rights reserved.
|
3 |
+
|
4 |
+
# This source code is licensed under the license found in the
|
5 |
+
# LICENSE file in the root directory of this source tree.
|
6 |
+
|
7 |
+
import torch
|
8 |
+
from functools import partial
|
9 |
+
from segment_anything.modeling import ImageEncoderViT, MaskDecoder, PromptEncoder, Sam, TwoWayTransformer
|
10 |
+
from RepViTSAM import repvit
|
11 |
+
from timm.models import create_model
|
12 |
+
|
13 |
+
def build_sam_repvit(checkpoint=None):
|
14 |
+
prompt_embed_dim = 256
|
15 |
+
image_size = 1024
|
16 |
+
vit_patch_size = 16
|
17 |
+
image_embedding_size = image_size // vit_patch_size
|
18 |
+
repvit_sam = Sam(
|
19 |
+
image_encoder=create_model('repvit'),
|
20 |
+
prompt_encoder=PromptEncoder(
|
21 |
+
embed_dim=prompt_embed_dim,
|
22 |
+
image_embedding_size=(image_embedding_size, image_embedding_size),
|
23 |
+
input_image_size=(image_size, image_size),
|
24 |
+
mask_in_chans=16,
|
25 |
+
),
|
26 |
+
mask_decoder=MaskDecoder(
|
27 |
+
num_multimask_outputs=3,
|
28 |
+
transformer=TwoWayTransformer(
|
29 |
+
depth=2,
|
30 |
+
embedding_dim=prompt_embed_dim,
|
31 |
+
mlp_dim=2048,
|
32 |
+
num_heads=8,
|
33 |
+
),
|
34 |
+
transformer_dim=prompt_embed_dim,
|
35 |
+
iou_head_depth=3,
|
36 |
+
iou_head_hidden_dim=256,
|
37 |
+
),
|
38 |
+
pixel_mean=[123.675, 116.28, 103.53],
|
39 |
+
pixel_std=[58.395, 57.12, 57.375],
|
40 |
+
)
|
41 |
+
|
42 |
+
repvit_sam.eval()
|
43 |
+
if checkpoint is not None:
|
44 |
+
with open(checkpoint, "rb") as f:
|
45 |
+
state_dict = torch.load(f)
|
46 |
+
repvit_sam.load_state_dict(state_dict)
|
47 |
+
return repvit_sam
|
48 |
+
|
49 |
+
from functools import partial
|
50 |
+
|
51 |
+
sam_model_registry = {
|
52 |
+
"repvit": partial(build_sam_repvit),
|
53 |
+
}
|
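# Illustrative usage (mirroring grounded_repvit_sam.py in this repo):
#   sam = sam_model_registry["repvit"](checkpoint="./EfficientSAM/repvit_sam.pt")
#   predictor = SamPredictor(sam)  # SamPredictor from segment_anything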
EfficientSAM/grounded_edge_sam.py
ADDED
@@ -0,0 +1,107 @@
1 |
+
import cv2
|
2 |
+
import numpy as np
|
3 |
+
import supervision as sv
|
4 |
+
|
5 |
+
import torch
|
6 |
+
import torchvision
|
7 |
+
|
8 |
+
from groundingdino.util.inference import Model
|
9 |
+
from segment_anything import SamPredictor
|
10 |
+
from EdgeSAM.setup_edge_sam import build_edge_sam
|
11 |
+
|
12 |
+
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
13 |
+
|
14 |
+
# GroundingDINO config and checkpoint
|
15 |
+
GROUNDING_DINO_CONFIG_PATH = "GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py"
|
16 |
+
GROUNDING_DINO_CHECKPOINT_PATH = "./groundingdino_swint_ogc.pth"
|
17 |
+
|
18 |
+
# Building GroundingDINO inference model
|
19 |
+
grounding_dino_model = Model(model_config_path=GROUNDING_DINO_CONFIG_PATH, model_checkpoint_path=GROUNDING_DINO_CHECKPOINT_PATH)
|
20 |
+
|
21 |
+
# Building EdgeSAM predictor
|
22 |
+
EdgeSAM_CHECKPOINT_PATH = "./EfficientSAM/edge_sam_3x.pth"
|
23 |
+
edge_sam = build_edge_sam(checkpoint=EdgeSAM_CHECKPOINT_PATH)
|
24 |
+
edge_sam.to(device=DEVICE)
|
25 |
+
|
26 |
+
sam_predictor = SamPredictor(edge_sam)
|
27 |
+
|
28 |
+
|
29 |
+
# Predict classes and hyper-param for GroundingDINO
|
30 |
+
SOURCE_IMAGE_PATH = "./EfficientSAM/LightHQSAM/example_light_hqsam.png"
|
31 |
+
CLASSES = ["bench"]
|
32 |
+
BOX_THRESHOLD = 0.25
|
33 |
+
TEXT_THRESHOLD = 0.25
|
34 |
+
NMS_THRESHOLD = 0.8
|
35 |
+
|
36 |
+
|
37 |
+
# load image
|
38 |
+
image = cv2.imread(SOURCE_IMAGE_PATH)
|
39 |
+
|
40 |
+
# detect objects
|
41 |
+
detections = grounding_dino_model.predict_with_classes(
|
42 |
+
image=image,
|
43 |
+
classes=CLASSES,
|
44 |
+
box_threshold=BOX_THRESHOLD,
|
45 |
+
text_threshold=TEXT_THRESHOLD
|
46 |
+
)
|
47 |
+
|
48 |
+
# annotate image with detections
|
49 |
+
box_annotator = sv.BoxAnnotator()
|
50 |
+
labels = [
|
51 |
+
f"{CLASSES[class_id]} {confidence:0.2f}"
|
52 |
+
for _, _, confidence, class_id, _, _
|
53 |
+
in detections]
|
54 |
+
annotated_frame = box_annotator.annotate(scene=image.copy(), detections=detections, labels=labels)
|
55 |
+
|
56 |
+
# save the annotated grounding dino image
|
57 |
+
cv2.imwrite("EfficientSAM/LightHQSAM/groundingdino_annotated_image.jpg", annotated_frame)
|
58 |
+
|
59 |
+
|
60 |
+
# NMS post process
|
61 |
+
print(f"Before NMS: {len(detections.xyxy)} boxes")
|
62 |
+
nms_idx = torchvision.ops.nms(
|
63 |
+
torch.from_numpy(detections.xyxy),
|
64 |
+
torch.from_numpy(detections.confidence),
|
65 |
+
NMS_THRESHOLD
|
66 |
+
).numpy().tolist()
|
67 |
+
|
68 |
+
detections.xyxy = detections.xyxy[nms_idx]
|
69 |
+
detections.confidence = detections.confidence[nms_idx]
|
70 |
+
detections.class_id = detections.class_id[nms_idx]
|
71 |
+
|
72 |
+
print(f"After NMS: {len(detections.xyxy)} boxes")
|
73 |
+
|
74 |
+
# Prompting SAM with detected boxes
|
75 |
+
def segment(sam_predictor: SamPredictor, image: np.ndarray, xyxy: np.ndarray) -> np.ndarray:
|
76 |
+
sam_predictor.set_image(image)
|
77 |
+
result_masks = []
|
78 |
+
for box in xyxy:
|
79 |
+
masks, scores, logits = sam_predictor.predict(
|
80 |
+
box=box,
|
81 |
+
multimask_output=False,
|
82 |
+
hq_token_only=True,
|
83 |
+
)
|
84 |
+
index = np.argmax(scores)
|
85 |
+
result_masks.append(masks[index])
|
86 |
+
return np.array(result_masks)
|
87 |
+
|
88 |
+
|
89 |
+
# convert detections to masks
|
90 |
+
detections.mask = segment(
|
91 |
+
sam_predictor=sam_predictor,
|
92 |
+
image=cv2.cvtColor(image, cv2.COLOR_BGR2RGB),
|
93 |
+
xyxy=detections.xyxy
|
94 |
+
)
|
95 |
+
|
96 |
+
# annotate image with detections
|
97 |
+
box_annotator = sv.BoxAnnotator()
|
98 |
+
mask_annotator = sv.MaskAnnotator()
|
99 |
+
labels = [
|
100 |
+
f"{CLASSES[class_id]} {confidence:0.2f}"
|
101 |
+
for _, _, confidence, class_id, _, _
|
102 |
+
in detections]
|
103 |
+
annotated_image = mask_annotator.annotate(scene=image.copy(), detections=detections)
|
104 |
+
annotated_image = box_annotator.annotate(scene=annotated_image, detections=detections, labels=labels)
|
105 |
+
|
106 |
+
# save the annotated grounded-sam image
|
107 |
+
cv2.imwrite("EfficientSAM/grounded_edge_sam_annotated_image.jpg", annotated_image)
|
EfficientSAM/grounded_efficient_sam.py
ADDED
@@ -0,0 +1,118 @@
1 |
+
import cv2
|
2 |
+
import numpy as np
|
3 |
+
import supervision as sv
|
4 |
+
|
5 |
+
import torch
|
6 |
+
import torchvision
|
7 |
+
from torchvision.transforms import ToTensor
|
8 |
+
|
9 |
+
from groundingdino.util.inference import Model
|
10 |
+
|
11 |
+
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
12 |
+
|
13 |
+
# GroundingDINO config and checkpoint
|
14 |
+
GROUNDING_DINO_CONFIG_PATH = "GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py"
|
15 |
+
GROUNDING_DINO_CHECKPOINT_PATH = "./groundingdino_swint_ogc.pth"
|
16 |
+
|
17 |
+
# Building GroundingDINO inference model
|
18 |
+
grounding_dino_model = Model(model_config_path=GROUNDING_DINO_CONFIG_PATH, model_checkpoint_path=GROUNDING_DINO_CHECKPOINT_PATH)
|
19 |
+
|
20 |
+
# Loading the EfficientSAM TorchScript model
|
21 |
+
EFFICIENT_SAM_CHECKPOINT_PATH = "./EfficientSAM/efficientsam_s_gpu.jit"
|
22 |
+
efficientsam = torch.jit.load(EFFICIENT_SAM_CHECKPOINT_PATH)
|
23 |
+
|
24 |
+
|
25 |
+
# Predict classes and hyper-param for GroundingDINO
|
26 |
+
SOURCE_IMAGE_PATH = "./EfficientSAM/LightHQSAM/example_light_hqsam.png"
|
27 |
+
CLASSES = ["bench"]
|
28 |
+
BOX_THRESHOLD = 0.25
|
29 |
+
TEXT_THRESHOLD = 0.25
|
30 |
+
NMS_THRESHOLD = 0.8
|
31 |
+
|
32 |
+
|
33 |
+
# load image
|
34 |
+
image = cv2.imread(SOURCE_IMAGE_PATH)
|
35 |
+
|
36 |
+
# detect objects
|
37 |
+
detections = grounding_dino_model.predict_with_classes(
|
38 |
+
image=image,
|
39 |
+
classes=CLASSES,
|
40 |
+
box_threshold=BOX_THRESHOLD,
|
41 |
+
text_threshold=TEXT_THRESHOLD
|
42 |
+
)
|
43 |
+
|
44 |
+
# annotate image with detections
|
45 |
+
box_annotator = sv.BoxAnnotator()
|
46 |
+
labels = [
|
47 |
+
f"{CLASSES[class_id]} {confidence:0.2f}"
|
48 |
+
for _, _, confidence, class_id, _, _
|
49 |
+
in detections]
|
50 |
+
annotated_frame = box_annotator.annotate(scene=image.copy(), detections=detections, labels=labels)
|
51 |
+
|
52 |
+
# save the annotated grounding dino image
|
53 |
+
cv2.imwrite("EfficientSAM/LightHQSAM/groundingdino_annotated_image.jpg", annotated_frame)
|
54 |
+
|
55 |
+
|
56 |
+
# NMS post process
|
57 |
+
print(f"Before NMS: {len(detections.xyxy)} boxes")
|
58 |
+
nms_idx = torchvision.ops.nms(
|
59 |
+
torch.from_numpy(detections.xyxy),
|
60 |
+
torch.from_numpy(detections.confidence),
|
61 |
+
NMS_THRESHOLD
|
62 |
+
).numpy().tolist()
|
63 |
+
|
64 |
+
detections.xyxy = detections.xyxy[nms_idx]
|
65 |
+
detections.confidence = detections.confidence[nms_idx]
|
66 |
+
detections.class_id = detections.class_id[nms_idx]
|
67 |
+
|
68 |
+
print(f"After NMS: {len(detections.xyxy)} boxes")
|
69 |
+
|
70 |
+
|
71 |
+
def efficient_sam_box_prompt_segment(image, pts_sampled, model):
|
72 |
+
bbox = torch.reshape(torch.tensor(pts_sampled), [1, 1, 2, 2])
|
73 |
+
bbox_labels = torch.reshape(torch.tensor([2, 3]), [1, 1, 2])
|
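# Point labels 2 and 3 follow SAM's box-prompt convention: they mark the box's
# top-left and bottom-right corners when the box is passed as two "points".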
74 |
+
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
|
75 |
+
img_tensor = ToTensor()(image)
|
76 |
+
|
77 |
+
predicted_logits, predicted_iou = model(
|
78 |
+
img_tensor[None, ...].cuda(),
|
79 |
+
bbox.cuda(),
|
80 |
+
bbox_labels.cuda(),
|
81 |
+
)
|
82 |
+
predicted_logits = predicted_logits.cpu()
|
83 |
+
all_masks = torch.ge(torch.sigmoid(predicted_logits[0, 0, :, :, :]), 0.5).numpy()
|
84 |
+
predicted_iou = predicted_iou[0, 0, ...].cpu().detach().numpy()
|
85 |
+
|
86 |
+
max_predicted_iou = -1
|
87 |
+
selected_mask_using_predicted_iou = None
|
88 |
+
for m in range(all_masks.shape[0]):
|
89 |
+
curr_predicted_iou = predicted_iou[m]
|
90 |
+
if (
|
91 |
+
curr_predicted_iou > max_predicted_iou
|
92 |
+
or selected_mask_using_predicted_iou is None
|
93 |
+
):
|
94 |
+
max_predicted_iou = curr_predicted_iou
|
95 |
+
selected_mask_using_predicted_iou = all_masks[m]
|
96 |
+
return selected_mask_using_predicted_iou
|
97 |
+
|
98 |
+
|
99 |
+
# collect segment results from EfficientSAM
|
100 |
+
result_masks = []
|
101 |
+
for box in detections.xyxy:
|
102 |
+
mask = efficient_sam_box_prompt_segment(image, box, efficientsam)
|
103 |
+
result_masks.append(mask)
|
104 |
+
|
105 |
+
detections.mask = np.array(result_masks)
|
106 |
+
|
107 |
+
# annotate image with detections
|
108 |
+
box_annotator = sv.BoxAnnotator()
|
109 |
+
mask_annotator = sv.MaskAnnotator()
|
110 |
+
labels = [
|
111 |
+
f"{CLASSES[class_id]} {confidence:0.2f}"
|
112 |
+
for _, _, confidence, class_id, _, _
|
113 |
+
in detections]
|
114 |
+
annotated_image = mask_annotator.annotate(scene=image.copy(), detections=detections)
|
115 |
+
annotated_image = box_annotator.annotate(scene=annotated_image, detections=detections, labels=labels)
|
116 |
+
|
117 |
+
# save the annotated grounded-sam image
|
118 |
+
cv2.imwrite("EfficientSAM/gronded_efficient_sam_anontated_image.jpg", annotated_image)
|
EfficientSAM/grounded_fast_sam.py
ADDED
@@ -0,0 +1,141 @@
1 |
+
import argparse
|
2 |
+
import cv2
|
3 |
+
from ultralytics import YOLO
|
4 |
+
from FastSAM.tools import *
|
5 |
+
from groundingdino.util.inference import load_model, load_image, predict, annotate, Model
|
6 |
+
from torchvision.ops import box_convert
|
7 |
+
import ast
|
8 |
+
|
9 |
+
def parse_args():
|
10 |
+
parser = argparse.ArgumentParser()
|
11 |
+
parser.add_argument(
|
12 |
+
"--model_path", type=str, default="./FastSAM/FastSAM-x.pt", help="model"
|
13 |
+
)
|
14 |
+
parser.add_argument(
|
15 |
+
"--img_path", type=str, default="./images/dogs.jpg", help="path to image file"
|
16 |
+
)
|
17 |
+
parser.add_argument(
|
18 |
+
"--text", type=str, default="the black dog.", help="text prompt for GroundingDINO"
|
19 |
+
)
|
20 |
+
parser.add_argument("--imgsz", type=int, default=1024, help="image size")
|
21 |
+
parser.add_argument(
|
22 |
+
"--iou",
|
23 |
+
type=float,
|
24 |
+
default=0.9,
|
25 |
+
help="iou threshold for filtering the annotations",
|
26 |
+
)
|
27 |
+
parser.add_argument(
|
28 |
+
"--conf", type=float, default=0.4, help="object confidence threshold"
|
29 |
+
)
|
30 |
+
parser.add_argument(
|
31 |
+
"--output", type=str, default="./output/", help="image save path"
|
32 |
+
)
|
33 |
+
parser.add_argument(
|
34 |
+
"--randomcolor", type=bool, default=True, help="mask random color"
|
35 |
+
)
|
36 |
+
parser.add_argument(
|
37 |
+
"--point_prompt", type=str, default="[[0,0]]", help="[[x1,y1],[x2,y2]]"
|
38 |
+
)
|
39 |
+
parser.add_argument(
|
40 |
+
"--point_label",
|
41 |
+
type=str,
|
42 |
+
default="[0]",
|
43 |
+
help="[1,0] 0:background, 1:foreground",
|
44 |
+
)
|
45 |
+
parser.add_argument("--box_prompt", type=str, default="[0,0,0,0]", help="[x,y,w,h]")
|
46 |
+
parser.add_argument(
|
47 |
+
"--better_quality",
|
48 |
+
type=str,
|
49 |
+
default=False,
|
50 |
+
help="better quality using morphologyEx",
|
51 |
+
)
|
52 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
53 |
+
parser.add_argument(
|
54 |
+
"--device", type=str, default=device, help="cuda:[0,1,2,3,4] or cpu"
|
55 |
+
)
|
56 |
+
parser.add_argument(
|
57 |
+
"--retina",
|
58 |
+
type=bool,
|
59 |
+
default=True,
|
60 |
+
help="draw high-resolution segmentation masks",
|
61 |
+
)
|
62 |
+
parser.add_argument(
|
63 |
+
"--withContours", type=bool, default=False, help="draw the edges of the masks"
|
64 |
+
)
|
65 |
+
return parser.parse_args()
|
66 |
+
|
67 |
+
|
68 |
+
def main(args):
|
69 |
+
|
70 |
+
# Image Path
|
71 |
+
img_path = args.img_path
|
72 |
+
text = args.text
|
73 |
+
|
74 |
+
# path to save img
|
75 |
+
save_path = args.output
|
76 |
+
if not os.path.exists(save_path):
|
77 |
+
os.makedirs(save_path)
|
78 |
+
basename = os.path.basename(args.img_path).split(".")[0]
|
79 |
+
|
80 |
+
# Build Fast-SAM Model
|
81 |
+
# ckpt_path = "/comp_robot/rentianhe/code/Grounded-Segment-Anything/FastSAM/FastSAM-x.pt"
|
82 |
+
model = YOLO(args.model_path)
|
83 |
+
|
84 |
+
results = model(
|
85 |
+
args.img_path,
|
86 |
+
imgsz=args.imgsz,
|
87 |
+
device=args.device,
|
88 |
+
retina_masks=args.retina,
|
89 |
+
iou=args.iou,
|
90 |
+
conf=args.conf,
|
91 |
+
max_det=100,
|
92 |
+
)
|
93 |
+
|
94 |
+
|
95 |
+
# Build GroundingDINO Model
|
96 |
+
groundingdino_config = "GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py"
|
97 |
+
groundingdino_ckpt_path = "./groundingdino_swint_ogc.pth"
|
98 |
+
|
99 |
+
image_source, image = load_image(img_path)
|
100 |
+
model = load_model(groundingdino_config, groundingdino_ckpt_path)
|
101 |
+
|
102 |
+
boxes, logits, phrases = predict(
|
103 |
+
model=model,
|
104 |
+
image=image,
|
105 |
+
caption=text,
|
106 |
+
box_threshold=0.3,
|
107 |
+
text_threshold=0.25,
|
108 |
+
device=args.device,
|
109 |
+
)
|
110 |
+
|
111 |
+
|
112 |
+
# Grounded-Fast-SAM
|
113 |
+
|
114 |
+
ori_img = cv2.imread(img_path)
|
115 |
+
ori_h = ori_img.shape[0]
|
116 |
+
ori_w = ori_img.shape[1]
|
117 |
+
|
118 |
+
# Save each frame due to the post process from FastSAM
|
119 |
+
boxes = boxes * torch.Tensor([ori_w, ori_h, ori_w, ori_h])
|
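# GroundingDINO predicts boxes in normalized cxcywh format; the line above scales
# them to pixel coordinates, and they are converted to xyxy below for box_prompt.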
120 |
+
print(f"Detected Boxes: {len(boxes)}")
|
121 |
+
boxes = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy").cpu().numpy().tolist()
|
122 |
+
for box_idx in range(len(boxes)):
|
123 |
+
mask, _ = box_prompt(
|
124 |
+
results[0].masks.data,
|
125 |
+
boxes[box_idx],
|
126 |
+
ori_h,
|
127 |
+
ori_w,
|
128 |
+
)
|
129 |
+
annotations = np.array([mask])
|
130 |
+
img_array = fast_process(
|
131 |
+
annotations=annotations,
|
132 |
+
args=args,
|
133 |
+
mask_random_color=True,
|
134 |
+
bbox=boxes[box_idx],
|
135 |
+
)
|
136 |
+
cv2.imwrite(os.path.join(save_path, basename + f"_{str(box_idx)}_caption_{phrases[box_idx]}.jpg"), cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR))
|
137 |
+
|
138 |
+
|
139 |
+
if __name__ == "__main__":
|
140 |
+
args = parse_args()
|
141 |
+
main(args)
|
EfficientSAM/grounded_light_hqsam.py
ADDED
@@ -0,0 +1,109 @@
1 |
+
import cv2
|
2 |
+
import numpy as np
|
3 |
+
import supervision as sv
|
4 |
+
|
5 |
+
import torch
|
6 |
+
import torchvision
|
7 |
+
|
8 |
+
from groundingdino.util.inference import Model
|
9 |
+
from segment_anything import SamPredictor
|
10 |
+
from LightHQSAM.setup_light_hqsam import setup_model
|
11 |
+
|
12 |
+
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
13 |
+
|
14 |
+
# GroundingDINO config and checkpoint
|
15 |
+
GROUNDING_DINO_CONFIG_PATH = "GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py"
|
16 |
+
GROUNDING_DINO_CHECKPOINT_PATH = "./groundingdino_swint_ogc.pth"
|
17 |
+
|
18 |
+
# Building GroundingDINO inference model
|
19 |
+
grounding_dino_model = Model(model_config_path=GROUNDING_DINO_CONFIG_PATH, model_checkpoint_path=GROUNDING_DINO_CHECKPOINT_PATH)
|
20 |
+
|
21 |
+
# Building Light-HQSAM predictor
|
22 |
+
HQSAM_CHECKPOINT_PATH = "./EfficientSAM/sam_hq_vit_tiny.pth"
|
23 |
+
checkpoint = torch.load(HQSAM_CHECKPOINT_PATH)
|
24 |
+
light_hqsam = setup_model()
|
25 |
+
light_hqsam.load_state_dict(checkpoint, strict=True)
|
26 |
+
light_hqsam.to(device=DEVICE)
|
27 |
+
|
28 |
+
sam_predictor = SamPredictor(light_hqsam)
|
29 |
+
|
30 |
+
|
31 |
+
# Predict classes and hyper-param for GroundingDINO
|
32 |
+
SOURCE_IMAGE_PATH = "./EfficientSAM/LightHQSAM/example_light_hqsam.png"
|
33 |
+
CLASSES = ["bench"]
|
34 |
+
BOX_THRESHOLD = 0.25
|
35 |
+
TEXT_THRESHOLD = 0.25
|
36 |
+
NMS_THRESHOLD = 0.8
|
37 |
+
|
38 |
+
|
39 |
+
# load image
|
40 |
+
image = cv2.imread(SOURCE_IMAGE_PATH)
|
41 |
+
|
42 |
+
# detect objects
|
43 |
+
detections = grounding_dino_model.predict_with_classes(
|
44 |
+
image=image,
|
45 |
+
classes=CLASSES,
|
46 |
+
box_threshold=BOX_THRESHOLD,
|
47 |
+
text_threshold=TEXT_THRESHOLD
|
48 |
+
)
|
49 |
+
|
50 |
+
# annotate image with detections
|
51 |
+
box_annotator = sv.BoxAnnotator()
|
52 |
+
labels = [
|
53 |
+
f"{CLASSES[class_id]} {confidence:0.2f}"
|
54 |
+
for _, _, confidence, class_id, _, _
|
55 |
+
in detections]
|
56 |
+
annotated_frame = box_annotator.annotate(scene=image.copy(), detections=detections, labels=labels)
|
57 |
+
|
58 |
+
# save the annotated grounding dino image
|
59 |
+
cv2.imwrite("EfficientSAM/LightHQSAM/groundingdino_annotated_image.jpg", annotated_frame)
|
60 |
+
|
61 |
+
|
62 |
+
# NMS post process
|
63 |
+
print(f"Before NMS: {len(detections.xyxy)} boxes")
|
64 |
+
nms_idx = torchvision.ops.nms(
|
65 |
+
torch.from_numpy(detections.xyxy),
|
66 |
+
torch.from_numpy(detections.confidence),
|
67 |
+
NMS_THRESHOLD
|
68 |
+
).numpy().tolist()
|
69 |
+
|
70 |
+
detections.xyxy = detections.xyxy[nms_idx]
|
71 |
+
detections.confidence = detections.confidence[nms_idx]
|
72 |
+
detections.class_id = detections.class_id[nms_idx]
|
73 |
+
|
74 |
+
print(f"After NMS: {len(detections.xyxy)} boxes")
|
75 |
+
|
76 |
+
# Prompting SAM with detected boxes
|
77 |
+
def segment(sam_predictor: SamPredictor, image: np.ndarray, xyxy: np.ndarray) -> np.ndarray:
|
78 |
+
sam_predictor.set_image(image)
|
79 |
+
result_masks = []
|
80 |
+
for box in xyxy:
|
81 |
+
masks, scores, logits = sam_predictor.predict(
|
82 |
+
box=box,
|
83 |
+
multimask_output=False,
|
84 |
+
hq_token_only=True,
|
85 |
+
)
|
86 |
+
index = np.argmax(scores)
|
87 |
+
result_masks.append(masks[index])
|
88 |
+
return np.array(result_masks)
|
89 |
+
|
90 |
+
|
91 |
+
# convert detections to masks
|
92 |
+
detections.mask = segment(
|
93 |
+
sam_predictor=sam_predictor,
|
94 |
+
image=cv2.cvtColor(image, cv2.COLOR_BGR2RGB),
|
95 |
+
xyxy=detections.xyxy
|
96 |
+
)
|
97 |
+
|
98 |
+
# annotate image with detections
|
99 |
+
box_annotator = sv.BoxAnnotator()
|
100 |
+
mask_annotator = sv.MaskAnnotator()
|
101 |
+
labels = [
|
102 |
+
f"{CLASSES[class_id]} {confidence:0.2f}"
|
103 |
+
for _, _, confidence, class_id, _, _
|
104 |
+
in detections]
|
105 |
+
annotated_image = mask_annotator.annotate(scene=image.copy(), detections=detections)
|
106 |
+
annotated_image = box_annotator.annotate(scene=annotated_image, detections=detections, labels=labels)
|
107 |
+
|
108 |
+
# save the annotated grounded-sam image
|
109 |
+
cv2.imwrite("EfficientSAM/LightHQSAM/grounded_light_hqsam_annotated_image.jpg", annotated_image)
|
EfficientSAM/grounded_mobile_sam.py
ADDED
@@ -0,0 +1,145 @@
1 |
+
import cv2
|
2 |
+
import numpy as np
|
3 |
+
import supervision as sv
|
4 |
+
import argparse
|
5 |
+
import torch
|
6 |
+
import torchvision
|
7 |
+
|
8 |
+
from groundingdino.util.inference import Model
|
9 |
+
from segment_anything import SamPredictor
|
10 |
+
from MobileSAM.setup_mobile_sam import setup_model
|
11 |
+
|
12 |
+
def parse_args():
|
13 |
+
parser = argparse.ArgumentParser()
|
14 |
+
parser.add_argument(
|
15 |
+
"--MOBILE_SAM_CHECKPOINT_PATH", type=str, default="./EfficientSAM/mobile_sam.pt", help="model"
|
16 |
+
)
|
17 |
+
parser.add_argument(
|
18 |
+
"--SOURCE_IMAGE_PATH", type=str, default="./assets/demo2.jpg", help="path to image file"
|
19 |
+
)
|
20 |
+
parser.add_argument(
|
21 |
+
"--CAPTION", type=str, default="The running dog", help="text prompt for GroundingDINO"
|
22 |
+
)
|
23 |
+
parser.add_argument(
|
24 |
+
"--OUT_FILE_BOX", type=str, default="groundingdino_annotated_image.jpg", help="the output filename"
|
25 |
+
)
|
26 |
+
parser.add_argument(
|
27 |
+
"--OUT_FILE_SEG", type=str, default="grounded_mobile_sam_annotated_image.jpg", help="the output filename"
|
28 |
+
)
|
29 |
+
parser.add_argument(
|
30 |
+
"--OUT_FILE_BIN_MASK", type=str, default="grounded_mobile_sam_bin_mask.jpg", help="the output filename"
|
31 |
+
)
|
32 |
+
parser.add_argument("--BOX_THRESHOLD", type=float, default=0.25, help="")
|
33 |
+
parser.add_argument("--TEXT_THRESHOLD", type=float, default=0.25, help="")
|
34 |
+
parser.add_argument("--NMS_THRESHOLD", type=float, default=0.8, help="")
|
35 |
+
|
36 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
37 |
+
parser.add_argument(
|
38 |
+
"--DEVICE", type=str, default=device, help="cuda:[0,1,2,3,4] or cpu"
|
39 |
+
)
|
40 |
+
return parser.parse_args()
|
41 |
+
|
42 |
+
def main(args):
|
43 |
+
DEVICE = args.DEVICE
|
44 |
+
|
45 |
+
# GroundingDINO config and checkpoint
|
46 |
+
GROUNDING_DINO_CONFIG_PATH = "GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py"
|
47 |
+
GROUNDING_DINO_CHECKPOINT_PATH = "./groundingdino_swint_ogc.pth"
|
48 |
+
|
49 |
+
# Building GroundingDINO inference model
|
50 |
+
grounding_dino_model = Model(model_config_path=GROUNDING_DINO_CONFIG_PATH, model_checkpoint_path=GROUNDING_DINO_CHECKPOINT_PATH)
|
51 |
+
|
52 |
+
# Building MobileSAM predictor
|
53 |
+
MOBILE_SAM_CHECKPOINT_PATH = args.MOBILE_SAM_CHECKPOINT_PATH
|
54 |
+
checkpoint = torch.load(MOBILE_SAM_CHECKPOINT_PATH)
|
55 |
+
mobile_sam = setup_model()
|
56 |
+
mobile_sam.load_state_dict(checkpoint, strict=True)
|
57 |
+
mobile_sam.to(device=DEVICE)
|
58 |
+
|
59 |
+
sam_predictor = SamPredictor(mobile_sam)
|
60 |
+
|
61 |
+
|
62 |
+
# Predict classes and hyper-param for GroundingDINO
|
63 |
+
SOURCE_IMAGE_PATH = args.SOURCE_IMAGE_PATH
|
64 |
+
CLASSES = [args.CAPTION]
|
65 |
+
BOX_THRESHOLD = args.BOX_THRESHOLD
|
66 |
+
TEXT_THRESHOLD = args.TEXT_THRESHOLD
|
67 |
+
NMS_THRESHOLD = args.NMS_THRESHOLD
|
68 |
+
|
69 |
+
|
70 |
+
# load image
|
71 |
+
image = cv2.imread(SOURCE_IMAGE_PATH)
|
72 |
+
|
73 |
+
# detect objects
|
74 |
+
detections = grounding_dino_model.predict_with_classes(
|
75 |
+
image=image,
|
76 |
+
classes=CLASSES,
|
77 |
+
box_threshold=BOX_THRESHOLD,
|
78 |
+
text_threshold=TEXT_THRESHOLD
|
79 |
+
)
|
80 |
+
|
81 |
+
# annotate image with detections
|
82 |
+
box_annotator = sv.BoxAnnotator()
|
83 |
+
labels = [
|
84 |
+
f"{CLASSES[class_id]} {confidence:0.2f}"
|
85 |
+
for _, _, confidence, class_id, _, _
|
86 |
+
in detections]
|
87 |
+
annotated_frame = box_annotator.annotate(scene=image.copy(), detections=detections, labels=labels)
|
88 |
+
|
89 |
+
# save the annotated grounding dino image
|
90 |
+
cv2.imwrite(args.OUT_FILE_BOX, annotated_frame)
|
91 |
+
|
92 |
+
|
93 |
+
# NMS post process
|
94 |
+
print(f"Before NMS: {len(detections.xyxy)} boxes")
|
95 |
+
nms_idx = torchvision.ops.nms(
|
96 |
+
torch.from_numpy(detections.xyxy),
|
97 |
+
torch.from_numpy(detections.confidence),
|
98 |
+
NMS_THRESHOLD
|
99 |
+
).numpy().tolist()
|
100 |
+
|
101 |
+
detections.xyxy = detections.xyxy[nms_idx]
|
102 |
+
detections.confidence = detections.confidence[nms_idx]
|
103 |
+
detections.class_id = detections.class_id[nms_idx]
|
104 |
+
|
105 |
+
print(f"After NMS: {len(detections.xyxy)} boxes")
|
106 |
+
|
107 |
+
# Prompting SAM with detected boxes
|
108 |
+
def segment(sam_predictor: SamPredictor, image: np.ndarray, xyxy: np.ndarray) -> np.ndarray:
|
109 |
+
sam_predictor.set_image(image)
|
110 |
+
result_masks = []
|
111 |
+
for box in xyxy:
|
112 |
+
masks, scores, logits = sam_predictor.predict(
|
113 |
+
box=box,
|
114 |
+
multimask_output=True
|
115 |
+
)
|
116 |
+
index = np.argmax(scores)
|
117 |
+
result_masks.append(masks[index])
|
118 |
+
return np.array(result_masks)
|
119 |
+
|
120 |
+
|
121 |
+
# convert detections to masks
|
122 |
+
detections.mask = segment(
|
123 |
+
sam_predictor=sam_predictor,
|
124 |
+
image=cv2.cvtColor(image, cv2.COLOR_BGR2RGB),
|
125 |
+
xyxy=detections.xyxy
|
126 |
+
)
|
127 |
+
|
128 |
+
binary_mask = detections.mask[0].astype(np.uint8)*255
|
129 |
+
cv2.imwrite(args.OUT_FILE_BIN_MASK, binary_mask)
|
130 |
+
|
131 |
+
# annotate image with detections
|
132 |
+
box_annotator = sv.BoxAnnotator()
|
133 |
+
mask_annotator = sv.MaskAnnotator()
|
134 |
+
labels = [
|
135 |
+
f"{CLASSES[class_id]} {confidence:0.2f}"
|
136 |
+
for _, _, confidence, class_id, _, _
|
137 |
+
in detections]
|
138 |
+
annotated_image = mask_annotator.annotate(scene=image.copy(), detections=detections)
|
139 |
+
annotated_image = box_annotator.annotate(scene=annotated_image, detections=detections, labels=labels)
|
140 |
+
# save the annotated grounded-sam image
|
141 |
+
cv2.imwrite(args.OUT_FILE_SEG, annotated_image)
|
142 |
+
|
143 |
+
if __name__ == "__main__":
|
144 |
+
args = parse_args()
|
145 |
+
main(args)
|
EfficientSAM/grounded_repvit_sam.py
ADDED
@@ -0,0 +1,107 @@
1 |
+
import cv2
|
2 |
+
import numpy as np
|
3 |
+
import supervision as sv
|
4 |
+
|
5 |
+
import torch
|
6 |
+
import torchvision
|
7 |
+
|
8 |
+
from groundingdino.util.inference import Model
|
9 |
+
from segment_anything import SamPredictor
|
10 |
+
from RepViTSAM.setup_repvit_sam import build_sam_repvit
|
11 |
+
|
12 |
+
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
13 |
+
|
14 |
+
# GroundingDINO config and checkpoint
|
15 |
+
GROUNDING_DINO_CONFIG_PATH = "GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py"
|
16 |
+
GROUNDING_DINO_CHECKPOINT_PATH = "./groundingdino_swint_ogc.pth"
|
17 |
+
|
18 |
+
# Building GroundingDINO inference model
|
19 |
+
grounding_dino_model = Model(model_config_path=GROUNDING_DINO_CONFIG_PATH, model_checkpoint_path=GROUNDING_DINO_CHECKPOINT_PATH)
|
20 |
+
|
21 |
+
# Building RepViT-SAM predictor
|
22 |
+
RepViTSAM_CHECKPOINT_PATH = "./EfficientSAM/repvit_sam.pt"
|
23 |
+
repvit_sam = build_sam_repvit(checkpoint=RepViTSAM_CHECKPOINT_PATH)
|
24 |
+
repvit_sam.to(device=DEVICE)
|
25 |
+
|
26 |
+
sam_predictor = SamPredictor(repvit_sam)
|
27 |
+
|
28 |
+
|
29 |
+
# Predict classes and hyper-param for GroundingDINO
|
30 |
+
SOURCE_IMAGE_PATH = "./EfficientSAM/LightHQSAM/example_light_hqsam.png"
|
31 |
+
CLASSES = ["bench"]
|
32 |
+
BOX_THRESHOLD = 0.25
|
33 |
+
TEXT_THRESHOLD = 0.25
|
34 |
+
NMS_THRESHOLD = 0.8
|
35 |
+
|
36 |
+
|
37 |
+
# load image
|
38 |
+
image = cv2.imread(SOURCE_IMAGE_PATH)
|
39 |
+
|
40 |
+
# detect objects
|
41 |
+
detections = grounding_dino_model.predict_with_classes(
|
42 |
+
image=image,
|
43 |
+
classes=CLASSES,
|
44 |
+
box_threshold=BOX_THRESHOLD,
|
45 |
+
text_threshold=TEXT_THRESHOLD
|
46 |
+
)
|
47 |
+
|
48 |
+
# annotate image with detections
|
49 |
+
box_annotator = sv.BoxAnnotator()
|
50 |
+
labels = [
|
51 |
+
f"{CLASSES[class_id]} {confidence:0.2f}"
|
52 |
+
for _, _, confidence, class_id, _, _
|
53 |
+
in detections]
|
54 |
+
annotated_frame = box_annotator.annotate(scene=image.copy(), detections=detections, labels=labels)
|
55 |
+
|
56 |
+
# save the annotated grounding dino image
|
57 |
+
cv2.imwrite("EfficientSAM/LightHQSAM/groundingdino_annotated_image.jpg", annotated_frame)
|
58 |
+
|
59 |
+
|
60 |
+
# NMS post process
|
61 |
+
print(f"Before NMS: {len(detections.xyxy)} boxes")
|
62 |
+
nms_idx = torchvision.ops.nms(
|
63 |
+
torch.from_numpy(detections.xyxy),
|
64 |
+
torch.from_numpy(detections.confidence),
|
65 |
+
NMS_THRESHOLD
|
66 |
+
).numpy().tolist()
|
67 |
+
|
68 |
+
detections.xyxy = detections.xyxy[nms_idx]
|
69 |
+
detections.confidence = detections.confidence[nms_idx]
|
70 |
+
detections.class_id = detections.class_id[nms_idx]
|
71 |
+
|
72 |
+
print(f"After NMS: {len(detections.xyxy)} boxes")
|
73 |
+
|
74 |
+
# Prompting SAM with detected boxes
|
75 |
+
def segment(sam_predictor: SamPredictor, image: np.ndarray, xyxy: np.ndarray) -> np.ndarray:
|
76 |
+
sam_predictor.set_image(image)
|
77 |
+
result_masks = []
|
78 |
+
for box in xyxy:
|
79 |
+
masks, scores, logits = sam_predictor.predict(
|
80 |
+
box=box,
|
81 |
+
multimask_output=False,
|
82 |
+
hq_token_only=True,
|
83 |
+
)
|
84 |
+
index = np.argmax(scores)
|
85 |
+
result_masks.append(masks[index])
|
86 |
+
return np.array(result_masks)
|
87 |
+
|
88 |
+
|
89 |
+
# convert detections to masks
|
90 |
+
detections.mask = segment(
|
91 |
+
sam_predictor=sam_predictor,
|
92 |
+
image=cv2.cvtColor(image, cv2.COLOR_BGR2RGB),
|
93 |
+
xyxy=detections.xyxy
|
94 |
+
)
|
95 |
+
|
96 |
+
# annotate image with detections
|
97 |
+
box_annotator = sv.BoxAnnotator()
|
98 |
+
mask_annotator = sv.MaskAnnotator()
|
99 |
+
labels = [
|
100 |
+
f"{CLASSES[class_id]} {confidence:0.2f}"
|
101 |
+
for _, _, confidence, class_id, _, _
|
102 |
+
in detections]
|
103 |
+
annotated_image = mask_annotator.annotate(scene=image.copy(), detections=detections)
|
104 |
+
annotated_image = box_annotator.annotate(scene=annotated_image, detections=detections, labels=labels)
|
105 |
+
|
106 |
+
# save the annotated grounded-sam image
|
107 |
+
cv2.imwrite("EfficientSAM/grounded_repvit_sam_annotated_image.jpg", annotated_image)
|
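The class-agnostic NMS post-process used in the scripts above can be exercised on its own; a minimal sketch with dummy boxes (assuming only `torch` and `torchvision` are installed):

```python
import torch
import torchvision

boxes = torch.tensor([[10., 10., 100., 100.],
                      [12., 12., 102., 102.],     # IoU ~0.92 with the first box
                      [200., 200., 260., 260.]])
scores = torch.tensor([0.9, 0.8, 0.7])

keep = torchvision.ops.nms(boxes, scores, 0.8)    # same call pattern as above
print(keep.tolist())  # [0, 2]: the near-duplicate second box is suppressed
```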
GroundingDINO/.asset/COCO.png
ADDED
GroundingDINO/.asset/GD_GLIGEN.png
ADDED
GroundingDINO/.asset/GD_SD.png
ADDED
GroundingDINO/.asset/ODinW.png
ADDED
GroundingDINO/.asset/arch.png
ADDED
GroundingDINO/.asset/cats.png
ADDED
GroundingDINO/.asset/hero_figure.png
ADDED
GroundingDINO/LICENSE
ADDED
@@ -0,0 +1,201 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright 2020 - present, Facebook, Inc

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
GroundingDINO/README.md
ADDED
@@ -0,0 +1,163 @@
# Grounding DINO

---

[![arXiv](https://img.shields.io/badge/arXiv-2303.05499-b31b1b.svg)](https://arxiv.org/abs/2303.05499)
[![YouTube](https://badges.aleen42.com/src/youtube.svg)](https://youtu.be/wxWDt5UiwY8)
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/roboflow-ai/notebooks/blob/main/notebooks/zero-shot-object-detection-with-grounding-dino.ipynb)
[![YouTube](https://badges.aleen42.com/src/youtube.svg)](https://youtu.be/cMa77r3YrDk)
[![HuggingFace space](https://img.shields.io/badge/🤗-HuggingFace%20Space-cyan.svg)](https://huggingface.co/spaces/ShilongLiu/Grounding_DINO_demo)

[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/grounding-dino-marrying-dino-with-grounded/zero-shot-object-detection-on-mscoco)](https://paperswithcode.com/sota/zero-shot-object-detection-on-mscoco?p=grounding-dino-marrying-dino-with-grounded) \
[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/grounding-dino-marrying-dino-with-grounded/zero-shot-object-detection-on-odinw)](https://paperswithcode.com/sota/zero-shot-object-detection-on-odinw?p=grounding-dino-marrying-dino-with-grounded) \
[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/grounding-dino-marrying-dino-with-grounded/object-detection-on-coco-minival)](https://paperswithcode.com/sota/object-detection-on-coco-minival?p=grounding-dino-marrying-dino-with-grounded) \
[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/grounding-dino-marrying-dino-with-grounded/object-detection-on-coco)](https://paperswithcode.com/sota/object-detection-on-coco?p=grounding-dino-marrying-dino-with-grounded)


Official PyTorch implementation of [Grounding DINO](https://arxiv.org/abs/2303.05499), a stronger open-set object detector. Code is available now!


## Highlight

- **Open-Set Detection.** Detect **everything** with language!
- **High Performance.** COCO zero-shot **52.5 AP** (trained without any COCO data!), COCO fine-tune **63.0 AP**.
- **Flexible.** Collaborates with Stable Diffusion for image editing.

## News
[2023/03/28] A YouTube [video](https://youtu.be/cMa77r3YrDk) about Grounding DINO and basic object detection prompt engineering. [[SkalskiP](https://github.com/SkalskiP)] \
[2023/03/28] Added a [demo](https://huggingface.co/spaces/ShilongLiu/Grounding_DINO_demo) on Hugging Face Space! \
[2023/03/27] Support CPU-only mode. The model can now run on machines without GPUs. \
[2023/03/25] A [demo](https://colab.research.google.com/github/roboflow-ai/notebooks/blob/main/notebooks/zero-shot-object-detection-with-grounding-dino.ipynb) for Grounding DINO is available on Colab. [[SkalskiP](https://github.com/SkalskiP)] \
[2023/03/22] Code is available now!

<details open>
<summary><font size="4">
Description
</font></summary>
<img src=".asset/hero_figure.png" alt="ODinW" width="100%">
</details>


## TODO

- [x] Release inference code and demo.
- [x] Release checkpoints.
- [ ] Grounding DINO with Stable Diffusion and GLIGEN demos.
- [ ] Release training code.

## Install

If you have a CUDA environment, make sure the environment variable `CUDA_HOME` is set; the extensions are compiled in CPU-only mode if CUDA is not available.

```bash
pip install -e .
```

## Demo

```bash
CUDA_VISIBLE_DEVICES=6 python demo/inference_on_a_image.py \
  -c /path/to/config \
  -p /path/to/checkpoint \
  -i .asset/cats.png \
  -o "outputs/0" \
  -t "cat ear." \
  [--cpu-only] # add this flag for CPU-only mode
```
See `demo/inference_on_a_image.py` for more details.

**Web UI**

We also provide demo code that integrates Grounding DINO with a Gradio web UI. See `demo/gradio_app.py` for more details.

## Checkpoints

<!-- insert a table -->
<table>
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>name</th>
      <th>backbone</th>
      <th>Data</th>
      <th>box AP on COCO</th>
      <th>Checkpoint</th>
      <th>Config</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>1</th>
      <td>GroundingDINO-T</td>
      <td>Swin-T</td>
      <td>O365,GoldG,Cap4M</td>
      <td>48.4 (zero-shot) / 57.2 (fine-tune)</td>
      <td><a href="https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth">Github link</a> | <a href="https://huggingface.co/ShilongLiu/GroundingDINO/resolve/main/groundingdino_swint_ogc.pth">HF link</a></td>
      <td><a href="https://github.com/IDEA-Research/GroundingDINO/blob/main/groundingdino/config/GroundingDINO_SwinT_OGC.py">link</a></td>
    </tr>
  </tbody>
</table>

## Results

<details open>
<summary><font size="4">
COCO Object Detection Results
</font></summary>
<img src=".asset/COCO.png" alt="COCO" width="100%">
</details>

<details open>
<summary><font size="4">
ODinW Object Detection Results
</font></summary>
<img src=".asset/ODinW.png" alt="ODinW" width="100%">
</details>

<details open>
<summary><font size="4">
Marrying Grounding DINO with <a href="https://github.com/Stability-AI/StableDiffusion">Stable Diffusion</a> for Image Editing
</font></summary>
<img src=".asset/GD_SD.png" alt="GD_SD" width="100%">
</details>

<details open>
<summary><font size="4">
Marrying Grounding DINO with <a href="https://github.com/gligen/GLIGEN">GLIGEN</a> for more Detailed Image Editing
</font></summary>
<img src=".asset/GD_GLIGEN.png" alt="GD_GLIGEN" width="100%">
</details>

## Model

The framework consists of a text backbone, an image backbone, a feature enhancer, a language-guided query selection module, and a cross-modality decoder.

![arch](.asset/arch.png)


## Acknowledgement

Our model is related to [DINO](https://github.com/IDEA-Research/DINO) and [GLIP](https://github.com/microsoft/GLIP). Thanks for their great work!

We also thank great previous work including DETR, Deformable DETR, SMCA, Conditional DETR, Anchor DETR, Dynamic DETR, DAB-DETR, DN-DETR, etc. More related work is available at [Awesome Detection Transformer](https://github.com/IDEACVR/awesome-detection-transformer). A new toolbox, [detrex](https://github.com/IDEA-Research/detrex), is available as well.

Thanks to [Stable Diffusion](https://github.com/Stability-AI/StableDiffusion) and [GLIGEN](https://github.com/gligen/GLIGEN) for their awesome models.


## Citation

If you find our work helpful for your research, please consider citing the following BibTeX entry.

```bibtex
@inproceedings{ShilongLiu2023GroundingDM,
  title={Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection},
  author={Shilong Liu and Zhaoyang Zeng and Tianhe Ren and Feng Li and Hao Zhang and Jie Yang and Chunyuan Li and Jianwei Yang and Hang Su and Jun Zhu and Lei Zhang},
  year={2023}
}
```
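The same checkpoint can also be driven directly from Python with the `Model` wrapper that the EfficientSAM scripts in this upload use; a hedged sketch (paths, image, and prompt classes below are placeholders, adjust them to your checkout):

```python
import cv2
from groundingdino.util.inference import Model

model = Model(
    model_config_path="groundingdino/config/GroundingDINO_SwinT_OGC.py",
    model_checkpoint_path="./groundingdino_swint_ogc.pth",
)
image = cv2.imread(".asset/cats.png")          # any BGR image read by OpenCV
detections = model.predict_with_classes(
    image=image,
    classes=["cat", "ear"],
    box_threshold=0.35,
    text_threshold=0.25,
)
print(len(detections.xyxy), "boxes")
```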
GroundingDINO/demo/gradio_app.py
ADDED
@@ -0,0 +1,125 @@
import argparse
from functools import partial
import cv2
import requests
import os
from io import BytesIO
from PIL import Image
import numpy as np
from pathlib import Path


import warnings

import torch

# prepare the environment
os.system("python setup.py build develop --user")
os.system("pip install packaging==21.3")
os.system("pip install gradio")


warnings.filterwarnings("ignore")

import gradio as gr

from groundingdino.models import build_model
from groundingdino.util.slconfig import SLConfig
from groundingdino.util.utils import clean_state_dict
from groundingdino.util.inference import annotate, load_image, predict
import groundingdino.datasets.transforms as T

from huggingface_hub import hf_hub_download



# Use this config to evaluate the GLIP-T model
config_file = "groundingdino/config/GroundingDINO_SwinT_OGC.py"
ckpt_repo_id = "ShilongLiu/GroundingDINO"
ckpt_filenmae = "groundingdino_swint_ogc.pth"


def load_model_hf(model_config_path, repo_id, filename, device='cpu'):
    args = SLConfig.fromfile(model_config_path)
    model = build_model(args)
    args.device = device

    cache_file = hf_hub_download(repo_id=repo_id, filename=filename)
    checkpoint = torch.load(cache_file, map_location='cpu')
    log = model.load_state_dict(clean_state_dict(checkpoint['model']), strict=False)
    print("Model loaded from {} \n => {}".format(cache_file, log))
    _ = model.eval()
    return model

def image_transform_grounding(init_image):
    transform = T.Compose([
        T.RandomResize([800], max_size=1333),
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    image, _ = transform(init_image, None)  # 3, h, w
    return init_image, image

def image_transform_grounding_for_vis(init_image):
    transform = T.Compose([
        T.RandomResize([800], max_size=1333),
    ])
    image, _ = transform(init_image, None)  # 3, h, w
    return image

model = load_model_hf(config_file, ckpt_repo_id, ckpt_filenmae)

def run_grounding(input_image, grounding_caption, box_threshold, text_threshold):
    init_image = input_image.convert("RGB")
    original_size = init_image.size

    _, image_tensor = image_transform_grounding(init_image)
    image_pil: Image = image_transform_grounding_for_vis(init_image)

    # run grounding
    boxes, logits, phrases = predict(model, image_tensor, grounding_caption, box_threshold, text_threshold, device='cpu')
    annotated_frame = annotate(image_source=np.asarray(image_pil), boxes=boxes, logits=logits, phrases=phrases)
    image_with_box = Image.fromarray(cv2.cvtColor(annotated_frame, cv2.COLOR_BGR2RGB))


    return image_with_box

if __name__ == "__main__":

    parser = argparse.ArgumentParser("Grounding DINO demo", add_help=True)
    parser.add_argument("--debug", action="store_true", help="using debug mode")
    parser.add_argument("--share", action="store_true", help="share the app")
    args = parser.parse_args()

    block = gr.Blocks().queue()
    with block:
        gr.Markdown("# [Grounding DINO](https://github.com/IDEA-Research/GroundingDINO)")
        gr.Markdown("### Open-World Detection with Grounding DINO")

        with gr.Row():
            with gr.Column():
                input_image = gr.Image(source='upload', type="pil")
                grounding_caption = gr.Textbox(label="Detection Prompt")
                run_button = gr.Button(label="Run")
                with gr.Accordion("Advanced options", open=False):
                    box_threshold = gr.Slider(
                        label="Box Threshold", minimum=0.0, maximum=1.0, value=0.25, step=0.001
                    )
                    text_threshold = gr.Slider(
                        label="Text Threshold", minimum=0.0, maximum=1.0, value=0.25, step=0.001
                    )

            with gr.Column():
                gallery = gr.outputs.Image(
                    type="pil",
                    # label="grounding results"
                ).style(full_width=True, full_height=True)
                # gallery = gr.Gallery(label="Generated images", show_label=False).style(
                #     grid=[1], height="auto", container=True, full_width=True, full_height=True)

        run_button.click(fn=run_grounding, inputs=[
            input_image, grounding_caption, box_threshold, text_threshold], outputs=[gallery])


    block.launch(server_name='0.0.0.0', server_port=7579, debug=args.debug, share=args.share)
GroundingDINO/demo/inference_on_a_image.py
ADDED
@@ -0,0 +1,172 @@
import argparse
import os
import sys

import numpy as np
import torch
from PIL import Image, ImageDraw, ImageFont

import groundingdino.datasets.transforms as T
from groundingdino.models import build_model
from groundingdino.util import box_ops
from groundingdino.util.slconfig import SLConfig
from groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap


def plot_boxes_to_image(image_pil, tgt):
    H, W = tgt["size"]
    boxes = tgt["boxes"]
    labels = tgt["labels"]
    assert len(boxes) == len(labels), "boxes and labels must have same length"

    draw = ImageDraw.Draw(image_pil)
    mask = Image.new("L", image_pil.size, 0)
    mask_draw = ImageDraw.Draw(mask)

    # draw boxes and masks
    for box, label in zip(boxes, labels):
        # from 0..1 to 0..W, 0..H
        box = box * torch.Tensor([W, H, W, H])
        # from xywh to xyxy
        box[:2] -= box[2:] / 2
        box[2:] += box[:2]
        # random color
        color = tuple(np.random.randint(0, 255, size=3).tolist())
        # draw
        x0, y0, x1, y1 = box
        x0, y0, x1, y1 = int(x0), int(y0), int(x1), int(y1)

        draw.rectangle([x0, y0, x1, y1], outline=color, width=6)
        # draw.text((x0, y0), str(label), fill=color)

        font = ImageFont.load_default()
        if hasattr(font, "getbbox"):
            bbox = draw.textbbox((x0, y0), str(label), font)
        else:
            w, h = draw.textsize(str(label), font)
            bbox = (x0, y0, w + x0, y0 + h)
        # bbox = draw.textbbox((x0, y0), str(label))
        draw.rectangle(bbox, fill=color)
        draw.text((x0, y0), str(label), fill="white")

        mask_draw.rectangle([x0, y0, x1, y1], fill=255, width=6)

    return image_pil, mask


def load_image(image_path):
    # load image
    image_pil = Image.open(image_path).convert("RGB")  # load image

    transform = T.Compose(
        [
            T.RandomResize([800], max_size=1333),
            T.ToTensor(),
            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
    )
    image, _ = transform(image_pil, None)  # 3, h, w
    return image_pil, image


def load_model(model_config_path, model_checkpoint_path, cpu_only=False):
    args = SLConfig.fromfile(model_config_path)
    args.device = "cuda" if not cpu_only else "cpu"
    model = build_model(args)
    checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
    load_res = model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
    print(load_res)
    _ = model.eval()
    return model


def get_grounding_output(model, image, caption, box_threshold, text_threshold, with_logits=True, cpu_only=False):
    caption = caption.lower()
    caption = caption.strip()
    if not caption.endswith("."):
        caption = caption + "."
    device = "cuda" if not cpu_only else "cpu"
    model = model.to(device)
    image = image.to(device)
    with torch.no_grad():
        outputs = model(image[None], captions=[caption])
    logits = outputs["pred_logits"].cpu().sigmoid()[0]  # (nq, 256)
    boxes = outputs["pred_boxes"].cpu()[0]  # (nq, 4)
    logits.shape[0]

    # filter output
    logits_filt = logits.clone()
    boxes_filt = boxes.clone()
    filt_mask = logits_filt.max(dim=1)[0] > box_threshold
    logits_filt = logits_filt[filt_mask]  # num_filt, 256
    boxes_filt = boxes_filt[filt_mask]  # num_filt, 4
    logits_filt.shape[0]

    # get phrase
    tokenlizer = model.tokenizer
    tokenized = tokenlizer(caption)
    # build pred
    pred_phrases = []
    for logit, box in zip(logits_filt, boxes_filt):
        pred_phrase = get_phrases_from_posmap(logit > text_threshold, tokenized, tokenlizer)
        if with_logits:
            pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})")
        else:
            pred_phrases.append(pred_phrase)

    return boxes_filt, pred_phrases


if __name__ == "__main__":

    parser = argparse.ArgumentParser("Grounding DINO example", add_help=True)
    parser.add_argument("--config_file", "-c", type=str, required=True, help="path to config file")
    parser.add_argument(
        "--checkpoint_path", "-p", type=str, required=True, help="path to checkpoint file"
    )
    parser.add_argument("--image_path", "-i", type=str, required=True, help="path to image file")
    parser.add_argument("--text_prompt", "-t", type=str, required=True, help="text prompt")
    parser.add_argument(
        "--output_dir", "-o", type=str, default="outputs", required=True, help="output directory"
    )

    parser.add_argument("--box_threshold", type=float, default=0.3, help="box threshold")
    parser.add_argument("--text_threshold", type=float, default=0.25, help="text threshold")

    parser.add_argument("--cpu-only", action="store_true", help="running on cpu only!, default=False")
    args = parser.parse_args()

    # cfg
    config_file = args.config_file  # change the path of the model config file
    checkpoint_path = args.checkpoint_path  # change the path of the model
    image_path = args.image_path
    text_prompt = args.text_prompt
    output_dir = args.output_dir
    box_threshold = args.box_threshold
    text_threshold = args.text_threshold

    # make dir
    os.makedirs(output_dir, exist_ok=True)
    # load image
    image_pil, image = load_image(image_path)
    # load model
    model = load_model(config_file, checkpoint_path, cpu_only=args.cpu_only)

    # visualize raw image
    image_pil.save(os.path.join(output_dir, "raw_image.jpg"))

    # run model
    boxes_filt, pred_phrases = get_grounding_output(
        model, image, text_prompt, box_threshold, text_threshold, cpu_only=args.cpu_only
    )

    # visualize pred
    size = image_pil.size
    pred_dict = {
        "boxes": boxes_filt,
        "size": [size[1], size[0]],  # H,W
        "labels": pred_phrases,
    }
    # import ipdb; ipdb.set_trace()
    image_with_box = plot_boxes_to_image(image_pil, pred_dict)[0]
    image_with_box.save(os.path.join(output_dir, "pred.jpg"))
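The box handling inside `plot_boxes_to_image` converts Grounding DINO's normalized `(cx, cy, w, h)` output into pixel `(x0, y0, x1, y1)` corners; a worked example with made-up numbers:

```python
import torch

W, H = 640, 480
box = torch.tensor([0.5, 0.5, 0.25, 0.5])   # centered, 25% wide, 50% tall
box = box * torch.tensor([W, H, W, H])      # [320., 240., 160., 240.] in pixels
box[:2] -= box[2:] / 2                      # top-left corner -> [240., 120.]
box[2:] += box[:2]                          # bottom-right corner -> [400., 360.]
print(box.tolist())                         # [240.0, 120.0, 400.0, 360.0]
```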
GroundingDINO/groundingdino/__init__.py
ADDED
File without changes
GroundingDINO/groundingdino/config/GroundingDINO_SwinB.py
ADDED
@@ -0,0 +1,43 @@
batch_size = 1
modelname = "groundingdino"
backbone = "swin_B_384_22k"
position_embedding = "sine"
pe_temperatureH = 20
pe_temperatureW = 20
return_interm_indices = [1, 2, 3]
backbone_freeze_keywords = None
enc_layers = 6
dec_layers = 6
pre_norm = False
dim_feedforward = 2048
hidden_dim = 256
dropout = 0.0
nheads = 8
num_queries = 900
query_dim = 4
num_patterns = 0
num_feature_levels = 4
enc_n_points = 4
dec_n_points = 4
two_stage_type = "standard"
two_stage_bbox_embed_share = False
two_stage_class_embed_share = False
transformer_activation = "relu"
dec_pred_bbox_embed_share = True
dn_box_noise_scale = 1.0
dn_label_noise_ratio = 0.5
dn_label_coef = 1.0
dn_bbox_coef = 1.0
embed_init_tgt = True
dn_labelbook_size = 2000
max_text_len = 256
text_encoder_type = "bert-base-uncased"
use_text_enhancer = True
use_fusion_layer = True
use_checkpoint = True
use_transformer_ckpt = True
use_text_cross_attention = True
text_dropout = 0.0
fusion_dropout = 0.0
fusion_droppath = 0.1
sub_sentence_present = True
GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py
ADDED
@@ -0,0 +1,43 @@
batch_size = 1
modelname = "groundingdino"
backbone = "swin_T_224_1k"
position_embedding = "sine"
pe_temperatureH = 20
pe_temperatureW = 20
return_interm_indices = [1, 2, 3]
backbone_freeze_keywords = None
enc_layers = 6
dec_layers = 6
pre_norm = False
dim_feedforward = 2048
hidden_dim = 256
dropout = 0.0
nheads = 8
num_queries = 900
query_dim = 4
num_patterns = 0
num_feature_levels = 4
enc_n_points = 4
dec_n_points = 4
two_stage_type = "standard"
two_stage_bbox_embed_share = False
two_stage_class_embed_share = False
transformer_activation = "relu"
dec_pred_bbox_embed_share = True
dn_box_noise_scale = 1.0
dn_label_noise_ratio = 0.5
dn_label_coef = 1.0
dn_bbox_coef = 1.0
embed_init_tgt = True
dn_labelbook_size = 2000
max_text_len = 256
text_encoder_type = "bert-base-uncased"
use_text_enhancer = True
use_fusion_layer = True
use_checkpoint = True
use_transformer_ckpt = True
use_text_cross_attention = True
text_dropout = 0.0
fusion_dropout = 0.0
fusion_droppath = 0.1
sub_sentence_present = True
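These flat module-level names are read back through `SLConfig`, the same way `load_model` in `demo/inference_on_a_image.py` consumes them; a short sketch (the checkpoint path is a placeholder):

```python
import torch
from groundingdino.models import build_model
from groundingdino.util.slconfig import SLConfig
from groundingdino.util.utils import clean_state_dict

cfg = SLConfig.fromfile("groundingdino/config/GroundingDINO_SwinT_OGC.py")
cfg.device = "cpu"                      # set before building, as in the demo script
model = build_model(cfg)

ckpt = torch.load("groundingdino_swint_ogc.pth", map_location="cpu")
model.load_state_dict(clean_state_dict(ckpt["model"]), strict=False)
model.eval()
```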
GroundingDINO/groundingdino/datasets/__init__.py
ADDED
File without changes
GroundingDINO/groundingdino/datasets/transforms.py
ADDED
@@ -0,0 +1,311 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
Transforms and data augmentation for both image + bbox.
"""
import os
import random

import PIL
import torch
import torchvision.transforms as T
import torchvision.transforms.functional as F

from groundingdino.util.box_ops import box_xyxy_to_cxcywh
from groundingdino.util.misc import interpolate


def crop(image, target, region):
    cropped_image = F.crop(image, *region)

    target = target.copy()
    i, j, h, w = region

    # should we do something wrt the original size?
    target["size"] = torch.tensor([h, w])

    fields = ["labels", "area", "iscrowd", "positive_map"]

    if "boxes" in target:
        boxes = target["boxes"]
        max_size = torch.as_tensor([w, h], dtype=torch.float32)
        cropped_boxes = boxes - torch.as_tensor([j, i, j, i])
        cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size)
        cropped_boxes = cropped_boxes.clamp(min=0)
        area = (cropped_boxes[:, 1, :] - cropped_boxes[:, 0, :]).prod(dim=1)
        target["boxes"] = cropped_boxes.reshape(-1, 4)
        target["area"] = area
        fields.append("boxes")

    if "masks" in target:
        # FIXME should we update the area here if there are no boxes?
        target["masks"] = target["masks"][:, i : i + h, j : j + w]
        fields.append("masks")

    # remove elements for which the boxes or masks that have zero area
    if "boxes" in target or "masks" in target:
        # favor boxes selection when defining which elements to keep
        # this is compatible with previous implementation
        if "boxes" in target:
            cropped_boxes = target["boxes"].reshape(-1, 2, 2)
            keep = torch.all(cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1)
        else:
            keep = target["masks"].flatten(1).any(1)

        for field in fields:
            if field in target:
                target[field] = target[field][keep]

    if os.environ.get("IPDB_SHILONG_DEBUG", None) == "INFO":
        # for debug and visualization only.
        if "strings_positive" in target:
            target["strings_positive"] = [
                _i for _i, _j in zip(target["strings_positive"], keep) if _j
            ]

    return cropped_image, target


def hflip(image, target):
    flipped_image = F.hflip(image)

    w, h = image.size

    target = target.copy()
    if "boxes" in target:
        boxes = target["boxes"]
        boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor(
            [w, 0, w, 0]
        )
        target["boxes"] = boxes

    if "masks" in target:
        target["masks"] = target["masks"].flip(-1)

    return flipped_image, target


def resize(image, target, size, max_size=None):
    # size can be min_size (scalar) or (w, h) tuple

    def get_size_with_aspect_ratio(image_size, size, max_size=None):
        w, h = image_size
        if max_size is not None:
            min_original_size = float(min((w, h)))
            max_original_size = float(max((w, h)))
            if max_original_size / min_original_size * size > max_size:
                size = int(round(max_size * min_original_size / max_original_size))

        if (w <= h and w == size) or (h <= w and h == size):
            return (h, w)

        if w < h:
            ow = size
            oh = int(size * h / w)
        else:
            oh = size
            ow = int(size * w / h)

        return (oh, ow)

    def get_size(image_size, size, max_size=None):
        if isinstance(size, (list, tuple)):
            return size[::-1]
        else:
            return get_size_with_aspect_ratio(image_size, size, max_size)

    size = get_size(image.size, size, max_size)
    rescaled_image = F.resize(image, size)

    if target is None:
        return rescaled_image, None

    ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size))
    ratio_width, ratio_height = ratios

    target = target.copy()
    if "boxes" in target:
        boxes = target["boxes"]
        scaled_boxes = boxes * torch.as_tensor(
            [ratio_width, ratio_height, ratio_width, ratio_height]
        )
        target["boxes"] = scaled_boxes

    if "area" in target:
        area = target["area"]
        scaled_area = area * (ratio_width * ratio_height)
        target["area"] = scaled_area

    h, w = size
    target["size"] = torch.tensor([h, w])

    if "masks" in target:
        target["masks"] = (
            interpolate(target["masks"][:, None].float(), size, mode="nearest")[:, 0] > 0.5
        )

    return rescaled_image, target


def pad(image, target, padding):
    # assumes that we only pad on the bottom right corners
    padded_image = F.pad(image, (0, 0, padding[0], padding[1]))
    if target is None:
        return padded_image, None
    target = target.copy()
    # should we do something wrt the original size?
    target["size"] = torch.tensor(padded_image.size[::-1])
    if "masks" in target:
        target["masks"] = torch.nn.functional.pad(target["masks"], (0, padding[0], 0, padding[1]))
    return padded_image, target


class ResizeDebug(object):
    def __init__(self, size):
        self.size = size

    def __call__(self, img, target):
        return resize(img, target, self.size)


class RandomCrop(object):
    def __init__(self, size):
        self.size = size

    def __call__(self, img, target):
        region = T.RandomCrop.get_params(img, self.size)
        return crop(img, target, region)


class RandomSizeCrop(object):
    def __init__(self, min_size: int, max_size: int, respect_boxes: bool = False):
        # respect_boxes: True to keep all boxes
        #                False to tolerence box filter
        self.min_size = min_size
        self.max_size = max_size
        self.respect_boxes = respect_boxes

    def __call__(self, img: PIL.Image.Image, target: dict):
        init_boxes = len(target["boxes"])
        max_patience = 10
        for i in range(max_patience):
            w = random.randint(self.min_size, min(img.width, self.max_size))
            h = random.randint(self.min_size, min(img.height, self.max_size))
            region = T.RandomCrop.get_params(img, [h, w])
            result_img, result_target = crop(img, target, region)
            if (
                not self.respect_boxes
                or len(result_target["boxes"]) == init_boxes
                or i == max_patience - 1
            ):
                return result_img, result_target
        return result_img, result_target


class CenterCrop(object):
    def __init__(self, size):
        self.size = size

    def __call__(self, img, target):
        image_width, image_height = img.size
        crop_height, crop_width = self.size
        crop_top = int(round((image_height - crop_height) / 2.0))
        crop_left = int(round((image_width - crop_width) / 2.0))
        return crop(img, target, (crop_top, crop_left, crop_height, crop_width))


class RandomHorizontalFlip(object):
    def __init__(self, p=0.5):
        self.p = p

    def __call__(self, img, target):
        if random.random() < self.p:
            return hflip(img, target)
        return img, target


class RandomResize(object):
    def __init__(self, sizes, max_size=None):
        assert isinstance(sizes, (list, tuple))
        self.sizes = sizes
        self.max_size = max_size

    def __call__(self, img, target=None):
        size = random.choice(self.sizes)
        return resize(img, target, size, self.max_size)


class RandomPad(object):
    def __init__(self, max_pad):
        self.max_pad = max_pad

    def __call__(self, img, target):
        pad_x = random.randint(0, self.max_pad)
        pad_y = random.randint(0, self.max_pad)
        return pad(img, target, (pad_x, pad_y))


class RandomSelect(object):
    """
    Randomly selects between transforms1 and transforms2,
    with probability p for transforms1 and (1 - p) for transforms2
    """

    def __init__(self, transforms1, transforms2, p=0.5):
        self.transforms1 = transforms1
        self.transforms2 = transforms2
        self.p = p

    def __call__(self, img, target):
        if random.random() < self.p:
            return self.transforms1(img, target)
        return self.transforms2(img, target)


class ToTensor(object):
    def __call__(self, img, target):
        return F.to_tensor(img), target


class RandomErasing(object):
    def __init__(self, *args, **kwargs):
        self.eraser = T.RandomErasing(*args, **kwargs)

    def __call__(self, img, target):
        return self.eraser(img), target


class Normalize(object):
    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def __call__(self, image, target=None):
        image = F.normalize(image, mean=self.mean, std=self.std)
        if target is None:
            return image, None
        target = target.copy()
        h, w = image.shape[-2:]
        if "boxes" in target:
            boxes = target["boxes"]
            boxes = box_xyxy_to_cxcywh(boxes)
            boxes = boxes / torch.tensor([w, h, w, h], dtype=torch.float32)
            target["boxes"] = boxes
        return image, target


class Compose(object):
    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, image, target):
        for t in self.transforms:
            image, target = t(image, target)
        return image, target

    def __repr__(self):
        format_string = self.__class__.__name__ + "("
        for t in self.transforms:
            format_string += "\n"
            format_string += "    {0}".format(t)
        format_string += "\n)"
        return format_string
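Applied end to end, the pipeline above is the preprocessing recipe that `load_image` in `demo/inference_on_a_image.py` relies on; a short sketch (the image path is a placeholder):

```python
from PIL import Image
import groundingdino.datasets.transforms as T

transform = T.Compose([
    T.RandomResize([800], max_size=1333),
    T.ToTensor(),
    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])

image_pil = Image.open(".asset/cats.png").convert("RGB")
image_tensor, _ = transform(image_pil, None)   # target is optional here
print(image_tensor.shape)                      # torch.Size([3, H, W])
```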
GroundingDINO/groundingdino/models/GroundingDINO/__init__.py
ADDED
@@ -0,0 +1,15 @@
# ------------------------------------------------------------------------
# Grounding DINO
# url: https://github.com/IDEA-Research/GroundingDINO
# Copyright (c) 2023 IDEA. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------
# Conditional DETR
# Copyright (c) 2021 Microsoft. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------
# Copied from DETR (https://github.com/facebookresearch/detr)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
# ------------------------------------------------------------------------

from .groundingdino import build_groundingdino
GroundingDINO/groundingdino/models/GroundingDINO/backbone/__init__.py
ADDED
@@ -0,0 +1 @@
from .backbone import build_backbone
GroundingDINO/groundingdino/models/GroundingDINO/backbone/backbone.py
ADDED
@@ -0,0 +1,221 @@
# ------------------------------------------------------------------------
# Grounding DINO
# url: https://github.com/IDEA-Research/GroundingDINO
# Copyright (c) 2023 IDEA. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------
# Conditional DETR
# Copyright (c) 2021 Microsoft. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------
# Copied from DETR (https://github.com/facebookresearch/detr)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
# ------------------------------------------------------------------------

"""
Backbone modules.
"""

from typing import Dict, List

import torch
import torch.nn.functional as F
import torchvision
from torch import nn
from torchvision.models._utils import IntermediateLayerGetter

from groundingdino.util.misc import NestedTensor, clean_state_dict, is_main_process

from .position_encoding import build_position_encoding
from .swin_transformer import build_swin_transformer


class FrozenBatchNorm2d(torch.nn.Module):
    """
    BatchNorm2d where the batch statistics and the affine parameters are fixed.

    Copy-paste from torchvision.misc.ops with added eps before rqsrt,
    without which any other models than torchvision.models.resnet[18,34,50,101]
    produce nans.
    """

    def __init__(self, n):
        super(FrozenBatchNorm2d, self).__init__()
        self.register_buffer("weight", torch.ones(n))
        self.register_buffer("bias", torch.zeros(n))
        self.register_buffer("running_mean", torch.zeros(n))
        self.register_buffer("running_var", torch.ones(n))

    def _load_from_state_dict(
        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
    ):
        num_batches_tracked_key = prefix + "num_batches_tracked"
        if num_batches_tracked_key in state_dict:
            del state_dict[num_batches_tracked_key]

        super(FrozenBatchNorm2d, self)._load_from_state_dict(
            state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
        )

    def forward(self, x):
        # move reshapes to the beginning
        # to make it fuser-friendly
        w = self.weight.reshape(1, -1, 1, 1)
        b = self.bias.reshape(1, -1, 1, 1)
        rv = self.running_var.reshape(1, -1, 1, 1)
        rm = self.running_mean.reshape(1, -1, 1, 1)
        eps = 1e-5
        scale = w * (rv + eps).rsqrt()
        bias = b - rm * scale
        return x * scale + bias


class BackboneBase(nn.Module):
    def __init__(
        self,
        backbone: nn.Module,
        train_backbone: bool,
        num_channels: int,
        return_interm_indices: list,
    ):
        super().__init__()
        for name, parameter in backbone.named_parameters():
            if (
                not train_backbone
                or "layer2" not in name
                and "layer3" not in name
                and "layer4" not in name
            ):
                parameter.requires_grad_(False)

        return_layers = {}
        for idx, layer_index in enumerate(return_interm_indices):
            return_layers.update(
                {"layer{}".format(5 - len(return_interm_indices) + idx): "{}".format(layer_index)}
            )

        # if len:
        #     if use_stage1_feature:
        #         return_layers = {"layer1": "0", "layer2": "1", "layer3": "2", "layer4": "3"}
        #     else:
        #         return_layers = {"layer2": "0", "layer3": "1", "layer4": "2"}
        # else:
        #     return_layers = {'layer4': "0"}
        self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
        self.num_channels = num_channels

    def forward(self, tensor_list: NestedTensor):
        xs = self.body(tensor_list.tensors)
        out: Dict[str, NestedTensor] = {}
        for name, x in xs.items():
            m = tensor_list.mask
            assert m is not None
            mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0]
            out[name] = NestedTensor(x, mask)
        # import ipdb; ipdb.set_trace()
        return out


class Backbone(BackboneBase):
    """ResNet backbone with frozen BatchNorm."""

    def __init__(
        self,
        name: str,
        train_backbone: bool,
        dilation: bool,
        return_interm_indices: list,
        batch_norm=FrozenBatchNorm2d,
    ):
        if name in ["resnet18", "resnet34", "resnet50", "resnet101"]:
            backbone = getattr(torchvision.models, name)(
                replace_stride_with_dilation=[False, False, dilation],
                pretrained=is_main_process(),
                norm_layer=batch_norm,
            )
        else:
            raise NotImplementedError("Why you can get here with name {}".format(name))
        # num_channels = 512 if name in ('resnet18', 'resnet34') else 2048
        assert name not in ("resnet18", "resnet34"), "Only resnet50 and resnet101 are available."
        assert return_interm_indices in [[0, 1, 2, 3], [1, 2, 3], [3]]
        num_channels_all = [256, 512, 1024, 2048]
        num_channels = num_channels_all[4 - len(return_interm_indices) :]
        super().__init__(backbone, train_backbone, num_channels, return_interm_indices)


class Joiner(nn.Sequential):
    def __init__(self, backbone, position_embedding):
        super().__init__(backbone, position_embedding)

    def forward(self, tensor_list: NestedTensor):
        xs = self[0](tensor_list)
        out: List[NestedTensor] = []
        pos = []
        for name, x in xs.items():
            out.append(x)
            # position encoding
            pos.append(self[1](x).to(x.tensors.dtype))

        return out, pos


def build_backbone(args):
    """
    Useful args:
        - backbone: backbone name
        - lr_backbone:
        - dilation
        - return_interm_indices: available: [0,1,2,3], [1,2,3], [3]
        - backbone_freeze_keywords:
        - use_checkpoint: for swin only for now

    """
    position_embedding = build_position_encoding(args)
    train_backbone = True
    if not train_backbone:
        raise ValueError("Please set lr_backbone > 0")
    return_interm_indices = args.return_interm_indices
    assert return_interm_indices in [[0, 1, 2, 3], [1, 2, 3], [3]]
    args.backbone_freeze_keywords
    use_checkpoint = getattr(args, "use_checkpoint", False)

    if args.backbone in ["resnet50", "resnet101"]:
        backbone = Backbone(
            args.backbone,
            train_backbone,
            args.dilation,
            return_interm_indices,
            batch_norm=FrozenBatchNorm2d,
        )
        bb_num_channels = backbone.num_channels
    elif args.backbone in [
        "swin_T_224_1k",
        "swin_B_224_22k",
        "swin_B_384_22k",
        "swin_L_224_22k",
        "swin_L_384_22k",
    ]:
        pretrain_img_size = int(args.backbone.split("_")[-2])
        backbone = build_swin_transformer(
            args.backbone,
            pretrain_img_size=pretrain_img_size,
            out_indices=tuple(return_interm_indices),
            dilation=False,
            use_checkpoint=use_checkpoint,
        )

        bb_num_channels = backbone.num_features[4 - len(return_interm_indices) :]
    else:
        raise NotImplementedError("Unknown backbone {}".format(args.backbone))

    assert len(bb_num_channels) == len(
        return_interm_indices
    ), f"len(bb_num_channels) {len(bb_num_channels)} != len(return_interm_indices) {len(return_interm_indices)}"

    model = Joiner(backbone, position_embedding)
    model.num_channels = bb_num_channels
    assert isinstance(
        bb_num_channels, List
    ), "bb_num_channels is expected to be a List but {}".format(type(bb_num_channels))
    # import ipdb; ipdb.set_trace()
    return model
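For orientation, here is a minimal sketch of calling build_backbone() directly. The args namespace and its field values are assumptions loosely mirroring the SwinT config shipped with this repo; treat them as illustrative rather than canonical.

from types import SimpleNamespace

import torch

from groundingdino.models.GroundingDINO.backbone.backbone import build_backbone
from groundingdino.util.misc import NestedTensor

# Assumed field values; adjust to match your actual config file.
args = SimpleNamespace(
    backbone="swin_T_224_1k",
    position_embedding="sine",
    hidden_dim=256,
    pe_temperatureH=20,
    pe_temperatureW=20,
    return_interm_indices=[1, 2, 3],
    backbone_freeze_keywords=None,
    dilation=False,
    use_checkpoint=False,
)

backbone = build_backbone(args)
samples = NestedTensor(
    torch.rand(1, 3, 224, 224),
    torch.zeros(1, 224, 224, dtype=torch.bool),  # mask: True marks padded pixels
)
features, positions = backbone(samples)
for feat, pos in zip(features, positions):
    print(feat.tensors.shape, pos.shape)  # one multi-scale feature map per returned index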
GroundingDINO/groundingdino/models/GroundingDINO/backbone/position_encoding.py
ADDED
@@ -0,0 +1,186 @@
# ------------------------------------------------------------------------
# Grounding DINO
# url: https://github.com/IDEA-Research/GroundingDINO
# Copyright (c) 2023 IDEA. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------
# DINO
# Copyright (c) 2022 IDEA. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------
# Conditional DETR
# Copyright (c) 2021 Microsoft. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------
# Copied from DETR (https://github.com/facebookresearch/detr)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
# ------------------------------------------------------------------------

"""
Various positional encodings for the transformer.
"""
import math

import torch
from torch import nn

from groundingdino.util.misc import NestedTensor


class PositionEmbeddingSine(nn.Module):
    """
    This is a more standard version of the position embedding, very similar to the one
    used by the Attention is all you need paper, generalized to work on images.
    """

    def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
        super().__init__()
        self.num_pos_feats = num_pos_feats
        self.temperature = temperature
        self.normalize = normalize
        if scale is not None and normalize is False:
            raise ValueError("normalize should be True if scale is passed")
        if scale is None:
            scale = 2 * math.pi
        self.scale = scale

    def forward(self, tensor_list: NestedTensor):
        x = tensor_list.tensors
        mask = tensor_list.mask
        assert mask is not None
        not_mask = ~mask
        y_embed = not_mask.cumsum(1, dtype=torch.float32)
        x_embed = not_mask.cumsum(2, dtype=torch.float32)
        if self.normalize:
            eps = 1e-6
            # if os.environ.get("SHILONG_AMP", None) == '1':
            #     eps = 1e-4
            # else:
            #     eps = 1e-6
            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale

        dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
        dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)

        pos_x = x_embed[:, :, :, None] / dim_t
        pos_y = y_embed[:, :, :, None] / dim_t
        pos_x = torch.stack(
            (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4
        ).flatten(3)
        pos_y = torch.stack(
            (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4
        ).flatten(3)
        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
        return pos


class PositionEmbeddingSineHW(nn.Module):
    """
    This is a more standard version of the position embedding, very similar to the one
    used by the Attention is all you need paper, generalized to work on images.
    """

    def __init__(
        self, num_pos_feats=64, temperatureH=10000, temperatureW=10000, normalize=False, scale=None
    ):
        super().__init__()
        self.num_pos_feats = num_pos_feats
        self.temperatureH = temperatureH
        self.temperatureW = temperatureW
        self.normalize = normalize
        if scale is not None and normalize is False:
            raise ValueError("normalize should be True if scale is passed")
        if scale is None:
            scale = 2 * math.pi
        self.scale = scale

    def forward(self, tensor_list: NestedTensor):
        x = tensor_list.tensors
        mask = tensor_list.mask
        assert mask is not None
        not_mask = ~mask
        y_embed = not_mask.cumsum(1, dtype=torch.float32)
        x_embed = not_mask.cumsum(2, dtype=torch.float32)

        # import ipdb; ipdb.set_trace()

        if self.normalize:
            eps = 1e-6
            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale

        dim_tx = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
        dim_tx = self.temperatureW ** (2 * (torch.div(dim_tx, 2, rounding_mode='floor')) / self.num_pos_feats)
        pos_x = x_embed[:, :, :, None] / dim_tx

        dim_ty = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
        dim_ty = self.temperatureH ** (2 * (torch.div(dim_ty, 2, rounding_mode='floor')) / self.num_pos_feats)
        pos_y = y_embed[:, :, :, None] / dim_ty

        pos_x = torch.stack(
            (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4
        ).flatten(3)
        pos_y = torch.stack(
            (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4
        ).flatten(3)
        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)

        # import ipdb; ipdb.set_trace()

        return pos


class PositionEmbeddingLearned(nn.Module):
    """
    Absolute pos embedding, learned.
    """

    def __init__(self, num_pos_feats=256):
        super().__init__()
        self.row_embed = nn.Embedding(50, num_pos_feats)
        self.col_embed = nn.Embedding(50, num_pos_feats)
        self.reset_parameters()

    def reset_parameters(self):
        nn.init.uniform_(self.row_embed.weight)
        nn.init.uniform_(self.col_embed.weight)

    def forward(self, tensor_list: NestedTensor):
        x = tensor_list.tensors
        h, w = x.shape[-2:]
        i = torch.arange(w, device=x.device)
        j = torch.arange(h, device=x.device)
        x_emb = self.col_embed(i)
        y_emb = self.row_embed(j)
        pos = (
            torch.cat(
                [
                    x_emb.unsqueeze(0).repeat(h, 1, 1),
                    y_emb.unsqueeze(1).repeat(1, w, 1),
                ],
                dim=-1,
            )
            .permute(2, 0, 1)
            .unsqueeze(0)
            .repeat(x.shape[0], 1, 1, 1)
        )
        return pos


def build_position_encoding(args):
    N_steps = args.hidden_dim // 2
    if args.position_embedding in ("v2", "sine"):
        # TODO find a better way of exposing other arguments
        position_embedding = PositionEmbeddingSineHW(
            N_steps,
            temperatureH=args.pe_temperatureH,
            temperatureW=args.pe_temperatureW,
            normalize=True,
        )
    elif args.position_embedding in ("v3", "learned"):
        position_embedding = PositionEmbeddingLearned(N_steps)
    else:
        raise ValueError(f"not supported {args.position_embedding}")

    return position_embedding
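A quick sanity-check sketch of PositionEmbeddingSineHW, assuming the repo is importable (the temperature value of 20 is an assumption borrowed from the shipped configs): with num_pos_feats = hidden_dim // 2 = 128, the returned encoding has 2 * 128 = 256 channels, matching the transformer width.

import torch

from groundingdino.models.GroundingDINO.backbone.position_encoding import PositionEmbeddingSineHW
from groundingdino.util.misc import NestedTensor

pe = PositionEmbeddingSineHW(num_pos_feats=128, temperatureH=20, temperatureW=20, normalize=True)
feat = NestedTensor(
    torch.rand(2, 256, 32, 32),                # dummy backbone feature map
    torch.zeros(2, 32, 32, dtype=torch.bool),  # mask: True marks padded pixels
)
pos = pe(feat)
print(pos.shape)  # torch.Size([2, 256, 32, 32]) -> (B, 2 * num_pos_feats, H, W)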
GroundingDINO/groundingdino/models/GroundingDINO/backbone/swin_transformer.py
ADDED
@@ -0,0 +1,802 @@
1 |
+
# ------------------------------------------------------------------------
|
2 |
+
# Grounding DINO
|
3 |
+
# url: https://github.com/IDEA-Research/GroundingDINO
|
4 |
+
# Copyright (c) 2023 IDEA. All Rights Reserved.
|
5 |
+
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
|
6 |
+
# ------------------------------------------------------------------------
|
7 |
+
# DINO
|
8 |
+
# Copyright (c) 2022 IDEA. All Rights Reserved.
|
9 |
+
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
|
10 |
+
# --------------------------------------------------------
|
11 |
+
# modified from https://github.com/SwinTransformer/Swin-Transformer-Object-Detection/blob/master/mmdet/models/backbones/swin_transformer.py
|
12 |
+
# --------------------------------------------------------
|
13 |
+
|
14 |
+
import numpy as np
|
15 |
+
import torch
|
16 |
+
import torch.nn as nn
|
17 |
+
import torch.nn.functional as F
|
18 |
+
import torch.utils.checkpoint as checkpoint
|
19 |
+
from timm.models.layers import DropPath, to_2tuple, trunc_normal_
|
20 |
+
|
21 |
+
from groundingdino.util.misc import NestedTensor
|
22 |
+
|
23 |
+
|
24 |
+
class Mlp(nn.Module):
|
25 |
+
"""Multilayer perceptron."""
|
26 |
+
|
27 |
+
def __init__(
|
28 |
+
self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.0
|
29 |
+
):
|
30 |
+
super().__init__()
|
31 |
+
out_features = out_features or in_features
|
32 |
+
hidden_features = hidden_features or in_features
|
33 |
+
self.fc1 = nn.Linear(in_features, hidden_features)
|
34 |
+
self.act = act_layer()
|
35 |
+
self.fc2 = nn.Linear(hidden_features, out_features)
|
36 |
+
self.drop = nn.Dropout(drop)
|
37 |
+
|
38 |
+
def forward(self, x):
|
39 |
+
x = self.fc1(x)
|
40 |
+
x = self.act(x)
|
41 |
+
x = self.drop(x)
|
42 |
+
x = self.fc2(x)
|
43 |
+
x = self.drop(x)
|
44 |
+
return x
|
45 |
+
|
46 |
+
|
47 |
+
def window_partition(x, window_size):
|
48 |
+
"""
|
49 |
+
Args:
|
50 |
+
x: (B, H, W, C)
|
51 |
+
window_size (int): window size
|
52 |
+
Returns:
|
53 |
+
windows: (num_windows*B, window_size, window_size, C)
|
54 |
+
"""
|
55 |
+
B, H, W, C = x.shape
|
56 |
+
x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
|
57 |
+
windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
|
58 |
+
return windows
|
59 |
+
|
60 |
+
|
61 |
+
def window_reverse(windows, window_size, H, W):
|
62 |
+
"""
|
63 |
+
Args:
|
64 |
+
windows: (num_windows*B, window_size, window_size, C)
|
65 |
+
window_size (int): Window size
|
66 |
+
H (int): Height of image
|
67 |
+
W (int): Width of image
|
68 |
+
Returns:
|
69 |
+
x: (B, H, W, C)
|
70 |
+
"""
|
71 |
+
B = int(windows.shape[0] / (H * W / window_size / window_size))
|
72 |
+
x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
|
73 |
+
x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
|
74 |
+
return x
|
75 |
+
|
76 |
+
|
77 |
+
class WindowAttention(nn.Module):
|
78 |
+
"""Window based multi-head self attention (W-MSA) module with relative position bias.
|
79 |
+
It supports both of shifted and non-shifted window.
|
80 |
+
Args:
|
81 |
+
dim (int): Number of input channels.
|
82 |
+
window_size (tuple[int]): The height and width of the window.
|
83 |
+
num_heads (int): Number of attention heads.
|
84 |
+
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
|
85 |
+
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
|
86 |
+
attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
|
87 |
+
proj_drop (float, optional): Dropout ratio of output. Default: 0.0
|
88 |
+
"""
|
89 |
+
|
90 |
+
def __init__(
|
91 |
+
self,
|
92 |
+
dim,
|
93 |
+
window_size,
|
94 |
+
num_heads,
|
95 |
+
qkv_bias=True,
|
96 |
+
qk_scale=None,
|
97 |
+
attn_drop=0.0,
|
98 |
+
proj_drop=0.0,
|
99 |
+
):
|
100 |
+
|
101 |
+
super().__init__()
|
102 |
+
self.dim = dim
|
103 |
+
self.window_size = window_size # Wh, Ww
|
104 |
+
self.num_heads = num_heads
|
105 |
+
head_dim = dim // num_heads
|
106 |
+
self.scale = qk_scale or head_dim**-0.5
|
107 |
+
|
108 |
+
# define a parameter table of relative position bias
|
109 |
+
self.relative_position_bias_table = nn.Parameter(
|
110 |
+
torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)
|
111 |
+
) # 2*Wh-1 * 2*Ww-1, nH
|
112 |
+
|
113 |
+
# get pair-wise relative position index for each token inside the window
|
114 |
+
coords_h = torch.arange(self.window_size[0])
|
115 |
+
coords_w = torch.arange(self.window_size[1])
|
116 |
+
coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
|
117 |
+
coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
|
118 |
+
relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww
|
119 |
+
relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
|
120 |
+
relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0
|
121 |
+
relative_coords[:, :, 1] += self.window_size[1] - 1
|
122 |
+
relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
|
123 |
+
relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
|
124 |
+
self.register_buffer("relative_position_index", relative_position_index)
|
125 |
+
|
126 |
+
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
|
127 |
+
self.attn_drop = nn.Dropout(attn_drop)
|
128 |
+
self.proj = nn.Linear(dim, dim)
|
129 |
+
self.proj_drop = nn.Dropout(proj_drop)
|
130 |
+
|
131 |
+
trunc_normal_(self.relative_position_bias_table, std=0.02)
|
132 |
+
self.softmax = nn.Softmax(dim=-1)
|
133 |
+
|
134 |
+
def forward(self, x, mask=None):
|
135 |
+
"""Forward function.
|
136 |
+
Args:
|
137 |
+
x: input features with shape of (num_windows*B, N, C)
|
138 |
+
mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
|
139 |
+
"""
|
140 |
+
B_, N, C = x.shape
|
141 |
+
qkv = (
|
142 |
+
self.qkv(x)
|
143 |
+
.reshape(B_, N, 3, self.num_heads, C // self.num_heads)
|
144 |
+
.permute(2, 0, 3, 1, 4)
|
145 |
+
)
|
146 |
+
q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple)
|
147 |
+
|
148 |
+
q = q * self.scale
|
149 |
+
attn = q @ k.transpose(-2, -1)
|
150 |
+
|
151 |
+
relative_position_bias = self.relative_position_bias_table[
|
152 |
+
self.relative_position_index.view(-1)
|
153 |
+
].view(
|
154 |
+
self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1
|
155 |
+
) # Wh*Ww,Wh*Ww,nH
|
156 |
+
relative_position_bias = relative_position_bias.permute(
|
157 |
+
2, 0, 1
|
158 |
+
).contiguous() # nH, Wh*Ww, Wh*Ww
|
159 |
+
attn = attn + relative_position_bias.unsqueeze(0)
|
160 |
+
|
161 |
+
if mask is not None:
|
162 |
+
nW = mask.shape[0]
|
163 |
+
attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
|
164 |
+
attn = attn.view(-1, self.num_heads, N, N)
|
165 |
+
attn = self.softmax(attn)
|
166 |
+
else:
|
167 |
+
attn = self.softmax(attn)
|
168 |
+
|
169 |
+
attn = self.attn_drop(attn)
|
170 |
+
|
171 |
+
x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
|
172 |
+
x = self.proj(x)
|
173 |
+
x = self.proj_drop(x)
|
174 |
+
return x
|
175 |
+
|
176 |
+
|
177 |
+
class SwinTransformerBlock(nn.Module):
|
178 |
+
"""Swin Transformer Block.
|
179 |
+
Args:
|
180 |
+
dim (int): Number of input channels.
|
181 |
+
num_heads (int): Number of attention heads.
|
182 |
+
window_size (int): Window size.
|
183 |
+
shift_size (int): Shift size for SW-MSA.
|
184 |
+
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
|
185 |
+
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
|
186 |
+
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
|
187 |
+
drop (float, optional): Dropout rate. Default: 0.0
|
188 |
+
attn_drop (float, optional): Attention dropout rate. Default: 0.0
|
189 |
+
drop_path (float, optional): Stochastic depth rate. Default: 0.0
|
190 |
+
act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
|
191 |
+
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
|
192 |
+
"""
|
193 |
+
|
194 |
+
def __init__(
|
195 |
+
self,
|
196 |
+
dim,
|
197 |
+
num_heads,
|
198 |
+
window_size=7,
|
199 |
+
shift_size=0,
|
200 |
+
mlp_ratio=4.0,
|
201 |
+
qkv_bias=True,
|
202 |
+
qk_scale=None,
|
203 |
+
drop=0.0,
|
204 |
+
attn_drop=0.0,
|
205 |
+
drop_path=0.0,
|
206 |
+
act_layer=nn.GELU,
|
207 |
+
norm_layer=nn.LayerNorm,
|
208 |
+
):
|
209 |
+
super().__init__()
|
210 |
+
self.dim = dim
|
211 |
+
self.num_heads = num_heads
|
212 |
+
self.window_size = window_size
|
213 |
+
self.shift_size = shift_size
|
214 |
+
self.mlp_ratio = mlp_ratio
|
215 |
+
assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"
|
216 |
+
|
217 |
+
self.norm1 = norm_layer(dim)
|
218 |
+
self.attn = WindowAttention(
|
219 |
+
dim,
|
220 |
+
window_size=to_2tuple(self.window_size),
|
221 |
+
num_heads=num_heads,
|
222 |
+
qkv_bias=qkv_bias,
|
223 |
+
qk_scale=qk_scale,
|
224 |
+
attn_drop=attn_drop,
|
225 |
+
proj_drop=drop,
|
226 |
+
)
|
227 |
+
|
228 |
+
self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
|
229 |
+
self.norm2 = norm_layer(dim)
|
230 |
+
mlp_hidden_dim = int(dim * mlp_ratio)
|
231 |
+
self.mlp = Mlp(
|
232 |
+
in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop
|
233 |
+
)
|
234 |
+
|
235 |
+
self.H = None
|
236 |
+
self.W = None
|
237 |
+
|
238 |
+
def forward(self, x, mask_matrix):
|
239 |
+
"""Forward function.
|
240 |
+
Args:
|
241 |
+
x: Input feature, tensor size (B, H*W, C).
|
242 |
+
H, W: Spatial resolution of the input feature.
|
243 |
+
mask_matrix: Attention mask for cyclic shift.
|
244 |
+
"""
|
245 |
+
B, L, C = x.shape
|
246 |
+
H, W = self.H, self.W
|
247 |
+
assert L == H * W, "input feature has wrong size"
|
248 |
+
|
249 |
+
shortcut = x
|
250 |
+
x = self.norm1(x)
|
251 |
+
x = x.view(B, H, W, C)
|
252 |
+
|
253 |
+
# pad feature maps to multiples of window size
|
254 |
+
pad_l = pad_t = 0
|
255 |
+
pad_r = (self.window_size - W % self.window_size) % self.window_size
|
256 |
+
pad_b = (self.window_size - H % self.window_size) % self.window_size
|
257 |
+
x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
|
258 |
+
_, Hp, Wp, _ = x.shape
|
259 |
+
|
260 |
+
# cyclic shift
|
261 |
+
if self.shift_size > 0:
|
262 |
+
shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
|
263 |
+
attn_mask = mask_matrix
|
264 |
+
else:
|
265 |
+
shifted_x = x
|
266 |
+
attn_mask = None
|
267 |
+
|
268 |
+
# partition windows
|
269 |
+
x_windows = window_partition(
|
270 |
+
shifted_x, self.window_size
|
271 |
+
) # nW*B, window_size, window_size, C
|
272 |
+
x_windows = x_windows.view(
|
273 |
+
-1, self.window_size * self.window_size, C
|
274 |
+
) # nW*B, window_size*window_size, C
|
275 |
+
|
276 |
+
# W-MSA/SW-MSA
|
277 |
+
attn_windows = self.attn(x_windows, mask=attn_mask) # nW*B, window_size*window_size, C
|
278 |
+
|
279 |
+
# merge windows
|
280 |
+
attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
|
281 |
+
shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp) # B H' W' C
|
282 |
+
|
283 |
+
# reverse cyclic shift
|
284 |
+
if self.shift_size > 0:
|
285 |
+
x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
|
286 |
+
else:
|
287 |
+
x = shifted_x
|
288 |
+
|
289 |
+
if pad_r > 0 or pad_b > 0:
|
290 |
+
x = x[:, :H, :W, :].contiguous()
|
291 |
+
|
292 |
+
x = x.view(B, H * W, C)
|
293 |
+
|
294 |
+
# FFN
|
295 |
+
x = shortcut + self.drop_path(x)
|
296 |
+
x = x + self.drop_path(self.mlp(self.norm2(x)))
|
297 |
+
|
298 |
+
return x
|
299 |
+
|
300 |
+
|
301 |
+
class PatchMerging(nn.Module):
|
302 |
+
"""Patch Merging Layer
|
303 |
+
Args:
|
304 |
+
dim (int): Number of input channels.
|
305 |
+
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
|
306 |
+
"""
|
307 |
+
|
308 |
+
def __init__(self, dim, norm_layer=nn.LayerNorm):
|
309 |
+
super().__init__()
|
310 |
+
self.dim = dim
|
311 |
+
self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
|
312 |
+
self.norm = norm_layer(4 * dim)
|
313 |
+
|
314 |
+
def forward(self, x, H, W):
|
315 |
+
"""Forward function.
|
316 |
+
Args:
|
317 |
+
x: Input feature, tensor size (B, H*W, C).
|
318 |
+
H, W: Spatial resolution of the input feature.
|
319 |
+
"""
|
320 |
+
B, L, C = x.shape
|
321 |
+
assert L == H * W, "input feature has wrong size"
|
322 |
+
|
323 |
+
x = x.view(B, H, W, C)
|
324 |
+
|
325 |
+
# padding
|
326 |
+
pad_input = (H % 2 == 1) or (W % 2 == 1)
|
327 |
+
if pad_input:
|
328 |
+
x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2))
|
329 |
+
|
330 |
+
x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C
|
331 |
+
x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C
|
332 |
+
x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C
|
333 |
+
x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C
|
334 |
+
x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C
|
335 |
+
x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C
|
336 |
+
|
337 |
+
x = self.norm(x)
|
338 |
+
x = self.reduction(x)
|
339 |
+
|
340 |
+
return x
|
341 |
+
|
342 |
+
|
343 |
+
class BasicLayer(nn.Module):
|
344 |
+
"""A basic Swin Transformer layer for one stage.
|
345 |
+
Args:
|
346 |
+
dim (int): Number of feature channels
|
347 |
+
depth (int): Depths of this stage.
|
348 |
+
num_heads (int): Number of attention head.
|
349 |
+
window_size (int): Local window size. Default: 7.
|
350 |
+
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
|
351 |
+
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
|
352 |
+
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
|
353 |
+
drop (float, optional): Dropout rate. Default: 0.0
|
354 |
+
attn_drop (float, optional): Attention dropout rate. Default: 0.0
|
355 |
+
drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
|
356 |
+
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
|
357 |
+
downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
|
358 |
+
use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
|
359 |
+
"""
|
360 |
+
|
361 |
+
def __init__(
|
362 |
+
self,
|
363 |
+
dim,
|
364 |
+
depth,
|
365 |
+
num_heads,
|
366 |
+
window_size=7,
|
367 |
+
mlp_ratio=4.0,
|
368 |
+
qkv_bias=True,
|
369 |
+
qk_scale=None,
|
370 |
+
drop=0.0,
|
371 |
+
attn_drop=0.0,
|
372 |
+
drop_path=0.0,
|
373 |
+
norm_layer=nn.LayerNorm,
|
374 |
+
downsample=None,
|
375 |
+
use_checkpoint=False,
|
376 |
+
):
|
377 |
+
super().__init__()
|
378 |
+
self.window_size = window_size
|
379 |
+
self.shift_size = window_size // 2
|
380 |
+
self.depth = depth
|
381 |
+
self.use_checkpoint = use_checkpoint
|
382 |
+
|
383 |
+
# build blocks
|
384 |
+
self.blocks = nn.ModuleList(
|
385 |
+
[
|
386 |
+
SwinTransformerBlock(
|
387 |
+
dim=dim,
|
388 |
+
num_heads=num_heads,
|
389 |
+
window_size=window_size,
|
390 |
+
shift_size=0 if (i % 2 == 0) else window_size // 2,
|
391 |
+
mlp_ratio=mlp_ratio,
|
392 |
+
qkv_bias=qkv_bias,
|
393 |
+
qk_scale=qk_scale,
|
394 |
+
drop=drop,
|
395 |
+
attn_drop=attn_drop,
|
396 |
+
drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
|
397 |
+
norm_layer=norm_layer,
|
398 |
+
)
|
399 |
+
for i in range(depth)
|
400 |
+
]
|
401 |
+
)
|
402 |
+
|
403 |
+
# patch merging layer
|
404 |
+
if downsample is not None:
|
405 |
+
self.downsample = downsample(dim=dim, norm_layer=norm_layer)
|
406 |
+
else:
|
407 |
+
self.downsample = None
|
408 |
+
|
409 |
+
def forward(self, x, H, W):
|
410 |
+
"""Forward function.
|
411 |
+
Args:
|
412 |
+
x: Input feature, tensor size (B, H*W, C).
|
413 |
+
H, W: Spatial resolution of the input feature.
|
414 |
+
"""
|
415 |
+
|
416 |
+
# calculate attention mask for SW-MSA
|
417 |
+
Hp = int(np.ceil(H / self.window_size)) * self.window_size
|
418 |
+
Wp = int(np.ceil(W / self.window_size)) * self.window_size
|
419 |
+
img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device, dtype=x.dtype) # 1 Hp Wp 1
|
420 |
+
h_slices = (
|
421 |
+
slice(0, -self.window_size),
|
422 |
+
slice(-self.window_size, -self.shift_size),
|
423 |
+
slice(-self.shift_size, None),
|
424 |
+
)
|
425 |
+
w_slices = (
|
426 |
+
slice(0, -self.window_size),
|
427 |
+
slice(-self.window_size, -self.shift_size),
|
428 |
+
slice(-self.shift_size, None),
|
429 |
+
)
|
430 |
+
cnt = 0
|
431 |
+
for h in h_slices:
|
432 |
+
for w in w_slices:
|
433 |
+
img_mask[:, h, w, :] = cnt
|
434 |
+
cnt += 1
|
435 |
+
|
436 |
+
mask_windows = window_partition(
|
437 |
+
img_mask, self.window_size
|
438 |
+
) # nW, window_size, window_size, 1
|
439 |
+
mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
|
440 |
+
attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
|
441 |
+
attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(
|
442 |
+
attn_mask == 0, float(0.0)
|
443 |
+
)
|
444 |
+
|
445 |
+
for blk in self.blocks:
|
446 |
+
blk.H, blk.W = H, W
|
447 |
+
if self.use_checkpoint:
|
448 |
+
x = checkpoint.checkpoint(blk, x, attn_mask)
|
449 |
+
else:
|
450 |
+
x = blk(x, attn_mask)
|
451 |
+
if self.downsample is not None:
|
452 |
+
x_down = self.downsample(x, H, W)
|
453 |
+
Wh, Ww = (H + 1) // 2, (W + 1) // 2
|
454 |
+
return x, H, W, x_down, Wh, Ww
|
455 |
+
else:
|
456 |
+
return x, H, W, x, H, W
|
457 |
+
|
458 |
+
|
459 |
+
class PatchEmbed(nn.Module):
|
460 |
+
"""Image to Patch Embedding
|
461 |
+
Args:
|
462 |
+
patch_size (int): Patch token size. Default: 4.
|
463 |
+
in_chans (int): Number of input image channels. Default: 3.
|
464 |
+
embed_dim (int): Number of linear projection output channels. Default: 96.
|
465 |
+
norm_layer (nn.Module, optional): Normalization layer. Default: None
|
466 |
+
"""
|
467 |
+
|
468 |
+
def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
|
469 |
+
super().__init__()
|
470 |
+
patch_size = to_2tuple(patch_size)
|
471 |
+
self.patch_size = patch_size
|
472 |
+
|
473 |
+
self.in_chans = in_chans
|
474 |
+
self.embed_dim = embed_dim
|
475 |
+
|
476 |
+
self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
|
477 |
+
if norm_layer is not None:
|
478 |
+
self.norm = norm_layer(embed_dim)
|
479 |
+
else:
|
480 |
+
self.norm = None
|
481 |
+
|
482 |
+
def forward(self, x):
|
483 |
+
"""Forward function."""
|
484 |
+
# padding
|
485 |
+
_, _, H, W = x.size()
|
486 |
+
if W % self.patch_size[1] != 0:
|
487 |
+
x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1]))
|
488 |
+
if H % self.patch_size[0] != 0:
|
489 |
+
x = F.pad(x, (0, 0, 0, self.patch_size[0] - H % self.patch_size[0]))
|
490 |
+
|
491 |
+
x = self.proj(x) # B C Wh Ww
|
492 |
+
if self.norm is not None:
|
493 |
+
Wh, Ww = x.size(2), x.size(3)
|
494 |
+
x = x.flatten(2).transpose(1, 2)
|
495 |
+
x = self.norm(x)
|
496 |
+
x = x.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww)
|
497 |
+
|
498 |
+
return x
|
499 |
+
|
500 |
+
|
501 |
+
class SwinTransformer(nn.Module):
|
502 |
+
"""Swin Transformer backbone.
|
503 |
+
A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` -
|
504 |
+
https://arxiv.org/pdf/2103.14030
|
505 |
+
Args:
|
506 |
+
pretrain_img_size (int): Input image size for training the pretrained model,
|
507 |
+
used in absolute postion embedding. Default 224.
|
508 |
+
patch_size (int | tuple(int)): Patch size. Default: 4.
|
509 |
+
in_chans (int): Number of input image channels. Default: 3.
|
510 |
+
embed_dim (int): Number of linear projection output channels. Default: 96.
|
511 |
+
depths (tuple[int]): Depths of each Swin Transformer stage.
|
512 |
+
num_heads (tuple[int]): Number of attention head of each stage.
|
513 |
+
window_size (int): Window size. Default: 7.
|
514 |
+
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
|
515 |
+
qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
|
516 |
+
qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
|
517 |
+
drop_rate (float): Dropout rate.
|
518 |
+
attn_drop_rate (float): Attention dropout rate. Default: 0.
|
519 |
+
drop_path_rate (float): Stochastic depth rate. Default: 0.2.
|
520 |
+
norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
|
521 |
+
ape (bool): If True, add absolute position embedding to the patch embedding. Default: False.
|
522 |
+
patch_norm (bool): If True, add normalization after patch embedding. Default: True.
|
523 |
+
out_indices (Sequence[int]): Output from which stages.
|
524 |
+
frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
|
525 |
+
-1 means not freezing any parameters.
|
526 |
+
use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
|
527 |
+
dilation (bool): if True, the output size if 16x downsample, ow 32x downsample.
|
528 |
+
"""
|
529 |
+
|
530 |
+
def __init__(
|
531 |
+
self,
|
532 |
+
pretrain_img_size=224,
|
533 |
+
patch_size=4,
|
534 |
+
in_chans=3,
|
535 |
+
embed_dim=96,
|
536 |
+
depths=[2, 2, 6, 2],
|
537 |
+
num_heads=[3, 6, 12, 24],
|
538 |
+
window_size=7,
|
539 |
+
mlp_ratio=4.0,
|
540 |
+
qkv_bias=True,
|
541 |
+
qk_scale=None,
|
542 |
+
drop_rate=0.0,
|
543 |
+
attn_drop_rate=0.0,
|
544 |
+
drop_path_rate=0.2,
|
545 |
+
norm_layer=nn.LayerNorm,
|
546 |
+
ape=False,
|
547 |
+
patch_norm=True,
|
548 |
+
out_indices=(0, 1, 2, 3),
|
549 |
+
frozen_stages=-1,
|
550 |
+
dilation=False,
|
551 |
+
use_checkpoint=False,
|
552 |
+
):
|
553 |
+
super().__init__()
|
554 |
+
|
555 |
+
self.pretrain_img_size = pretrain_img_size
|
556 |
+
self.num_layers = len(depths)
|
557 |
+
self.embed_dim = embed_dim
|
558 |
+
self.ape = ape
|
559 |
+
self.patch_norm = patch_norm
|
560 |
+
self.out_indices = out_indices
|
561 |
+
self.frozen_stages = frozen_stages
|
562 |
+
self.dilation = dilation
|
563 |
+
|
564 |
+
# if use_checkpoint:
|
565 |
+
# print("use_checkpoint!!!!!!!!!!!!!!!!!!!!!!!!")
|
566 |
+
|
567 |
+
# split image into non-overlapping patches
|
568 |
+
self.patch_embed = PatchEmbed(
|
569 |
+
patch_size=patch_size,
|
570 |
+
in_chans=in_chans,
|
571 |
+
embed_dim=embed_dim,
|
572 |
+
norm_layer=norm_layer if self.patch_norm else None,
|
573 |
+
)
|
574 |
+
|
575 |
+
# absolute position embedding
|
576 |
+
if self.ape:
|
577 |
+
pretrain_img_size = to_2tuple(pretrain_img_size)
|
578 |
+
patch_size = to_2tuple(patch_size)
|
579 |
+
patches_resolution = [
|
580 |
+
pretrain_img_size[0] // patch_size[0],
|
581 |
+
pretrain_img_size[1] // patch_size[1],
|
582 |
+
]
|
583 |
+
|
584 |
+
self.absolute_pos_embed = nn.Parameter(
|
585 |
+
torch.zeros(1, embed_dim, patches_resolution[0], patches_resolution[1])
|
586 |
+
)
|
587 |
+
trunc_normal_(self.absolute_pos_embed, std=0.02)
|
588 |
+
|
589 |
+
self.pos_drop = nn.Dropout(p=drop_rate)
|
590 |
+
|
591 |
+
# stochastic depth
|
592 |
+
dpr = [
|
593 |
+
x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))
|
594 |
+
] # stochastic depth decay rule
|
595 |
+
|
596 |
+
# build layers
|
597 |
+
self.layers = nn.ModuleList()
|
598 |
+
# prepare downsample list
|
599 |
+
downsamplelist = [PatchMerging for i in range(self.num_layers)]
|
600 |
+
downsamplelist[-1] = None
|
601 |
+
num_features = [int(embed_dim * 2**i) for i in range(self.num_layers)]
|
602 |
+
if self.dilation:
|
603 |
+
downsamplelist[-2] = None
|
604 |
+
num_features[-1] = int(embed_dim * 2 ** (self.num_layers - 1)) // 2
|
605 |
+
for i_layer in range(self.num_layers):
|
606 |
+
layer = BasicLayer(
|
607 |
+
# dim=int(embed_dim * 2 ** i_layer),
|
608 |
+
dim=num_features[i_layer],
|
609 |
+
depth=depths[i_layer],
|
610 |
+
num_heads=num_heads[i_layer],
|
611 |
+
window_size=window_size,
|
612 |
+
mlp_ratio=mlp_ratio,
|
613 |
+
qkv_bias=qkv_bias,
|
614 |
+
qk_scale=qk_scale,
|
615 |
+
drop=drop_rate,
|
616 |
+
attn_drop=attn_drop_rate,
|
617 |
+
drop_path=dpr[sum(depths[:i_layer]) : sum(depths[: i_layer + 1])],
|
618 |
+
norm_layer=norm_layer,
|
619 |
+
# downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
|
620 |
+
downsample=downsamplelist[i_layer],
|
621 |
+
use_checkpoint=use_checkpoint,
|
622 |
+
)
|
623 |
+
self.layers.append(layer)
|
624 |
+
|
625 |
+
# num_features = [int(embed_dim * 2 ** i) for i in range(self.num_layers)]
|
626 |
+
self.num_features = num_features
|
627 |
+
|
628 |
+
# add a norm layer for each output
|
629 |
+
for i_layer in out_indices:
|
630 |
+
layer = norm_layer(num_features[i_layer])
|
631 |
+
layer_name = f"norm{i_layer}"
|
632 |
+
self.add_module(layer_name, layer)
|
633 |
+
|
634 |
+
self._freeze_stages()
|
635 |
+
|
636 |
+
def _freeze_stages(self):
|
637 |
+
if self.frozen_stages >= 0:
|
638 |
+
self.patch_embed.eval()
|
639 |
+
for param in self.patch_embed.parameters():
|
640 |
+
param.requires_grad = False
|
641 |
+
|
642 |
+
if self.frozen_stages >= 1 and self.ape:
|
643 |
+
self.absolute_pos_embed.requires_grad = False
|
644 |
+
|
645 |
+
if self.frozen_stages >= 2:
|
646 |
+
self.pos_drop.eval()
|
647 |
+
for i in range(0, self.frozen_stages - 1):
|
648 |
+
m = self.layers[i]
|
649 |
+
m.eval()
|
650 |
+
for param in m.parameters():
|
651 |
+
param.requires_grad = False
|
652 |
+
|
653 |
+
# def init_weights(self, pretrained=None):
|
654 |
+
# """Initialize the weights in backbone.
|
655 |
+
# Args:
|
656 |
+
# pretrained (str, optional): Path to pre-trained weights.
|
657 |
+
# Defaults to None.
|
658 |
+
# """
|
659 |
+
|
660 |
+
# def _init_weights(m):
|
661 |
+
# if isinstance(m, nn.Linear):
|
662 |
+
# trunc_normal_(m.weight, std=.02)
|
663 |
+
# if isinstance(m, nn.Linear) and m.bias is not None:
|
664 |
+
# nn.init.constant_(m.bias, 0)
|
665 |
+
# elif isinstance(m, nn.LayerNorm):
|
666 |
+
# nn.init.constant_(m.bias, 0)
|
667 |
+
# nn.init.constant_(m.weight, 1.0)
|
668 |
+
|
669 |
+
# if isinstance(pretrained, str):
|
670 |
+
# self.apply(_init_weights)
|
671 |
+
# logger = get_root_logger()
|
672 |
+
# load_checkpoint(self, pretrained, strict=False, logger=logger)
|
673 |
+
# elif pretrained is None:
|
674 |
+
# self.apply(_init_weights)
|
675 |
+
# else:
|
676 |
+
# raise TypeError('pretrained must be a str or None')
|
677 |
+
|
678 |
+
def forward_raw(self, x):
|
679 |
+
"""Forward function."""
|
680 |
+
x = self.patch_embed(x)
|
681 |
+
|
682 |
+
Wh, Ww = x.size(2), x.size(3)
|
683 |
+
if self.ape:
|
684 |
+
# interpolate the position embedding to the corresponding size
|
685 |
+
absolute_pos_embed = F.interpolate(
|
686 |
+
self.absolute_pos_embed, size=(Wh, Ww), mode="bicubic"
|
687 |
+
)
|
688 |
+
x = (x + absolute_pos_embed).flatten(2).transpose(1, 2) # B Wh*Ww C
|
689 |
+
else:
|
690 |
+
x = x.flatten(2).transpose(1, 2)
|
691 |
+
x = self.pos_drop(x)
|
692 |
+
|
693 |
+
outs = []
|
694 |
+
for i in range(self.num_layers):
|
695 |
+
layer = self.layers[i]
|
696 |
+
x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)
|
697 |
+
# import ipdb; ipdb.set_trace()
|
698 |
+
|
699 |
+
if i in self.out_indices:
|
700 |
+
norm_layer = getattr(self, f"norm{i}")
|
701 |
+
x_out = norm_layer(x_out)
|
702 |
+
|
703 |
+
out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous()
|
704 |
+
outs.append(out)
|
705 |
+
# in:
|
706 |
+
# torch.Size([2, 3, 1024, 1024])
|
707 |
+
# outs:
|
708 |
+
# [torch.Size([2, 192, 256, 256]), torch.Size([2, 384, 128, 128]), \
|
709 |
+
# torch.Size([2, 768, 64, 64]), torch.Size([2, 1536, 32, 32])]
|
710 |
+
return tuple(outs)
|
711 |
+
|
712 |
+
def forward(self, tensor_list: NestedTensor):
|
713 |
+
x = tensor_list.tensors
|
714 |
+
|
715 |
+
"""Forward function."""
|
716 |
+
x = self.patch_embed(x)
|
717 |
+
|
718 |
+
Wh, Ww = x.size(2), x.size(3)
|
719 |
+
if self.ape:
|
720 |
+
# interpolate the position embedding to the corresponding size
|
721 |
+
absolute_pos_embed = F.interpolate(
|
722 |
+
self.absolute_pos_embed, size=(Wh, Ww), mode="bicubic"
|
723 |
+
)
|
724 |
+
x = (x + absolute_pos_embed).flatten(2).transpose(1, 2) # B Wh*Ww C
|
725 |
+
else:
|
726 |
+
x = x.flatten(2).transpose(1, 2)
|
727 |
+
x = self.pos_drop(x)
|
728 |
+
|
729 |
+
outs = []
|
730 |
+
for i in range(self.num_layers):
|
731 |
+
layer = self.layers[i]
|
732 |
+
x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)
|
733 |
+
|
734 |
+
if i in self.out_indices:
|
735 |
+
norm_layer = getattr(self, f"norm{i}")
|
736 |
+
x_out = norm_layer(x_out)
|
737 |
+
|
738 |
+
out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous()
|
739 |
+
outs.append(out)
|
740 |
+
# in:
|
741 |
+
# torch.Size([2, 3, 1024, 1024])
|
742 |
+
# out:
|
743 |
+
# [torch.Size([2, 192, 256, 256]), torch.Size([2, 384, 128, 128]), \
|
744 |
+
# torch.Size([2, 768, 64, 64]), torch.Size([2, 1536, 32, 32])]
|
745 |
+
|
746 |
+
# collect for nesttensors
|
747 |
+
outs_dict = {}
|
748 |
+
for idx, out_i in enumerate(outs):
|
749 |
+
m = tensor_list.mask
|
750 |
+
assert m is not None
|
751 |
+
mask = F.interpolate(m[None].float(), size=out_i.shape[-2:]).to(torch.bool)[0]
|
752 |
+
outs_dict[idx] = NestedTensor(out_i, mask)
|
753 |
+
|
754 |
+
return outs_dict
|
755 |
+
|
756 |
+
def train(self, mode=True):
|
757 |
+
"""Convert the model into training mode while keep layers freezed."""
|
758 |
+
super(SwinTransformer, self).train(mode)
|
759 |
+
self._freeze_stages()
|
760 |
+
|
761 |
+
|
762 |
+
def build_swin_transformer(modelname, pretrain_img_size, **kw):
|
763 |
+
assert modelname in [
|
764 |
+
"swin_T_224_1k",
|
765 |
+
"swin_B_224_22k",
|
766 |
+
"swin_B_384_22k",
|
767 |
+
"swin_L_224_22k",
|
768 |
+
"swin_L_384_22k",
|
769 |
+
]
|
770 |
+
|
771 |
+
model_para_dict = {
|
772 |
+
"swin_T_224_1k": dict(
|
773 |
+
embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24], window_size=7
|
774 |
+
),
|
775 |
+
"swin_B_224_22k": dict(
|
776 |
+
embed_dim=128, depths=[2, 2, 18, 2], num_heads=[4, 8, 16, 32], window_size=7
|
777 |
+
),
|
778 |
+
"swin_B_384_22k": dict(
|
779 |
+
embed_dim=128, depths=[2, 2, 18, 2], num_heads=[4, 8, 16, 32], window_size=12
|
780 |
+
),
|
781 |
+
"swin_L_224_22k": dict(
|
782 |
+
embed_dim=192, depths=[2, 2, 18, 2], num_heads=[6, 12, 24, 48], window_size=7
|
783 |
+
),
|
784 |
+
"swin_L_384_22k": dict(
|
785 |
+
embed_dim=192, depths=[2, 2, 18, 2], num_heads=[6, 12, 24, 48], window_size=12
|
786 |
+
),
|
787 |
+
}
|
788 |
+
kw_cgf = model_para_dict[modelname]
|
789 |
+
kw_cgf.update(kw)
|
790 |
+
model = SwinTransformer(pretrain_img_size=pretrain_img_size, **kw_cgf)
|
791 |
+
return model
|
792 |
+
|
793 |
+
|
794 |
+
if __name__ == "__main__":
|
795 |
+
model = build_swin_transformer("swin_L_384_22k", 384, dilation=True)
|
796 |
+
x = torch.rand(2, 3, 1024, 1024)
|
797 |
+
y = model.forward_raw(x)
|
798 |
+
import ipdb
|
799 |
+
|
800 |
+
ipdb.set_trace()
|
801 |
+
x = torch.rand(2, 3, 384, 384)
|
802 |
+
y = model.forward_raw(x)
|
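As a rough usage sketch for the Swin backbone above, complementary to the __main__ block in the file (the tiny variant and the 224-pixel input used here are arbitrary choices, not values from the original): forward() consumes a NestedTensor and returns one NestedTensor feature map per entry in out_indices.

import torch

from groundingdino.models.GroundingDINO.backbone.swin_transformer import build_swin_transformer
from groundingdino.util.misc import NestedTensor

model = build_swin_transformer("swin_T_224_1k", 224, out_indices=(1, 2, 3))
samples = NestedTensor(
    torch.rand(1, 3, 224, 224),
    torch.zeros(1, 224, 224, dtype=torch.bool),  # mask: True marks padded pixels
)
outs = model(samples)
for idx, nested in outs.items():
    print(idx, nested.tensors.shape)  # strides 8/16/32 with 192/384/768 channels for swin_T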
GroundingDINO/groundingdino/models/GroundingDINO/bertwarper.py
ADDED
@@ -0,0 +1,273 @@
# ------------------------------------------------------------------------
# Grounding DINO
# url: https://github.com/IDEA-Research/GroundingDINO
# Copyright (c) 2023 IDEA. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------

import torch
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
from torch import Tensor, nn
from torchvision.ops.boxes import nms
from transformers import BertConfig, BertModel, BertPreTrainedModel
from transformers.modeling_outputs import BaseModelOutputWithPoolingAndCrossAttentions


class BertModelWarper(nn.Module):
    def __init__(self, bert_model):
        super().__init__()
        # self.bert = bert_modelc

        self.config = bert_model.config
        self.embeddings = bert_model.embeddings
        self.encoder = bert_model.encoder
        self.pooler = bert_model.pooler

        self.get_extended_attention_mask = bert_model.get_extended_attention_mask
        self.invert_attention_mask = bert_model.invert_attention_mask
        self.get_head_mask = bert_model.get_head_mask

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.

            If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
            (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
            instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
        use_cache (:obj:`bool`, `optional`):
            If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
            decoding (see :obj:`past_key_values`).
        """
        output_attentions = (
            output_attentions if output_attentions is not None else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if self.config.is_decoder:
            use_cache = use_cache if use_cache is not None else self.config.use_cache
        else:
            use_cache = False

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = input_ids.size()
            batch_size, seq_length = input_shape
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            batch_size, seq_length = input_shape
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        device = input_ids.device if input_ids is not None else inputs_embeds.device

        # past_key_values_length
        past_key_values_length = (
            past_key_values[0][0].shape[2] if past_key_values is not None else 0
        )

        if attention_mask is None:
            attention_mask = torch.ones(
                ((batch_size, seq_length + past_key_values_length)), device=device
            )
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
        # ourselves in which case we just need to make it broadcastable to all heads.
        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(
            attention_mask, input_shape, device
        )

        # If a 2D or 3D attention mask is provided for the cross-attention
        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if self.config.is_decoder and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None
        # if os.environ.get('IPDB_SHILONG_DEBUG', None) == 'INFO':
        #     import ipdb; ipdb.set_trace()

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            past_key_values_length=past_key_values_length,
        )

        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            past_key_values=encoder_outputs.past_key_values,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            cross_attentions=encoder_outputs.cross_attentions,
        )


class TextEncoderShell(nn.Module):
    def __init__(self, text_encoder):
        super().__init__()
        self.text_encoder = text_encoder
        self.config = self.text_encoder.config

    def forward(self, **kw):
        # feed into text encoder
        return self.text_encoder(**kw)


def generate_masks_with_special_tokens(tokenized, special_tokens_list, tokenizer):
|
181 |
+
"""Generate attention mask between each pair of special tokens
|
182 |
+
Args:
|
183 |
+
input_ids (torch.Tensor): input ids. Shape: [bs, num_token]
|
184 |
+
special_tokens_mask (list): special tokens mask.
|
185 |
+
Returns:
|
186 |
+
torch.Tensor: attention mask between each special tokens.
|
187 |
+
"""
|
188 |
+
input_ids = tokenized["input_ids"]
|
189 |
+
bs, num_token = input_ids.shape
|
190 |
+
# special_tokens_mask: bs, num_token. 1 for special tokens. 0 for normal tokens
|
191 |
+
special_tokens_mask = torch.zeros((bs, num_token), device=input_ids.device).bool()
|
192 |
+
for special_token in special_tokens_list:
|
193 |
+
special_tokens_mask |= input_ids == special_token
|
194 |
+
|
195 |
+
# idxs: each row is a list of indices of special tokens
|
196 |
+
idxs = torch.nonzero(special_tokens_mask)
|
197 |
+
|
198 |
+
# generate attention mask and positional ids
|
199 |
+
attention_mask = (
|
200 |
+
torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(bs, 1, 1)
|
201 |
+
)
|
202 |
+
position_ids = torch.zeros((bs, num_token), device=input_ids.device)
|
203 |
+
previous_col = 0
|
204 |
+
for i in range(idxs.shape[0]):
|
205 |
+
row, col = idxs[i]
|
206 |
+
if (col == 0) or (col == num_token - 1):
|
207 |
+
attention_mask[row, col, col] = True
|
208 |
+
position_ids[row, col] = 0
|
209 |
+
else:
|
210 |
+
attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True
|
211 |
+
position_ids[row, previous_col + 1 : col + 1] = torch.arange(
|
212 |
+
0, col - previous_col, device=input_ids.device
|
213 |
+
)
|
214 |
+
|
215 |
+
previous_col = col
|
216 |
+
|
217 |
+
# # padding mask
|
218 |
+
# padding_mask = tokenized['attention_mask']
|
219 |
+
# attention_mask = attention_mask & padding_mask.unsqueeze(1).bool() & padding_mask.unsqueeze(2).bool()
|
220 |
+
|
221 |
+
return attention_mask, position_ids.to(torch.long)
|
222 |
+
|
223 |
+
|
224 |
+
def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_tokens_list, tokenizer):
|
225 |
+
"""Generate attention mask between each pair of special tokens
|
226 |
+
Args:
|
227 |
+
input_ids (torch.Tensor): input ids. Shape: [bs, num_token]
|
228 |
+
special_tokens_mask (list): special tokens mask.
|
229 |
+
Returns:
|
230 |
+
torch.Tensor: attention mask between each special tokens.
|
231 |
+
"""
|
232 |
+
input_ids = tokenized["input_ids"]
|
233 |
+
bs, num_token = input_ids.shape
|
234 |
+
# special_tokens_mask: bs, num_token. 1 for special tokens. 0 for normal tokens
|
235 |
+
special_tokens_mask = torch.zeros((bs, num_token), device=input_ids.device).bool()
|
236 |
+
for special_token in special_tokens_list:
|
237 |
+
special_tokens_mask |= input_ids == special_token
|
238 |
+
|
239 |
+
# idxs: each row is a list of indices of special tokens
|
240 |
+
idxs = torch.nonzero(special_tokens_mask)
|
241 |
+
|
242 |
+
# generate attention mask and positional ids
|
243 |
+
attention_mask = (
|
244 |
+
torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(bs, 1, 1)
|
245 |
+
)
|
246 |
+
position_ids = torch.zeros((bs, num_token), device=input_ids.device)
|
247 |
+
cate_to_token_mask_list = [[] for _ in range(bs)]
|
248 |
+
previous_col = 0
|
249 |
+
for i in range(idxs.shape[0]):
|
250 |
+
row, col = idxs[i]
|
251 |
+
if (col == 0) or (col == num_token - 1):
|
252 |
+
attention_mask[row, col, col] = True
|
253 |
+
position_ids[row, col] = 0
|
254 |
+
else:
|
255 |
+
attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True
|
256 |
+
position_ids[row, previous_col + 1 : col + 1] = torch.arange(
|
257 |
+
0, col - previous_col, device=input_ids.device
|
258 |
+
)
|
259 |
+
c2t_maski = torch.zeros((num_token), device=input_ids.device).bool()
|
260 |
+
c2t_maski[previous_col + 1 : col] = True
|
261 |
+
cate_to_token_mask_list[row].append(c2t_maski)
|
262 |
+
previous_col = col
|
263 |
+
|
264 |
+
cate_to_token_mask_list = [
|
265 |
+
torch.stack(cate_to_token_mask_listi, dim=0)
|
266 |
+
for cate_to_token_mask_listi in cate_to_token_mask_list
|
267 |
+
]
|
268 |
+
|
269 |
+
# # padding mask
|
270 |
+
# padding_mask = tokenized['attention_mask']
|
271 |
+
# attention_mask = attention_mask & padding_mask.unsqueeze(1).bool() & padding_mask.unsqueeze(2).bool()
|
272 |
+
|
273 |
+
return attention_mask, position_ids.to(torch.long), cate_to_token_mask_list
|
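Editor's note: a short usage sketch of the two text-mask helpers defined in `bertwarper.py` above may help when skimming the diff. The snippet is illustrative only; the BERT checkpoint name and the delimiter-token set (`[CLS]`, `[SEP]`, `.`, `?`) are assumptions that match common Grounding DINO usage and are not pinned down by this file.

```python
# Illustrative sketch (not part of the upload): exercise the mask generator above
# with a hypothetical two-phrase caption and a standard BERT tokenizer.
from transformers import AutoTokenizer

from groundingdino.models.GroundingDINO.bertwarper import (
    generate_masks_with_special_tokens_and_transfer_map,
)

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # assumed text backbone
tokenized = tokenizer(["cat . dog ."], return_tensors="pt")

# Assumed delimiter set: phrases are separated by these special tokens.
special_tokens = tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]", ".", "?"])

text_self_attention_masks, position_ids, cate_to_token_mask_list = (
    generate_masks_with_special_tokens_and_transfer_map(tokenized, special_tokens, tokenizer)
)

print(text_self_attention_masks.shape)   # [1, num_token, num_token], block-diagonal per phrase
print(position_ids[0])                   # position ids restart at 0 inside each phrase segment
print(cate_to_token_mask_list[0].shape)  # [num_phrases, num_token] phrase-to-token map
```

Downstream, this block-diagonal mask replaces full self-attention inside the text encoder so that tokens belonging to different phrases do not attend to each other.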
GroundingDINO/groundingdino/models/GroundingDINO/csrc/MsDeformAttn/ms_deform_attn.h
ADDED
@@ -0,0 +1,64 @@
+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+
+#pragma once
+
+#include "ms_deform_attn_cpu.h"
+
+#ifdef WITH_CUDA
+#include "ms_deform_attn_cuda.h"
+#endif
+
+namespace groundingdino {
+
+at::Tensor
+ms_deform_attn_forward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const int im2col_step)
+{
+    if (value.type().is_cuda())
+    {
+#ifdef WITH_CUDA
+        return ms_deform_attn_cuda_forward(
+            value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step);
+#else
+        AT_ERROR("Not compiled with GPU support");
+#endif
+    }
+    AT_ERROR("Not implemented on the CPU");
+}
+
+std::vector<at::Tensor>
+ms_deform_attn_backward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const at::Tensor &grad_output,
+    const int im2col_step)
+{
+    if (value.type().is_cuda())
+    {
+#ifdef WITH_CUDA
+        return ms_deform_attn_cuda_backward(
+            value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step);
+#else
+        AT_ERROR("Not compiled with GPU support");
+#endif
+    }
+    AT_ERROR("Not implemented on the CPU");
+}
+
+} // namespace groundingdino
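Editor's note: the header above only declares the C++ entry points and routes CUDA tensors to the GPU implementation; there is no real CPU path. On the Python side such a forward/backward pair is normally wrapped in a `torch.autograd.Function`. A minimal sketch of that wrapper is shown below; the compiled-extension module name (`groundingdino._C`) is an assumption for illustration, and only `ms_deform_attn_forward` / `ms_deform_attn_backward` come from this header.

```python
# Minimal autograd wrapper sketch around the extension entry points declared above.
from torch.autograd import Function

try:
    from groundingdino import _C  # assumed name of the compiled extension module
except ImportError:
    _C = None  # extension not built


class MSDeformAttnFunction(Function):
    @staticmethod
    def forward(ctx, value, spatial_shapes, level_start_index,
                sampling_loc, attn_weight, im2col_step):
        ctx.im2col_step = im2col_step
        output = _C.ms_deform_attn_forward(
            value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step
        )
        ctx.save_for_backward(value, spatial_shapes, level_start_index, sampling_loc, attn_weight)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        value, spatial_shapes, level_start_index, sampling_loc, attn_weight = ctx.saved_tensors
        grad_value, grad_sampling_loc, grad_attn_weight = _C.ms_deform_attn_backward(
            value, spatial_shapes, level_start_index, sampling_loc, attn_weight,
            grad_output.contiguous(), ctx.im2col_step
        )
        # no gradients for the index tensors or the integer im2col_step argument
        return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None
```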
GroundingDINO/groundingdino/models/GroundingDINO/csrc/MsDeformAttn/ms_deform_attn_cpu.cpp
ADDED
@@ -0,0 +1,43 @@
+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+
+#include <vector>
+
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+
+namespace groundingdino {
+
+at::Tensor
+ms_deform_attn_cpu_forward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const int im2col_step)
+{
+    AT_ERROR("Not implement on cpu");
+}
+
+std::vector<at::Tensor>
+ms_deform_attn_cpu_backward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const at::Tensor &grad_output,
+    const int im2col_step)
+{
+    AT_ERROR("Not implement on cpu");
+}
+
+} // namespace groundingdino
GroundingDINO/groundingdino/models/GroundingDINO/csrc/MsDeformAttn/ms_deform_attn_cpu.h
ADDED
@@ -0,0 +1,35 @@
+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+
+#pragma once
+#include <torch/extension.h>
+
+namespace groundingdino {
+
+at::Tensor
+ms_deform_attn_cpu_forward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const int im2col_step);
+
+std::vector<at::Tensor>
+ms_deform_attn_cpu_backward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const at::Tensor &grad_output,
+    const int im2col_step);
+
+} // namespace groundingdino
GroundingDINO/groundingdino/models/GroundingDINO/csrc/MsDeformAttn/ms_deform_attn_cuda.cu
ADDED
@@ -0,0 +1,156 @@
+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+
+#include <vector>
+#include "ms_deform_im2col_cuda.cuh"
+
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+namespace groundingdino {
+
+at::Tensor ms_deform_attn_cuda_forward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const int im2col_step)
+{
+    AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
+    AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
+    AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
+    AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
+    AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
+
+    AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
+    AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
+    AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
+    AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
+    AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
+
+    const int batch = value.size(0);
+    const int spatial_size = value.size(1);
+    const int num_heads = value.size(2);
+    const int channels = value.size(3);
+
+    const int num_levels = spatial_shapes.size(0);
+
+    const int num_query = sampling_loc.size(1);
+    const int num_point = sampling_loc.size(4);
+
+    const int im2col_step_ = std::min(batch, im2col_step);
+
+    AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
+
+    auto output = at::zeros({batch, num_query, num_heads, channels}, value.options());
+
+    const int batch_n = im2col_step_;
+    auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
+    auto per_value_size = spatial_size * num_heads * channels;
+    auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
+    auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
+    for (int n = 0; n < batch/im2col_step_; ++n)
+    {
+        auto columns = output_n.select(0, n);
+        AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_forward_cuda", ([&] {
+            ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(),
+                value.data<scalar_t>() + n * im2col_step_ * per_value_size,
+                spatial_shapes.data<int64_t>(),
+                level_start_index.data<int64_t>(),
+                sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
+                attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
+                batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
+                columns.data<scalar_t>());
+
+        }));
+    }
+
+    output = output.view({batch, num_query, num_heads*channels});
+
+    return output;
+}
+
+
+std::vector<at::Tensor> ms_deform_attn_cuda_backward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const at::Tensor &grad_output,
+    const int im2col_step)
+{
+
+    AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
+    AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
+    AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
+    AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
+    AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
+    AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous");
+
+    AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
+    AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
+    AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
+    AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
+    AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
+    AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor");
+
+    const int batch = value.size(0);
+    const int spatial_size = value.size(1);
+    const int num_heads = value.size(2);
+    const int channels = value.size(3);
+
+    const int num_levels = spatial_shapes.size(0);
+
+    const int num_query = sampling_loc.size(1);
+    const int num_point = sampling_loc.size(4);
+
+    const int im2col_step_ = std::min(batch, im2col_step);
+
+    AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
+
+    auto grad_value = at::zeros_like(value);
+    auto grad_sampling_loc = at::zeros_like(sampling_loc);
+    auto grad_attn_weight = at::zeros_like(attn_weight);
+
+    const int batch_n = im2col_step_;
+    auto per_value_size = spatial_size * num_heads * channels;
+    auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
+    auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
+    auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
+
+    for (int n = 0; n < batch/im2col_step_; ++n)
+    {
+        auto grad_output_g = grad_output_n.select(0, n);
+        AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_backward_cuda", ([&] {
+            ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(),
+                grad_output_g.data<scalar_t>(),
+                value.data<scalar_t>() + n * im2col_step_ * per_value_size,
+                spatial_shapes.data<int64_t>(),
+                level_start_index.data<int64_t>(),
+                sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
+                attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
+                batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
+                grad_value.data<scalar_t>() + n * im2col_step_ * per_value_size,
+                grad_sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
+                grad_attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size);
+
+        }));
+    }
+
+    return {
+        grad_value, grad_sampling_loc, grad_attn_weight
+    };
+}
+
+} // namespace groundingdino
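Editor's note: the `.cu` file above batches the multi-scale deformable attention computation in chunks of `im2col_step` samples and dispatches into `ms_deformable_im2col_cuda` / `ms_deformable_col2im_cuda`. For environments without a CUDA build (the CPU entry points simply raise), the pure-PyTorch sketch below computes the same forward pass with `grid_sample`. It mirrors the debug reference that Deformable DETR style code bases ship; it is an illustration under those assumptions, not this extension's own fallback.

```python
# Pure-PyTorch reference for the CUDA forward above (conceptual sketch).
# Shapes follow the kernel arguments: value [N, S, H, C], spatial_shapes [L, 2],
# sampling_loc [N, Q, H, L, P, 2] in [0, 1], attn_weight [N, Q, H, L, P].
import torch
import torch.nn.functional as F


def ms_deform_attn_reference(value, spatial_shapes, sampling_locations, attention_weights):
    N, S, H, C = value.shape
    _, Q, _, L, P, _ = sampling_locations.shape
    # split the flattened value tensor back into per-level feature maps
    value_list = value.split([int(h * w) for h, w in spatial_shapes], dim=1)
    # map sampling locations from [0, 1] to grid_sample's [-1, 1] range
    sampling_grids = 2 * sampling_locations - 1
    sampled = []
    for lvl, (h, w) in enumerate(spatial_shapes):
        # [N, h*w, H, C] -> [N*H, C, h, w]
        value_l = value_list[lvl].flatten(2).transpose(1, 2).reshape(N * H, C, int(h), int(w))
        # [N, Q, H, P, 2] -> [N*H, Q, P, 2]
        grid_l = sampling_grids[:, :, :, lvl].transpose(1, 2).flatten(0, 1)
        # bilinear sampling of P points per query: [N*H, C, Q, P]
        sampled.append(F.grid_sample(value_l, grid_l, mode="bilinear",
                                     padding_mode="zeros", align_corners=False))
    # weight the L*P sampled values per query and sum them
    attention_weights = attention_weights.transpose(1, 2).reshape(N * H, 1, Q, L * P)
    output = (torch.stack(sampled, dim=-2).flatten(-2) * attention_weights).sum(-1)
    return output.view(N, H * C, Q).transpose(1, 2).contiguous()  # [N, Q, H*C]
```

Comparing this reference (output shape `[N, Q, H*C]`, matching the kernel's reshaped output) against the compiled op on small random inputs is a convenient way to sanity-check a local CUDA build.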