Commit 2947428
Author: Vincentqyw
Parent(s): a9f1fc6
Message: fix:roma

Files changed:
- common/utils.py (+4 -4)
- third_party/Roma/roma/models/encoders.py (+33 -81)
- third_party/Roma/roma/models/matcher.py (+145 -267)
common/utils.py
CHANGED
@@ -49,7 +49,7 @@ def gen_examples():
        "topicfm",
        "superpoint+superglue",
        "disk+dualsoftmax",
-        "
+        "roma",
    ]

    def gen_images_pairs(path: str, count: int = 5):
@@ -452,12 +452,11 @@ ransac_zoo = {
 
 # Matchers collections
 matcher_zoo = {
-    "gluestick": {"config": match_dense.confs["gluestick"], "dense": True},
-    "sold2": {"config": match_dense.confs["sold2"], "dense": True},
     # 'dedode-sparse': {
     #     'config': match_dense.confs['dedode_sparse'],
     #     'dense': True  # dense mode, we need 2 images
     # },
+    "roma": {"config": match_dense.confs["roma"], "dense": True},
     "loftr": {"config": match_dense.confs["loftr"], "dense": True},
     "topicfm": {"config": match_dense.confs["topicfm"], "dense": True},
     "aspanformer": {"config": match_dense.confs["aspanformer"], "dense": True},
@@ -556,6 +555,7 @@ matcher_zoo = {
         "config_feature": extract_features.confs["sift"],
         "dense": False,
     },
-
+    "gluestick": {"config": match_dense.confs["gluestick"], "dense": True},
+    "sold2": {"config": match_dense.confs["sold2"], "dense": True},
     # "DKMv3": {"config": match_dense.confs["dkm"], "dense": True},
 }
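As context for the hunks above: the commit adds "roma" both to the examples list in gen_examples() and to matcher_zoo, and moves the "gluestick" and "sold2" entries to the end of the dict. A minimal sketch of the lookup pattern such an entry supports; run_matcher is a hypothetical helper written only for illustration, not code from this repo:

# Sketch (hypothetical): resolving an entry such as matcher_zoo["roma"].
# matcher_zoo and match_dense.confs exist in this repo; run_matcher does not.
def run_matcher(name: str, zoo: dict):
    entry = zoo[name]              # e.g. zoo["roma"]
    conf = entry["config"]         # a config dict taken from match_dense.confs
    if entry["dense"]:
        # dense matchers (roma, loftr, ...) consume the image pair directly
        return ("dense", conf)
    # sparse entries also carry entry["config_feature"] for feature extraction
    return ("sparse", conf, entry["config_feature"])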
third_party/Roma/roma/models/encoders.py
CHANGED
@@ -6,59 +6,37 @@ import torch.nn.functional as F
 import torchvision.models as tvm
 import gc
 
-device = "cuda" if torch.cuda.is_available() else "cpu"
-
 
 class ResNet50(nn.Module):
-    def __init__(
-        self,
-        pretrained=False,
-        high_res=False,
-        weights=None,
-        dilation=None,
-        freeze_bn=True,
-        anti_aliased=False,
-        early_exit=False,
-        amp=False,
-    ) -> None:
+    def __init__(self, pretrained=False, high_res = False, weights = None, dilation = None, freeze_bn = True, anti_aliased = False, early_exit = False, amp = False) -> None:
         super().__init__()
         if dilation is None:
-            dilation = [False, False, False]
+            dilation = [False,False,False]
         if anti_aliased:
             pass
         else:
             if weights is not None:
-                self.net = tvm.resnet50(
-                    weights=weights, replace_stride_with_dilation=dilation
-                )
+                self.net = tvm.resnet50(weights = weights,replace_stride_with_dilation=dilation)
             else:
-                self.net = tvm.resnet50(
-                    pretrained=pretrained, replace_stride_with_dilation=dilation
-                )
-
+                self.net = tvm.resnet50(pretrained=pretrained,replace_stride_with_dilation=dilation)
+
         self.high_res = high_res
         self.freeze_bn = freeze_bn
         self.early_exit = early_exit
         self.amp = amp
-        if torch.cuda.is_available():
-            if torch.cuda.is_bf16_supported():
-                self.amp_dtype = torch.bfloat16
-            else:
-                self.amp_dtype = torch.float16
-        else:
-            self.amp_dtype = torch.float32
+        self.amp_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
 
     def forward(self, x, **kwargs):
-        with torch.autocast(
+        with torch.autocast("cuda", enabled=self.amp, dtype = self.amp_dtype):
             net = self.net
-            feats = {1: x}
+            feats = {1:x}
             x = net.conv1(x)
             x = net.bn1(x)
             x = net.relu(x)
-            feats[2] = x
+            feats[2] = x
             x = net.maxpool(x)
             x = net.layer1(x)
-            feats[4] = x
+            feats[4] = x
             x = net.layer2(x)
             feats[8] = x
             if self.early_exit:
@@ -77,48 +55,35 @@ class ResNet50(nn.Module):
                 m.eval()
         pass
 
-
 class VGG19(nn.Module):
-    def __init__(self, pretrained=False, amp=False) -> None:
+    def __init__(self, pretrained=False, amp = False) -> None:
         super().__init__()
         self.layers = nn.ModuleList(tvm.vgg19_bn(pretrained=pretrained).features[:40])
         self.amp = amp
-        if torch.cuda.is_available():
-            if torch.cuda.is_bf16_supported():
-                self.amp_dtype = torch.bfloat16
-            else:
-                self.amp_dtype = torch.float16
-        else:
-            self.amp_dtype = torch.float32
+        self.amp_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
 
     def forward(self, x, **kwargs):
-        with torch.autocast(
+        with torch.autocast("cuda", enabled=self.amp, dtype = self.amp_dtype):
             feats = {}
             scale = 1
             for layer in self.layers:
                 if isinstance(layer, nn.MaxPool2d):
                     feats[scale] = x
-                    scale = scale * 2
+                    scale = scale*2
                 x = layer(x)
             return feats
 
-
 class CNNandDinov2(nn.Module):
-    def __init__(self, cnn_kwargs=None, amp=False, use_vgg=False, dinov2_weights=None):
+    def __init__(self, cnn_kwargs = None, amp = False, use_vgg = False, dinov2_weights = None):
         super().__init__()
         if dinov2_weights is None:
-            dinov2_weights = torch.hub.load_state_dict_from_url(
-                "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_pretrain.pth",
-                map_location="cpu",
-            )
+            dinov2_weights = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_pretrain.pth", map_location="cpu")
         from .transformer import vit_large
-
-        vit_kwargs = dict(
-            img_size=518,
-            patch_size=14,
-            init_values=1.0,
-            ffn_layer="mlp",
-            block_chunks=0,
+        vit_kwargs = dict(img_size= 518,
+            patch_size= 14,
+            init_values = 1.0,
+            ffn_layer = "mlp",
+            block_chunks = 0,
         )
 
         dinov2_vitl14 = vit_large(**vit_kwargs).eval()
@@ -129,38 +94,25 @@ class CNNandDinov2(nn.Module):
         else:
             self.cnn = VGG19(**cnn_kwargs)
         self.amp = amp
-        if torch.cuda.is_available():
-            if torch.cuda.is_bf16_supported():
-                self.amp_dtype = torch.bfloat16
-            else:
-                self.amp_dtype = torch.float16
-        else:
-            self.amp_dtype = torch.float32
+        self.amp_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
         if self.amp:
             dinov2_vitl14 = dinov2_vitl14.to(self.amp_dtype)
-        self.dinov2_vitl14 = [dinov2_vitl14]
-
+        self.dinov2_vitl14 = [dinov2_vitl14] # ugly hack to not show parameters to DDP
+
+
     def train(self, mode: bool = True):
         return self.cnn.train(mode)
-
-    def forward(self, x, upsample=False):
-        B, C, H, W = x.shape
+
+    def forward(self, x, upsample = False):
+        B,C,H,W = x.shape
         feature_pyramid = self.cnn(x)
-
+
         if not upsample:
            with torch.no_grad():
                 if self.dinov2_vitl14[0].device != x.device:
-                    self.dinov2_vitl14[0] = (
-                        self.dinov2_vitl14[0].to(x.device).to(self.amp_dtype)
-                    )
-                dinov2_features_16 = self.dinov2_vitl14[0].forward_features(
-                    x.to(self.amp_dtype)
-                )
-                features_16 = (
-                    dinov2_features_16["x_norm_patchtokens"]
-                    .permute(0, 2, 1)
-                    .reshape(B, 1024, H // 14, W // 14)
-                )
+                    self.dinov2_vitl14[0] = self.dinov2_vitl14[0].to(x.device).to(self.amp_dtype)
+                dinov2_features_16 = self.dinov2_vitl14[0].forward_features(x.to(self.amp_dtype))
+                features_16 = dinov2_features_16['x_norm_patchtokens'].permute(0,2,1).reshape(B,1024,H//14, W//14)
                 del dinov2_features_16
                 feature_pyramid[16] = features_16
-        return feature_pyramid
+        return feature_pyramid
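Two notes on the hunks above. The new one-liner self.amp_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16 drops the old float32 CPU fallback, so it assumes a CUDA-capable setup. The patch-token reshape in CNNandDinov2.forward can be checked in isolation; a self-contained sketch on dummy tensors, with the shapes implied by the diff (ViT-L/14, 1024-dim tokens):

import torch

# DINOv2 ViT-L/14 returns patch tokens of shape (B, (H//14)*(W//14), 1024);
# the decoder expects a CNN-style map (B, 1024, H//14, W//14), stored as
# feature_pyramid[16]. Dummy data stands in for x_norm_patchtokens.
B, H, W, D = 2, 448, 448, 1024
tokens = torch.randn(B, (H // 14) * (W // 14), D)
features_16 = tokens.permute(0, 2, 1).reshape(B, D, H // 14, W // 14)
assert features_16.shape == (B, 1024, 32, 32)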
third_party/Roma/roma/models/matcher.py
CHANGED
@@ -14,9 +14,6 @@ from roma.utils.local_correlation import local_correlation
 from roma.utils.utils import cls_to_flow_refine
 from roma.utils.kde import kde
 
-device = "cuda" if torch.cuda.is_available() else "cpu"
-
-
 class ConvRefiner(nn.Module):
     def __init__(
         self,
@@ -26,29 +23,25 @@ class ConvRefiner(nn.Module):
         dw=False,
         kernel_size=5,
         hidden_blocks=3,
-        displacement_emb=None,
-        displacement_emb_dim=None,
-        local_corr_radius=None,
-        corr_in_other=None,
-        no_im_B_fm=False,
-        amp=False,
-        concat_logits=False,
-        use_bias_block_1=True,
-        use_cosine_corr=False,
-        disable_local_corr_grad=False,
-        is_classifier=False,
-        sample_mode="bilinear",
-        norm_type=nn.BatchNorm2d,
-        bn_momentum=0.1,
+        displacement_emb = None,
+        displacement_emb_dim = None,
+        local_corr_radius = None,
+        corr_in_other = None,
+        no_im_B_fm = False,
+        amp = False,
+        concat_logits = False,
+        use_bias_block_1 = True,
+        use_cosine_corr = False,
+        disable_local_corr_grad = False,
+        is_classifier = False,
+        sample_mode = "bilinear",
+        norm_type = nn.BatchNorm2d,
+        bn_momentum = 0.1,
     ):
         super().__init__()
         self.bn_momentum = bn_momentum
         self.block1 = self.create_block(
-            in_dim,
-            hidden_dim,
-            dw=dw,
-            kernel_size=kernel_size,
-            bias=use_bias_block_1,
+            in_dim, hidden_dim, dw=dw, kernel_size=kernel_size, bias = use_bias_block_1,
         )
         self.hidden_blocks = nn.Sequential(
             *[
@@ -66,7 +59,7 @@ class ConvRefiner(nn.Module):
         self.out_conv = nn.Conv2d(hidden_dim, out_dim, 1, 1, 0)
         if displacement_emb:
             self.has_displacement_emb = True
-            self.disp_emb = nn.Conv2d(2, displacement_emb_dim, 1, 1, 0)
+            self.disp_emb = nn.Conv2d(2,displacement_emb_dim,1,1,0)
         else:
             self.has_displacement_emb = False
         self.local_corr_radius = local_corr_radius
@@ -78,22 +71,16 @@ class ConvRefiner(nn.Module):
         self.disable_local_corr_grad = disable_local_corr_grad
         self.is_classifier = is_classifier
         self.sample_mode = sample_mode
-        if torch.cuda.is_available():
-            if torch.cuda.is_bf16_supported():
-                self.amp_dtype = torch.bfloat16
-            else:
-                self.amp_dtype = torch.float16
-        else:
-            self.amp_dtype = torch.float32
-
+        self.amp_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
+
     def create_block(
         self,
         in_dim,
         out_dim,
         dw=False,
         kernel_size=5,
-        bias=True,
-        norm_type=nn.BatchNorm2d,
+        bias = True,
+        norm_type = nn.BatchNorm2d,
     ):
         num_groups = 1 if not dw else in_dim
         if dw:
@@ -109,56 +96,38 @@ class ConvRefiner(nn.Module):
             groups=num_groups,
             bias=bias,
         )
-        norm = (
-            norm_type(out_dim, momentum=self.bn_momentum)
-            if norm_type is nn.BatchNorm2d
-            else norm_type(num_channels=out_dim)
-        )
+        norm = norm_type(out_dim, momentum = self.bn_momentum) if norm_type is nn.BatchNorm2d else norm_type(num_channels = out_dim)
         relu = nn.ReLU(inplace=True)
         conv2 = nn.Conv2d(out_dim, out_dim, 1, 1, 0)
         return nn.Sequential(conv1, norm, relu, conv2)
-
-    def forward(self, x, y, flow, scale_factor=1, logits=None):
-        b, c, hs, ws = x.shape
-        with torch.autocast(device, enabled=self.amp, dtype=self.amp_dtype):
+
+
+    def forward(self, x, y, flow, scale_factor = 1, logits = None):
+        b,c,hs,ws = x.shape
+        with torch.autocast("cuda", enabled=self.amp, dtype = self.amp_dtype):
             with torch.no_grad():
-                x_hat = F.grid_sample(
-                    y,
-                    flow.permute(0, 2, 3, 1),
-                    align_corners=False,
-                    mode=self.sample_mode,
-                )
+                x_hat = F.grid_sample(y, flow.permute(0, 2, 3, 1), align_corners=False, mode = self.sample_mode)
             if self.has_displacement_emb:
                 im_A_coords = torch.meshgrid(
-                    (
-                        torch.linspace(-1 + 1 / hs, 1 - 1 / hs, hs, device=device),
-                        torch.linspace(-1 + 1 / ws, 1 - 1 / ws, ws, device=device),
-                    )
+                    (
+                        torch.linspace(-1 + 1 / hs, 1 - 1 / hs, hs, device="cuda"),
+                        torch.linspace(-1 + 1 / ws, 1 - 1 / ws, ws, device="cuda"),
+                    )
                 )
                 im_A_coords = torch.stack((im_A_coords[1], im_A_coords[0]))
                 im_A_coords = im_A_coords[None].expand(b, 2, hs, ws)
-                in_displacement = flow - im_A_coords
-                emb_in_displacement = self.disp_emb(
-                    40 / 32 * scale_factor * in_displacement
-                )
+                in_displacement = flow-im_A_coords
+                emb_in_displacement = self.disp_emb(40/32 * scale_factor * in_displacement)
                 if self.local_corr_radius:
                     if self.corr_in_other:
                         # Corr in other means take a kxk grid around the predicted coordinate in other image
-                        local_corr = local_correlation(
-                            x,
-                            y,
-                            local_radius=self.local_corr_radius,
-                            flow=flow,
-                            sample_mode=self.sample_mode,
-                        )
+                        local_corr = local_correlation(x,y,local_radius=self.local_corr_radius,flow = flow,
+                                                       sample_mode = self.sample_mode)
                     else:
-                        raise NotImplementedError(
-                            "Local corr in own frame should not be used."
-                        )
+                        raise NotImplementedError("Local corr in own frame should not be used.")
                     if self.no_im_B_fm:
                         x_hat = torch.zeros_like(x)
                     d = torch.cat((x, x_hat, emb_in_displacement, local_corr), dim=1)
-                else:
+                else:
                     d = torch.cat((x, x_hat, emb_in_displacement), dim=1)
             else:
                 if self.no_im_B_fm:
@@ -172,7 +141,6 @@ class ConvRefiner(nn.Module):
         displacement, certainty = d[:, :-1], d[:, -1:]
         return displacement, certainty
 
-
 class CosKernel(nn.Module):  # similar to softmax kernel
     def __init__(self, T, learn_temperature=False):
         super().__init__()
@@ -193,7 +161,6 @@ class CosKernel(nn.Module):  # similar to softmax kernel
         K = ((c - 1.0) / T).exp()
         return K
 
-
 class GP(nn.Module):
     def __init__(
         self,
@@ -207,7 +174,7 @@ class GP(nn.Module):
         only_nearest_neighbour=False,
         sigma_noise=0.1,
         no_cov=False,
-        predict_features=False,
+        predict_features = False,
     ):
         super().__init__()
         self.K = kernel(T=T, learn_temperature=learn_temperature)
@@ -295,9 +262,7 @@ class GP(nn.Module):
         mu_x = rearrange(mu_x, "b (h w) d -> b d h w", h=h1, w=w1)
         if not self.no_cov:
             cov_x = K_xx - K_xy.matmul(K_yy_inv.matmul(K_yx))
-            cov_x = rearrange(
-                cov_x, "b (h w) (r c) -> b h w r c", h=h1, w=w1, r=h1, c=w1
-            )
+            cov_x = rearrange(cov_x, "b (h w) (r c) -> b h w r c", h=h1, w=w1, r=h1, c=w1)
             local_cov_x = self.get_local_cov(cov_x)
             local_cov_x = rearrange(local_cov_x, "b h w K -> b K h w")
             gp_feats = torch.cat((mu_x, local_cov_x), dim=1)
@@ -305,22 +270,11 @@ class GP(nn.Module):
             gp_feats = mu_x
         return gp_feats
 
-
 class Decoder(nn.Module):
     def __init__(
-        self,
-        embedding_decoder,
-        gps,
-        proj,
-        conv_refiner,
-        detach=False,
-        scales="all",
-        pos_embeddings=None,
-        num_refinement_steps_per_scale=1,
-        warp_noise_std=0.0,
-        displacement_dropout_p=0.0,
-        gm_warp_dropout_p=0.0,
-        flow_upsample_mode="bilinear",
+        self, embedding_decoder, gps, proj, conv_refiner, detach=False, scales="all", pos_embeddings = None,
+        num_refinement_steps_per_scale = 1, warp_noise_std = 0.0, displacement_dropout_p = 0.0, gm_warp_dropout_p = 0.0,
+        flow_upsample_mode = "bilinear"
     ):
         super().__init__()
         self.embedding_decoder = embedding_decoder
@@ -342,14 +296,8 @@ class Decoder(nn.Module):
         self.displacement_dropout_p = displacement_dropout_p
         self.gm_warp_dropout_p = gm_warp_dropout_p
         self.flow_upsample_mode = flow_upsample_mode
-        if torch.cuda.is_available():
-            if torch.cuda.is_bf16_supported():
-                self.amp_dtype = torch.bfloat16
-            else:
-                self.amp_dtype = torch.float16
-        else:
-            self.amp_dtype = torch.float32
-
+        self.amp_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
+
     def get_placeholder_flow(self, b, h, w, device):
         coarse_coords = torch.meshgrid(
             (
@@ -362,8 +310,8 @@ class Decoder(nn.Module):
         ].expand(b, h, w, 2)
         coarse_coords = rearrange(coarse_coords, "b h w d -> b d h w")
         return coarse_coords
-
-    def get_positional_embedding(self, b, h, w, device):
+
+    def get_positional_embedding(self, b, h ,w, device):
         coarse_coords = torch.meshgrid(
             (
                 torch.linspace(-1 + 1 / h, 1 - 1 / h, h, device=device),
@@ -378,29 +326,16 @@ class Decoder(nn.Module):
         coarse_embedded_coords = self.pos_embedding(coarse_coords)
         return coarse_embedded_coords
 
-    def forward(
-        self,
-        f1,
-        f2,
-        gt_warp=None,
-        gt_prob=None,
-        upsample=False,
-        flow=None,
-        certainty=None,
-        scale_factor=1,
-    ):
+    def forward(self, f1, f2, gt_warp = None, gt_prob = None, upsample = False, flow = None, certainty = None, scale_factor = 1):
         coarse_scales = self.embedding_decoder.scales()
-        all_scales = self.scales if not upsample else ["8", "4", "2", "1"]
+        all_scales = self.scales if not upsample else ["8", "4", "2", "1"]
         sizes = {scale: f1[scale].shape[-2:] for scale in f1}
         h, w = sizes[1]
         b = f1[1].shape[0]
         device = f1[1].device
         coarsest_scale = int(all_scales[0])
         old_stuff = torch.zeros(
-            b,
-            self.embedding_decoder.hidden_dim,
-            *sizes[coarsest_scale],
-            device=f1[coarsest_scale].device,
+            b, self.embedding_decoder.hidden_dim, *sizes[coarsest_scale], device=f1[coarsest_scale].device
         )
         corresps = {}
         if not upsample:
@@ -408,24 +343,24 @@ class Decoder(nn.Module):
             certainty = 0.0
         else:
             flow = F.interpolate(
-                flow,
-                size=sizes[coarsest_scale],
-                align_corners=False,
-                mode="bilinear",
-            )
+                flow,
+                size=sizes[coarsest_scale],
+                align_corners=False,
+                mode="bilinear",
+            )
             certainty = F.interpolate(
-                certainty,
-                size=sizes[coarsest_scale],
-                align_corners=False,
-                mode="bilinear",
-            )
+                certainty,
+                size=sizes[coarsest_scale],
+                align_corners=False,
+                mode="bilinear",
+            )
             displacement = 0.0
         for new_scale in all_scales:
             ins = int(new_scale)
             corresps[ins] = {}
             f1_s, f2_s = f1[ins], f2[ins]
             if new_scale in self.proj:
-                with torch.autocast(device, self.amp_dtype):
+                with torch.autocast("cuda", self.amp_dtype):
                     f1_s, f2_s = self.proj[new_scale](f1_s), self.proj[new_scale](f2_s)
 
             if ins in coarse_scales:
@@ -436,59 +371,32 @@ class Decoder(nn.Module):
                 gm_warp_or_cls, certainty, old_stuff = self.embedding_decoder(
                     gp_posterior, f1_s, old_stuff, new_scale
                 )
-
+
                 if self.embedding_decoder.is_classifier:
                     flow = cls_to_flow_refine(
                         gm_warp_or_cls,
-                    ).permute(0, 3, 1, 2)
-                    corresps[ins].update(
-                        {
-                            "gm_cls": gm_warp_or_cls,
-                            "gm_certainty": certainty,
-                        }
-                    ) if self.training else None
+                    ).permute(0,3,1,2)
+                    corresps[ins].update({"gm_cls": gm_warp_or_cls,"gm_certainty": certainty,}) if self.training else None
                 else:
-                    corresps[ins].update(
-                        {
-                            "gm_flow": gm_warp_or_cls,
-                            "gm_certainty": certainty,
-                        }
-                    ) if self.training else None
+                    corresps[ins].update({"gm_flow": gm_warp_or_cls,"gm_certainty": certainty,}) if self.training else None
                     flow = gm_warp_or_cls.detach()
-
+
             if new_scale in self.conv_refiner:
-                corresps[ins].update(
-                    {"flow_pre_delta": flow}
-                ) if self.training else None
+                corresps[ins].update({"flow_pre_delta": flow}) if self.training else None
                 delta_flow, delta_certainty = self.conv_refiner[new_scale](
-                    f1_s,
-                    f2_s,
-                    flow,
-                    scale_factor=scale_factor,
-                    logits=certainty,
-                )
-                corresps[ins].update(
-                    {
-                        "delta_flow": delta_flow,
-                    }
-                ) if self.training else None
-                displacement = ins * torch.stack(
-                    (
-                        delta_flow[:, 0].float() / (self.refine_init * w),
-                        delta_flow[:, 1].float() / (self.refine_init * h),
-                    ),
-                    dim=1,
-                )
+                    f1_s, f2_s, flow, scale_factor = scale_factor, logits = certainty,
+                )
+                corresps[ins].update({"delta_flow": delta_flow,}) if self.training else None
+                displacement = ins*torch.stack((delta_flow[:, 0].float() / (self.refine_init * w),
+                                                delta_flow[:, 1].float() / (self.refine_init * h),),dim=1,)
                 flow = flow + displacement
                 certainty = (
                     certainty + delta_certainty
                 )  # predict both certainty and displacement
-                corresps[ins].update(
-                    {
-                        "certainty": certainty,
-                        "flow": flow,
-                    }
-                )
+                corresps[ins].update({
+                    "certainty": certainty,
+                    "flow": flow,
+                })
             if new_scale != "1":
                 flow = F.interpolate(
                     flow,
@@ -503,7 +411,7 @@ class Decoder(nn.Module):
         if self.detach:
             flow = flow.detach()
             certainty = certainty.detach()
-        # torch.cuda.empty_cache()
+        #torch.cuda.empty_cache()
         return corresps
 
 
@@ -514,11 +422,11 @@ class RegressionMatcher(nn.Module):
         decoder,
         h=448,
         w=448,
-        sample_mode="threshold",
-        upsample_preds=False,
-        symmetric=False,
-        name=None,
-        attenuate_cert=None,
+        sample_mode = "threshold",
+        upsample_preds = False,
+        symmetric = False,
+        name = None,
+        attenuate_cert = None,
     ):
         super().__init__()
         self.attenuate_cert = attenuate_cert
@@ -530,26 +438,24 @@ class RegressionMatcher(nn.Module):
         self.og_transforms = get_tuple_transform_ops(resize=None, normalize=True)
         self.sample_mode = sample_mode
         self.upsample_preds = upsample_preds
-        self.upsample_res = (14 * 16 * 6, 14 * 16 * 6)
+        self.upsample_res = (14*16*6, 14*16*6)
         self.symmetric = symmetric
         self.sample_thresh = 0.05
-
+
     def get_output_resolution(self):
         if not self.upsample_preds:
             return self.h_resized, self.w_resized
         else:
             return self.upsample_res
-
-    def extract_backbone_features(self, batch, batched=True, upsample=False):
+
+    def extract_backbone_features(self, batch, batched = True, upsample = False):
         x_q = batch["im_A"]
         x_s = batch["im_B"]
         if batched:
-            X = torch.cat((x_q, x_s), dim=0)
-            feature_pyramid = self.encoder(X, upsample=upsample)
+            X = torch.cat((x_q, x_s), dim = 0)
+            feature_pyramid = self.encoder(X, upsample = upsample)
         else:
-            feature_pyramid = self.encoder(x_q, upsample=upsample), self.encoder(
-                x_s, upsample=upsample
-            )
+            feature_pyramid = self.encoder(x_q, upsample = upsample), self.encoder(x_s, upsample = upsample)
         return feature_pyramid
 
     def sample(
@@ -567,28 +473,22 @@ class RegressionMatcher(nn.Module):
             certainty.reshape(-1),
         )
         expansion_factor = 4 if "balanced" in self.sample_mode else 1
-        good_samples = torch.multinomial(
-            certainty,
-            num_samples=min(expansion_factor * num, len(certainty)),
-            replacement=False,
-        )
+        good_samples = torch.multinomial(certainty,
+                          num_samples = min(expansion_factor*num, len(certainty)),
+                          replacement=False)
         good_matches, good_certainty = matches[good_samples], certainty[good_samples]
         if "balanced" not in self.sample_mode:
            return good_matches, good_certainty
         density = kde(good_matches, std=0.1)
-        p = 1 / (density + 1)
-        p[
-            density < 10
-        ] = 1e-7  # Basically should have at least 10 perfect neighbours, or around 100 ok ones
-        balanced_samples = torch.multinomial(
-            p, num_samples=min(num, len(good_certainty)), replacement=False
-        )
+        p = 1 / (density+1)
+        p[density < 10] = 1e-7 # Basically should have at least 10 perfect neighbours, or around 100 ok ones
+        balanced_samples = torch.multinomial(p,
+                              num_samples = min(num,len(good_certainty)),
+                              replacement=False)
         return good_matches[balanced_samples], good_certainty[balanced_samples]
 
-    def forward(self, batch, batched=True, upsample=False, scale_factor=1):
-        feature_pyramid = self.extract_backbone_features(
-            batch, batched=batched, upsample=upsample
-        )
+    def forward(self, batch, batched = True, upsample = False, scale_factor = 1):
+        feature_pyramid = self.extract_backbone_features(batch, batched=batched, upsample = upsample)
         if batched:
             f_q_pyramid = {
                 scale: f_scale.chunk(2)[0] for scale, f_scale in feature_pyramid.items()
@@ -598,42 +498,32 @@ class RegressionMatcher(nn.Module):
             }
         else:
            f_q_pyramid, f_s_pyramid = feature_pyramid
-        corresps = self.decoder(
-            f_q_pyramid,
-            f_s_pyramid,
-            upsample=upsample,
-            **(batch["corresps"] if "corresps" in batch else {}),
-            scale_factor=scale_factor,
-        )
-
+        corresps = self.decoder(f_q_pyramid,
+                                f_s_pyramid,
+                                upsample = upsample,
+                                **(batch["corresps"] if "corresps" in batch else {}),
+                                scale_factor=scale_factor)
+
         return corresps
 
-    def forward_symmetric(self, batch, batched=True, upsample=False, scale_factor=1):
-        feature_pyramid = self.extract_backbone_features(
-            batch, batched=batched, upsample=upsample
-        )
+    def forward_symmetric(self, batch, batched = True, upsample = False, scale_factor = 1):
+        feature_pyramid = self.extract_backbone_features(batch, batched = batched, upsample = upsample)
         f_q_pyramid = feature_pyramid
         f_s_pyramid = {
-            scale: torch.cat((f_scale.chunk(2)[1], f_scale.chunk(2)[0]), dim=0)
+            scale: torch.cat((f_scale.chunk(2)[1], f_scale.chunk(2)[0]), dim = 0)
             for scale, f_scale in feature_pyramid.items()
         }
-        corresps = self.decoder(
-            f_q_pyramid,
-            f_s_pyramid,
-            upsample=upsample,
-            **(batch["corresps"] if "corresps" in batch else {}),
-            scale_factor=scale_factor,
-        )
+        corresps = self.decoder(f_q_pyramid,
+                                f_s_pyramid,
+                                upsample = upsample,
+                                **(batch["corresps"] if "corresps" in batch else {}),
+                                scale_factor=scale_factor)
         return corresps
-
+
     def to_pixel_coordinates(self, matches, H_A, W_A, H_B, W_B):
-        kpts_A, kpts_B = matches[..., :2], matches[..., 2:]
-        kpts_A = torch.stack(
-            (W_A / 2 * (kpts_A[..., 0] + 1), H_A / 2 * (kpts_A[..., 1] + 1)), axis=-1
-        )
-        kpts_B = torch.stack(
-            (W_B / 2 * (kpts_B[..., 0] + 1), H_B / 2 * (kpts_B[..., 1] + 1)), axis=-1
-        )
+        kpts_A, kpts_B = matches[...,:2], matches[...,2:]
+        kpts_A = torch.stack((W_A/2 * (kpts_A[...,0]+1), H_A/2 * (kpts_A[...,1]+1)),axis=-1)
+        kpts_B = torch.stack((W_B/2 * (kpts_B[...,0]+1), H_B/2 * (kpts_B[...,1]+1)),axis=-1)
         return kpts_A, kpts_B
 
     def match(
@@ -642,12 +532,11 @@ class RegressionMatcher(nn.Module):
         im_B_path,
         *args,
         batched=False,
-        device=None,
+        device = None,
     ):
         if device is None:
-            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
         from PIL import Image
-
        if isinstance(im_A_path, (str, os.PathLike)):
            im_A, im_B = Image.open(im_A_path), Image.open(im_B_path)
        else:
@@ -663,9 +552,9 @@ class RegressionMatcher(nn.Module):
         # Get images in good format
         ws = self.w_resized
         hs = self.h_resized
-
+
         test_transform = get_tuple_transform_ops(
-            resize=(hs, ws), normalize=True, clahe=False
+            resize=(hs, ws), normalize=True, clahe = False
         )
         im_A, im_B = test_transform((im_A, im_B))
         batch = {"im_A": im_A[None].to(device), "im_B": im_B[None].to(device)}
@@ -675,32 +564,25 @@ class RegressionMatcher(nn.Module):
         assert w == w2 and h == h2, "For batched images we assume same size"
         batch = {"im_A": im_A.to(device), "im_B": im_B.to(device)}
         if h != self.h_resized or self.w_resized != w:
-            warn(
-                "Model resolution and batch resolution differ, may produce unexpected results"
-            )
+            warn("Model resolution and batch resolution differ, may produce unexpected results")
             hs, ws = h, w
         finest_scale = 1
         # Run matcher
         if symmetric:
-            corresps = self.forward_symmetric(batch)
+            corresps = self.forward_symmetric(batch)
         else:
-            corresps = self.forward(batch, batched=True)
+            corresps = self.forward(batch, batched = True)
 
         if self.upsample_preds:
             hs, ws = self.upsample_res
-
+
         if self.attenuate_cert:
             low_res_certainty = F.interpolate(
-                corresps[16]["certainty"],
-                size=(hs, ws),
-                align_corners=False,
-                mode="bilinear",
+                corresps[16]["certainty"], size=(hs, ws), align_corners=False, mode="bilinear"
             )
             cert_clamp = 0
             factor = 0.5
-            low_res_certainty = (
-                factor * low_res_certainty * (low_res_certainty < cert_clamp)
-            )
+            low_res_certainty = factor*low_res_certainty*(low_res_certainty < cert_clamp)
 
         if self.upsample_preds:
             finest_corresps = corresps[finest_scale]
@@ -711,38 +593,30 @@ class RegressionMatcher(nn.Module):
             im_A, im_B = Image.open(im_A_path), Image.open(im_B_path)
             im_A, im_B = test_transform((im_A, im_B))
             im_A, im_B = im_A[None].to(device), im_B[None].to(device)
-            scale_factor = math.sqrt(
-                self.upsample_res[0]
-                * self.upsample_res[1]
-                / (self.w_resized * self.h_resized)
-            )
+            scale_factor = math.sqrt(self.upsample_res[0] * self.upsample_res[1] / (self.w_resized * self.h_resized))
             batch = {"im_A": im_A, "im_B": im_B, "corresps": finest_corresps}
             if symmetric:
-                corresps = self.forward_symmetric(
-                    batch, upsample=True, batched=True, scale_factor=scale_factor
-                )
+                corresps = self.forward_symmetric(batch, upsample = True, batched=True, scale_factor = scale_factor)
             else:
-                corresps = self.forward(
-                    batch, batched=True, upsample=True, scale_factor=scale_factor
-                )
-
-        im_A_to_im_B = corresps[finest_scale]["flow"]
-        certainty = corresps[finest_scale]["certainty"] - (
-            low_res_certainty if self.attenuate_cert else 0
-        )
+                corresps = self.forward(batch, batched = True, upsample=True, scale_factor = scale_factor)
+
+        im_A_to_im_B = corresps[finest_scale]["flow"]
+        certainty = corresps[finest_scale]["certainty"] - (low_res_certainty if self.attenuate_cert else 0)
         if finest_scale != 1:
             im_A_to_im_B = F.interpolate(
-                im_A_to_im_B, size=(hs, ws), align_corners=False, mode="bilinear"
+                im_A_to_im_B, size=(hs, ws), align_corners=False, mode="bilinear"
             )
             certainty = F.interpolate(
-                certainty, size=(hs, ws), align_corners=False, mode="bilinear"
-            )
-        im_A_to_im_B = im_A_to_im_B.permute(0, 2, 3, 1)
+                certainty, size=(hs, ws), align_corners=False, mode="bilinear"
+            )
+        im_A_to_im_B = im_A_to_im_B.permute(
+            0, 2, 3, 1
+        )
         # Create im_A meshgrid
         im_A_coords = torch.meshgrid(
             (
-                torch.linspace(-1 + 1 / hs, 1 - 1 / hs, hs, device=device),
-                torch.linspace(-1 + 1 / ws, 1 - 1 / ws, ws, device=device),
+                torch.linspace(-1 + 1 / hs, 1 - 1 / hs, hs, device="cuda"),
+                torch.linspace(-1 + 1 / ws, 1 - 1 / ws, ws, device="cuda"),
             )
         )
         im_A_coords = torch.stack((im_A_coords[1], im_A_coords[0]))
@@ -751,21 +625,25 @@ class RegressionMatcher(nn.Module):
         im_A_coords = im_A_coords.permute(0, 2, 3, 1)
         if (im_A_to_im_B.abs() > 1).any() and True:
             wrong = (im_A_to_im_B.abs() > 1).sum(dim=-1) > 0
-            certainty[wrong[:, None]] = 0
+            certainty[wrong[:,None]] = 0
         im_A_to_im_B = torch.clamp(im_A_to_im_B, -1, 1)
         if symmetric:
            A_to_B, B_to_A = im_A_to_im_B.chunk(2)
            q_warp = torch.cat((im_A_coords, A_to_B), dim=-1)
            im_B_coords = im_A_coords
            s_warp = torch.cat((B_to_A, im_B_coords), dim=-1)
-            warp = torch.cat((q_warp, s_warp), dim=2)
+            warp = torch.cat((q_warp, s_warp),dim=2)
            certainty = torch.cat(certainty.chunk(2), dim=3)
        else:
            warp = torch.cat((im_A_coords, im_A_to_im_B), dim=-1)
        if batched:
-            return (warp, certainty[:, 0])
+            return (
+                warp,
+                certainty[:, 0]
+            )
        else:
            return (
                warp[0],
                certainty[0, 0],
            )
+
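Note that these hunks restore hardcoded device="cuda" tensors in ConvRefiner.forward and RegressionMatcher.match, consistent with the removal of the module-level device fallback at the top of the file. The reformatting of to_pixel_coordinates leaves the coordinate convention unchanged: warps are stored in normalized [-1, 1] coordinates and map to pixels as x_px = W / 2 * (x + 1), y_px = H / 2 * (y + 1). A self-contained sketch with made-up matches:

import torch

# Each match row is (x_A, y_A, x_B, y_B) in normalized [-1, 1] coordinates.
# Pixel coordinates follow x_px = W / 2 * (x + 1), y_px = H / 2 * (y + 1),
# exactly as in to_pixel_coordinates above.
H_A, W_A = 480, 640
matches = torch.tensor(
    [[-1.0, -1.0, 0.0, 0.0],   # top-left of image A, centre of image B
     [ 1.0,  1.0, 0.5, -0.5]]  # bottom-right of image A
)
kpts_A = matches[..., :2]
kpts_A_px = torch.stack(
    (W_A / 2 * (kpts_A[..., 0] + 1), H_A / 2 * (kpts_A[..., 1] + 1)), dim=-1
)
print(kpts_A_px)  # tensor([[  0.,   0.], [640., 480.]])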
|