File size: 12,291 Bytes
b334e29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
# Modified from https://github.com/facebookresearch/detectron2/tree/master/projects/PointRend  # noqa

from os import path as osp

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.modules.utils import _pair
from torch.onnx.operators import shape_as_tensor


def bilinear_grid_sample(im, grid, align_corners=False):
    """Given an input and a flow-field grid, computes the output using input
    values and pixel locations from grid. Supported only bilinear interpolation
    method to sample the input pixels.

    Args:
        im (torch.Tensor): Input feature map, shape (N, C, H, W)
        grid (torch.Tensor): Point coordinates, shape (N, Hg, Wg, 2)
        align_corners {bool}: If set to True, the extrema (-1 and 1) are
            considered as referring to the center points of the input’s
            corner pixels. If set to False, they are instead considered as
            referring to the corner points of the input’s corner pixels,
            making the sampling more resolution agnostic.
    Returns:
        torch.Tensor: A tensor with sampled points, shape (N, C, Hg, Wg)
    """
    n, c, h, w = im.shape
    gn, gh, gw, _ = grid.shape
    assert n == gn

    x = grid[:, :, :, 0]
    y = grid[:, :, :, 1]

    if align_corners:
        x = ((x + 1) / 2) * (w - 1)
        y = ((y + 1) / 2) * (h - 1)
    else:
        x = ((x + 1) * w - 1) / 2
        y = ((y + 1) * h - 1) / 2

    x = x.view(n, -1)
    y = y.view(n, -1)

    x0 = torch.floor(x).long()
    y0 = torch.floor(y).long()
    x1 = x0 + 1
    y1 = y0 + 1

    wa = ((x1 - x) * (y1 - y)).unsqueeze(1)
    wb = ((x1 - x) * (y - y0)).unsqueeze(1)
    wc = ((x - x0) * (y1 - y)).unsqueeze(1)
    wd = ((x - x0) * (y - y0)).unsqueeze(1)

    # Apply default for grid_sample function zero padding
    im_padded = F.pad(im, pad=[1, 1, 1, 1], mode='constant', value=0)
    padded_h = h + 2
    padded_w = w + 2
    # save points positions after padding
    x0, x1, y0, y1 = x0 + 1, x1 + 1, y0 + 1, y1 + 1

    # Clip coordinates to padded image size
    x0 = torch.where(x0 < 0, torch.tensor(0), x0)
    x0 = torch.where(x0 > padded_w - 1, torch.tensor(padded_w - 1), x0)
    x1 = torch.where(x1 < 0, torch.tensor(0), x1)
    x1 = torch.where(x1 > padded_w - 1, torch.tensor(padded_w - 1), x1)
    y0 = torch.where(y0 < 0, torch.tensor(0), y0)
    y0 = torch.where(y0 > padded_h - 1, torch.tensor(padded_h - 1), y0)
    y1 = torch.where(y1 < 0, torch.tensor(0), y1)
    y1 = torch.where(y1 > padded_h - 1, torch.tensor(padded_h - 1), y1)

    im_padded = im_padded.view(n, c, -1)

    x0_y0 = (x0 + y0 * padded_w).unsqueeze(1).expand(-1, c, -1)
    x0_y1 = (x0 + y1 * padded_w).unsqueeze(1).expand(-1, c, -1)
    x1_y0 = (x1 + y0 * padded_w).unsqueeze(1).expand(-1, c, -1)
    x1_y1 = (x1 + y1 * padded_w).unsqueeze(1).expand(-1, c, -1)

    Ia = torch.gather(im_padded, 2, x0_y0)
    Ib = torch.gather(im_padded, 2, x0_y1)
    Ic = torch.gather(im_padded, 2, x1_y0)
    Id = torch.gather(im_padded, 2, x1_y1)

    return (Ia * wa + Ib * wb + Ic * wc + Id * wd).reshape(n, c, gh, gw)


def is_in_onnx_export_without_custom_ops():
    from annotator.uniformer.mmcv.ops import get_onnxruntime_op_path
    ort_custom_op_path = get_onnxruntime_op_path()
    return torch.onnx.is_in_onnx_export(
    ) and not osp.exists(ort_custom_op_path)


def normalize(grid):
    """Normalize input grid from [-1, 1] to [0, 1]
    Args:
        grid (Tensor): The grid to be normalize, range [-1, 1].
    Returns:
        Tensor: Normalized grid, range [0, 1].
    """

    return (grid + 1.0) / 2.0


def denormalize(grid):
    """Denormalize input grid from range [0, 1] to [-1, 1]
    Args:
        grid (Tensor): The grid to be denormalize, range [0, 1].
    Returns:
        Tensor: Denormalized grid, range [-1, 1].
    """

    return grid * 2.0 - 1.0


def generate_grid(num_grid, size, device):
    """Generate regular square grid of points in [0, 1] x [0, 1] coordinate
    space.

    Args:
        num_grid (int): The number of grids to sample, one for each region.
        size (tuple(int, int)): The side size of the regular grid.
        device (torch.device): Desired device of returned tensor.

    Returns:
        (torch.Tensor): A tensor of shape (num_grid, size[0]*size[1], 2) that
            contains coordinates for the regular grids.
    """

    affine_trans = torch.tensor([[[1., 0., 0.], [0., 1., 0.]]], device=device)
    grid = F.affine_grid(
        affine_trans, torch.Size((1, 1, *size)), align_corners=False)
    grid = normalize(grid)
    return grid.view(1, -1, 2).expand(num_grid, -1, -1)


def rel_roi_point_to_abs_img_point(rois, rel_roi_points):
    """Convert roi based relative point coordinates to image based absolute
    point coordinates.

    Args:
        rois (Tensor): RoIs or BBoxes, shape (N, 4) or (N, 5)
        rel_roi_points (Tensor): Point coordinates inside RoI, relative to
            RoI, location, range (0, 1), shape (N, P, 2)
    Returns:
        Tensor: Image based absolute point coordinates, shape (N, P, 2)
    """

    with torch.no_grad():
        assert rel_roi_points.size(0) == rois.size(0)
        assert rois.dim() == 2
        assert rel_roi_points.dim() == 3
        assert rel_roi_points.size(2) == 2
        # remove batch idx
        if rois.size(1) == 5:
            rois = rois[:, 1:]
        abs_img_points = rel_roi_points.clone()
        # To avoid an error during exporting to onnx use independent
        # variables instead inplace computation
        xs = abs_img_points[:, :, 0] * (rois[:, None, 2] - rois[:, None, 0])
        ys = abs_img_points[:, :, 1] * (rois[:, None, 3] - rois[:, None, 1])
        xs += rois[:, None, 0]
        ys += rois[:, None, 1]
        abs_img_points = torch.stack([xs, ys], dim=2)
    return abs_img_points


def get_shape_from_feature_map(x):
    """Get spatial resolution of input feature map considering exporting to
    onnx mode.

    Args:
        x (torch.Tensor): Input tensor, shape (N, C, H, W)
    Returns:
        torch.Tensor: Spatial resolution (width, height), shape (1, 1, 2)
    """
    if torch.onnx.is_in_onnx_export():
        img_shape = shape_as_tensor(x)[2:].flip(0).view(1, 1, 2).to(
            x.device).float()
    else:
        img_shape = torch.tensor(x.shape[2:]).flip(0).view(1, 1, 2).to(
            x.device).float()
    return img_shape


def abs_img_point_to_rel_img_point(abs_img_points, img, spatial_scale=1.):
    """Convert image based absolute point coordinates to image based relative
    coordinates for sampling.

    Args:
        abs_img_points (Tensor): Image based absolute point coordinates,
            shape (N, P, 2)
        img (tuple/Tensor): (height, width) of image or feature map.
        spatial_scale (float): Scale points by this factor. Default: 1.

    Returns:
        Tensor: Image based relative point coordinates for sampling,
            shape (N, P, 2)
    """

    assert (isinstance(img, tuple) and len(img) == 2) or \
           (isinstance(img, torch.Tensor) and len(img.shape) == 4)

    if isinstance(img, tuple):
        h, w = img
        scale = torch.tensor([w, h],
                             dtype=torch.float,
                             device=abs_img_points.device)
        scale = scale.view(1, 1, 2)
    else:
        scale = get_shape_from_feature_map(img)

    return abs_img_points / scale * spatial_scale


def rel_roi_point_to_rel_img_point(rois,
                                   rel_roi_points,
                                   img,
                                   spatial_scale=1.):
    """Convert roi based relative point coordinates to image based absolute
    point coordinates.

    Args:
        rois (Tensor): RoIs or BBoxes, shape (N, 4) or (N, 5)
        rel_roi_points (Tensor): Point coordinates inside RoI, relative to
            RoI, location, range (0, 1), shape (N, P, 2)
        img (tuple/Tensor): (height, width) of image or feature map.
        spatial_scale (float): Scale points by this factor. Default: 1.

    Returns:
        Tensor: Image based relative point coordinates for sampling,
            shape (N, P, 2)
    """

    abs_img_point = rel_roi_point_to_abs_img_point(rois, rel_roi_points)
    rel_img_point = abs_img_point_to_rel_img_point(abs_img_point, img,
                                                   spatial_scale)

    return rel_img_point


def point_sample(input, points, align_corners=False, **kwargs):
    """A wrapper around :func:`grid_sample` to support 3D point_coords tensors
    Unlike :func:`torch.nn.functional.grid_sample` it assumes point_coords to
    lie inside ``[0, 1] x [0, 1]`` square.

    Args:
        input (Tensor): Feature map, shape (N, C, H, W).
        points (Tensor): Image based absolute point coordinates (normalized),
            range [0, 1] x [0, 1], shape (N, P, 2) or (N, Hgrid, Wgrid, 2).
        align_corners (bool): Whether align_corners. Default: False

    Returns:
        Tensor: Features of `point` on `input`, shape (N, C, P) or
            (N, C, Hgrid, Wgrid).
    """

    add_dim = False
    if points.dim() == 3:
        add_dim = True
        points = points.unsqueeze(2)
    if is_in_onnx_export_without_custom_ops():
        # If custom ops for onnx runtime not compiled use python
        # implementation of grid_sample function to make onnx graph
        # with supported nodes
        output = bilinear_grid_sample(
            input, denormalize(points), align_corners=align_corners)
    else:
        output = F.grid_sample(
            input, denormalize(points), align_corners=align_corners, **kwargs)
    if add_dim:
        output = output.squeeze(3)
    return output


class SimpleRoIAlign(nn.Module):

    def __init__(self, output_size, spatial_scale, aligned=True):
        """Simple RoI align in PointRend, faster than standard RoIAlign.

        Args:
            output_size (tuple[int]): h, w
            spatial_scale (float): scale the input boxes by this number
            aligned (bool): if False, use the legacy implementation in
                MMDetection, align_corners=True will be used in F.grid_sample.
                If True, align the results more perfectly.
        """

        super(SimpleRoIAlign, self).__init__()
        self.output_size = _pair(output_size)
        self.spatial_scale = float(spatial_scale)
        # to be consistent with other RoI ops
        self.use_torchvision = False
        self.aligned = aligned

    def forward(self, features, rois):
        num_imgs = features.size(0)
        num_rois = rois.size(0)
        rel_roi_points = generate_grid(
            num_rois, self.output_size, device=rois.device)

        if torch.onnx.is_in_onnx_export():
            rel_img_points = rel_roi_point_to_rel_img_point(
                rois, rel_roi_points, features, self.spatial_scale)
            rel_img_points = rel_img_points.reshape(num_imgs, -1,
                                                    *rel_img_points.shape[1:])
            point_feats = point_sample(
                features, rel_img_points, align_corners=not self.aligned)
            point_feats = point_feats.transpose(1, 2)
        else:
            point_feats = []
            for batch_ind in range(num_imgs):
                # unravel batch dim
                feat = features[batch_ind].unsqueeze(0)
                inds = (rois[:, 0].long() == batch_ind)
                if inds.any():
                    rel_img_points = rel_roi_point_to_rel_img_point(
                        rois[inds], rel_roi_points[inds], feat,
                        self.spatial_scale).unsqueeze(0)
                    point_feat = point_sample(
                        feat, rel_img_points, align_corners=not self.aligned)
                    point_feat = point_feat.squeeze(0).transpose(0, 1)
                    point_feats.append(point_feat)

            point_feats = torch.cat(point_feats, dim=0)

        channels = features.size(1)
        roi_feats = point_feats.reshape(num_rois, channels, *self.output_size)

        return roi_feats

    def __repr__(self):
        format_str = self.__class__.__name__
        format_str += '(output_size={}, spatial_scale={}'.format(
            self.output_size, self.spatial_scale)
        return format_str