Upload folder using huggingface_hub
This view is limited to 50 files because it contains too many changes.
- src/__init__.py +0 -0
- src/__pycache__/__init__.cpython-310.pyc +0 -0
- src/dataset/__init__.py +97 -0
- src/dataset/__pycache__/__init__.cpython-310.pyc +0 -0
- src/dataset/__pycache__/base_depth_dataset.cpython-310.pyc +0 -0
- src/dataset/__pycache__/base_inpaint_dataset.cpython-310.pyc +0 -0
- src/dataset/__pycache__/depthanything_dataset.cpython-310.pyc +0 -0
- src/dataset/__pycache__/diode_dataset.cpython-310.pyc +0 -0
- src/dataset/__pycache__/eth3d_dataset.cpython-310.pyc +0 -0
- src/dataset/__pycache__/eval_base_dataset.cpython-310.pyc +0 -0
- src/dataset/__pycache__/hypersim_dataset.cpython-310.pyc +0 -0
- src/dataset/__pycache__/kitti_dataset.cpython-310.pyc +0 -0
- src/dataset/__pycache__/mixed_sampler.cpython-310.pyc +0 -0
- src/dataset/__pycache__/nyu_dataset.cpython-310.pyc +0 -0
- src/dataset/__pycache__/scannet_dataset.cpython-310.pyc +0 -0
- src/dataset/__pycache__/vkitti_dataset.cpython-310.pyc +0 -0
- src/dataset/base_depth_dataset.py +286 -0
- src/dataset/base_inpaint_dataset.py +280 -0
- src/dataset/depthanything_dataset.py +91 -0
- src/dataset/diode_dataset.py +91 -0
- src/dataset/eth3d_dataset.py +65 -0
- src/dataset/eval_base_dataset.py +283 -0
- src/dataset/hypersim_dataset.py +44 -0
- src/dataset/inpaint_dataset.py +286 -0
- src/dataset/kitti_dataset.py +124 -0
- src/dataset/mixed_sampler.py +149 -0
- src/dataset/nyu_dataset.py +61 -0
- src/dataset/scannet_dataset.py +44 -0
- src/dataset/vkitti_dataset.py +97 -0
- src/trainer/__init__.py +16 -0
- src/trainer/__pycache__/__init__.cpython-310.pyc +0 -0
- src/trainer/__pycache__/marigold_inpaint_trainer.cpython-310.pyc +0 -0
- src/trainer/__pycache__/marigold_trainer.cpython-310.pyc +0 -0
- src/trainer/__pycache__/marigold_xl_trainer.cpython-310.pyc +0 -0
- src/trainer/marigold_inpaint_trainer.py +665 -0
- src/trainer/marigold_trainer.py +968 -0
- src/trainer/marigold_xl_trainer.py +948 -0
- src/util/__pycache__/alignment.cpython-310.pyc +0 -0
- src/util/__pycache__/config_util.cpython-310.pyc +0 -0
- src/util/__pycache__/data_loader.cpython-310.pyc +0 -0
- src/util/__pycache__/depth_transform.cpython-310.pyc +0 -0
- src/util/__pycache__/logging_util.cpython-310.pyc +0 -0
- src/util/__pycache__/loss.cpython-310.pyc +0 -0
- src/util/__pycache__/lr_scheduler.cpython-310.pyc +0 -0
- src/util/__pycache__/metric.cpython-310.pyc +0 -0
- src/util/__pycache__/multi_res_noise.cpython-310.pyc +0 -0
- src/util/__pycache__/seeding.cpython-310.pyc +0 -0
- src/util/__pycache__/slurm_util.cpython-310.pyc +0 -0
- src/util/alignment.py +72 -0
- src/util/config_util.py +49 -0
src/__init__.py
ADDED
File without changes

src/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (136 Bytes)
src/dataset/__init__.py
ADDED
@@ -0,0 +1,97 @@
# Last modified: 2024-04-16
#
# Copyright 2023 Bingxin Ke, ETH Zurich. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# --------------------------------------------------------------------------
# If you find this code useful, we kindly ask you to cite our paper in your work.
# Please find bibtex at: https://github.com/prs-eth/Marigold#-citation
# If you use or adapt this code, please attribute to https://github.com/prs-eth/marigold.
# More information about the method can be found at https://marigoldmonodepth.github.io
# --------------------------------------------------------------------------

import os
import pdb

from .base_depth_dataset import BaseDepthDataset  # noqa: F401
from .eval_base_dataset import EvaluateBaseDataset, DatasetMode, get_pred_name
from .diode_dataset import DIODEDataset
from .eth3d_dataset import ETH3DDataset
from .hypersim_dataset import HypersimDataset
from .kitti_dataset import KITTIDataset
from .nyu_dataset import NYUDataset
from .scannet_dataset import ScanNetDataset
from .vkitti_dataset import VirtualKITTIDataset
from .depthanything_dataset import DepthAnythingDataset
from .base_inpaint_dataset import BaseInpaintDataset

dataset_name_class_dict = {
    "hypersim": HypersimDataset,
    "vkitti": VirtualKITTIDataset,
    "nyu_v2": NYUDataset,
    "kitti": KITTIDataset,
    "eth3d": ETH3DDataset,
    "diode": DIODEDataset,
    "scannet": ScanNetDataset,
    "depthanything": DepthAnythingDataset,
    "inpainting": BaseInpaintDataset,
}


def get_dataset(
    cfg_data_split, base_data_dir: str, mode: DatasetMode, **kwargs
):
    if "mixed" == cfg_data_split.name:
        # assert DatasetMode.TRAIN == mode, "Only training mode supports mixed datasets."
        dataset_ls = [
            get_dataset(_cfg, base_data_dir, mode, **kwargs)
            for _cfg in cfg_data_split.dataset_list
        ]
        return dataset_ls
    elif cfg_data_split.name in dataset_name_class_dict.keys():
        dataset_class = dataset_name_class_dict[cfg_data_split.name]
        dataset = dataset_class(
            mode=mode,
            filename_ls_path=cfg_data_split.filenames,
            dataset_dir=os.path.join(base_data_dir, cfg_data_split.dir),
            **cfg_data_split,
            **kwargs,
        )
    else:
        raise NotImplementedError

    return dataset


def get_eval_dataset(
    cfg_data_split, base_data_dir: str, mode: DatasetMode, **kwargs
) -> EvaluateBaseDataset:
    if "mixed" == cfg_data_split.name:
        assert DatasetMode.TRAIN == mode, "Only training mode supports mixed datasets."
        dataset_ls = [
            get_dataset(_cfg, base_data_dir, mode, **kwargs)
            for _cfg in cfg_data_split.dataset_list
        ]
        return dataset_ls
    elif cfg_data_split.name in dataset_name_class_dict.keys():
        dataset_class = dataset_name_class_dict[cfg_data_split.name]
        dataset = dataset_class(
            mode=mode,
            filename_ls_path=cfg_data_split.filenames,
            dataset_dir=os.path.join(base_data_dir, cfg_data_split.dir),
            **cfg_data_split,
            **kwargs,
        )
    else:
        raise NotImplementedError

    return dataset
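For context, get_dataset dispatches on cfg_data_split.name and forwards the whole split config as keyword arguments to the selected dataset class. A minimal usage sketch follows; it is illustrative and not part of this upload, it assumes the split config is an OmegaConf DictConfig (so it supports both attribute access and ** unpacking), and all paths and key values are placeholders.

# Illustrative usage sketch; config keys and paths below are assumptions,
# not values shipped with this repository.
from omegaconf import OmegaConf

from src.dataset import get_dataset
from src.dataset.eval_base_dataset import DatasetMode

cfg_data_split = OmegaConf.create(
    {
        "name": "hypersim",                              # key into dataset_name_class_dict
        "filenames": "data_split/hypersim/train.json",   # glob pattern of JSON filename lists (assumed layout)
        "dir": "hypersim/processed",                     # joined onto base_data_dir
        "disp_name": "hypersim_train",
    }
)

train_dataset = get_dataset(
    cfg_data_split,
    base_data_dir="/data",          # placeholder root
    mode=DatasetMode.TRAIN,
    resize_to_hw=(480, 640),
)
print(len(train_dataset))           # 0 if the filename list is not present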
src/dataset/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (2.19 kB)

src/dataset/__pycache__/base_depth_dataset.cpython-310.pyc
ADDED
Binary file (7.92 kB)

src/dataset/__pycache__/base_inpaint_dataset.cpython-310.pyc
ADDED
Binary file (7.7 kB)

src/dataset/__pycache__/depthanything_dataset.cpython-310.pyc
ADDED
Binary file (1.92 kB)

src/dataset/__pycache__/diode_dataset.cpython-310.pyc
ADDED
Binary file (2.19 kB)

src/dataset/__pycache__/eth3d_dataset.cpython-310.pyc
ADDED
Binary file (1.4 kB)

src/dataset/__pycache__/eval_base_dataset.cpython-310.pyc
ADDED
Binary file (7.64 kB)

src/dataset/__pycache__/hypersim_dataset.cpython-310.pyc
ADDED
Binary file (957 Bytes)

src/dataset/__pycache__/kitti_dataset.cpython-310.pyc
ADDED
Binary file (3.35 kB)

src/dataset/__pycache__/mixed_sampler.cpython-310.pyc
ADDED
Binary file (3.95 kB)

src/dataset/__pycache__/nyu_dataset.cpython-310.pyc
ADDED
Binary file (1.39 kB)

src/dataset/__pycache__/scannet_dataset.cpython-310.pyc
ADDED
Binary file (946 Bytes)

src/dataset/__pycache__/vkitti_dataset.cpython-310.pyc
ADDED
Binary file (2.63 kB)
src/dataset/base_depth_dataset.py
ADDED
@@ -0,0 +1,286 @@
# Last modified: 2024-04-30
#
# Copyright 2023 Bingxin Ke, ETH Zurich. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# --------------------------------------------------------------------------
# If you find this code useful, we kindly ask you to cite our paper in your work.
# Please find bibtex at: https://github.com/prs-eth/Marigold#-citation
# If you use or adapt this code, please attribute to https://github.com/prs-eth/marigold.
# More information about the method can be found at https://marigoldmonodepth.github.io
# --------------------------------------------------------------------------
import glob
import io
import json
import os
import pdb
import random
import tarfile
from enum import Enum
from typing import Union

import numpy as np
import torch
from PIL import Image
from torch.utils.data import Dataset
from torchvision.transforms import InterpolationMode, Resize, CenterCrop
import torchvision.transforms as transforms
from transformers import CLIPTextModel, CLIPTokenizer
from src.util.depth_transform import DepthNormalizerBase

from src.dataset.eval_base_dataset import DatasetMode, DepthFileNameMode


def read_image_from_tar(tar_obj, img_rel_path):
    image = tar_obj.extractfile("./" + img_rel_path)
    image = image.read()
    image = Image.open(io.BytesIO(image))
    return image


class BaseDepthDataset(Dataset):
    def __init__(
        self,
        mode: DatasetMode,
        filename_ls_path: str,
        dataset_dir: str,
        disp_name: str,
        min_depth: float,
        max_depth: float,
        has_filled_depth: bool,
        name_mode: DepthFileNameMode,
        depth_transform: Union[DepthNormalizerBase, None] = None,
        tokenizer: CLIPTokenizer = None,
        augmentation_args: dict = None,
        resize_to_hw=None,
        move_invalid_to_far_plane: bool = True,
        rgb_transform=lambda x: x / 255.0 * 2 - 1,  # [0, 255] -> [-1, 1]
        **kwargs,
    ) -> None:
        super().__init__()
        self.mode = mode
        # dataset info
        self.filename_ls_path = filename_ls_path
        self.disp_name = disp_name
        self.has_filled_depth = has_filled_depth
        self.name_mode: DepthFileNameMode = name_mode
        self.min_depth = min_depth
        self.max_depth = max_depth
        # training arguments
        self.depth_transform: DepthNormalizerBase = depth_transform
        self.augm_args = augmentation_args
        self.resize_to_hw = resize_to_hw
        self.rgb_transform = rgb_transform
        self.move_invalid_to_far_plane = move_invalid_to_far_plane
        self.tokenizer = tokenizer
        # Load filenames
        self.filenames = []
        filename_paths = glob.glob(self.filename_ls_path)
        for path in filename_paths:
            with open(path, "r") as f:
                self.filenames += json.load(f)
        # Tar dataset
        self.tar_obj = None
        self.is_tar = (
            True
            if os.path.isfile(dataset_dir) and tarfile.is_tarfile(dataset_dir)
            else False
        )

    def __len__(self):
        return len(self.filenames)

    def __getitem__(self, index):
        rasters, other = self._get_data_item(index)
        if DatasetMode.TRAIN == self.mode:
            rasters = self._training_preprocess(rasters)
        # merge
        outputs = rasters
        outputs.update(other)
        return outputs

    def _get_data_item(self, index):
        rgb_path = self.filenames[index]['rgb_path']
        depth_path = self.filenames[index]['depth_path']
        mask_path = None
        if 'valid_mask' in self.filenames[index]:
            mask_path = self.filenames[index]['valid_mask']
        if self.filenames[index]['caption'] is not None:
            coca_caption = self.filenames[index]['caption']['coca_caption']
            spatial_caption = self.filenames[index]['caption']['spatial_caption']
            empty_caption = ''
            caption_choices = [coca_caption, spatial_caption, empty_caption]
            probabilities = [0.4, 0.4, 0.2]
            caption = random.choices(caption_choices, probabilities)[0]
        else:
            caption = ''

        rasters = {}
        # RGB data
        rasters.update(self._load_rgb_data(rgb_path))

        # Depth data
        if DatasetMode.RGB_ONLY != self.mode and depth_path is not None:
            # load data
            depth_data = self._load_depth_data(depth_path)
            rasters.update(depth_data)
            # valid mask
            if mask_path is not None:
                valid_mask_raw = Image.open(mask_path)
                valid_mask_filled = Image.open(mask_path)
                rasters["valid_mask_raw"] = torch.from_numpy(np.asarray(valid_mask_raw)).unsqueeze(0).bool()
                rasters["valid_mask_filled"] = torch.from_numpy(np.asarray(valid_mask_filled)).unsqueeze(0).bool()
            else:
                rasters["valid_mask_raw"] = self._get_valid_mask(
                    rasters["depth_raw_linear"]
                ).clone()
                rasters["valid_mask_filled"] = self._get_valid_mask(
                    rasters["depth_filled_linear"]
                ).clone()

        other = {"index": index, "rgb_path": rgb_path, 'text': caption}

        if self.resize_to_hw is not None:
            resize_transform = transforms.Compose([
                Resize(size=max(self.resize_to_hw), interpolation=InterpolationMode.NEAREST_EXACT),
                CenterCrop(size=self.resize_to_hw)])
            rasters = {k: resize_transform(v) for k, v in rasters.items()}

        return rasters, other

    def _load_rgb_data(self, rgb_path):
        # Read RGB data
        rgb = self._read_rgb_file(rgb_path)
        rgb_norm = rgb / 255.0 * 2.0 - 1.0  # [0, 255] -> [-1, 1]

        outputs = {
            "rgb_int": torch.from_numpy(rgb).int(),
            "rgb_norm": torch.from_numpy(rgb_norm).float(),
        }
        return outputs

    def _load_depth_data(self, depth_path, filled_rel_path=None):
        # Read depth data
        outputs = {}
        depth_raw = self._read_depth_file(depth_path).squeeze()
        depth_raw_linear = torch.from_numpy(depth_raw.copy()).float().unsqueeze(0)  # [1, H, W]
        outputs["depth_raw_linear"] = depth_raw_linear.clone()

        if self.has_filled_depth:
            depth_filled = self._read_depth_file(filled_rel_path).squeeze()
            depth_filled_linear = torch.from_numpy(depth_filled).float().unsqueeze(0)
            outputs["depth_filled_linear"] = depth_filled_linear
        else:
            outputs["depth_filled_linear"] = depth_raw_linear.clone()

        return outputs

    def _get_data_path(self, index):
        filename_line = self.filenames[index]

        # Get data path
        rgb_rel_path = filename_line[0]

        depth_rel_path, text_rel_path = None, None
        if DatasetMode.RGB_ONLY != self.mode:
            depth_rel_path = filename_line[1]
            if len(filename_line) > 2:
                text_rel_path = filename_line[2]
        return rgb_rel_path, depth_rel_path, text_rel_path

    def _read_image(self, img_path) -> np.ndarray:
        image_to_read = img_path
        image = Image.open(image_to_read)  # [H, W, rgb]
        image = np.asarray(image)
        return image

    def _read_rgb_file(self, path) -> np.ndarray:
        rgb = self._read_image(path)
        rgb = np.transpose(rgb, (2, 0, 1)).astype(int)  # [rgb, H, W]
        return rgb

    def _read_depth_file(self, path):
        depth_in = self._read_image(path)
        # Replace code below to decode depth according to dataset definition
        depth_decoded = depth_in
        return depth_decoded

    def _get_valid_mask(self, depth: torch.Tensor):
        valid_mask = torch.logical_and(
            (depth > self.min_depth), (depth < self.max_depth)
        ).bool()
        return valid_mask

    def _training_preprocess(self, rasters):
        # Augmentation
        if self.augm_args is not None:
            rasters = self._augment_data(rasters)

        # Normalization
        # rasters["depth_raw_norm"] = rasters["depth_raw_linear"] / 255.0 * 2.0 - 1.0
        # rasters["depth_filled_norm"] = rasters["depth_filled_linear"] / 255.0 * 2.0 - 1.0

        rasters["depth_raw_norm"] = self.depth_transform(
            rasters["depth_raw_linear"], rasters["valid_mask_raw"]
        ).clone()
        rasters["depth_filled_norm"] = self.depth_transform(
            rasters["depth_filled_linear"], rasters["valid_mask_filled"]
        ).clone()

        # Set invalid pixel to far plane
        if self.move_invalid_to_far_plane:
            if self.depth_transform.far_plane_at_max:
                rasters["depth_filled_norm"][~rasters["valid_mask_filled"]] = (
                    self.depth_transform.norm_max
                )
            else:
                rasters["depth_filled_norm"][~rasters["valid_mask_filled"]] = (
                    self.depth_transform.norm_min
                )

        # Resize
        if self.resize_to_hw is not None:
            resize_transform = transforms.Compose([
                Resize(size=max(self.resize_to_hw), interpolation=InterpolationMode.NEAREST_EXACT),
                CenterCrop(size=self.resize_to_hw)])
            rasters = {k: resize_transform(v) for k, v in rasters.items()}
        return rasters

    def _augment_data(self, rasters_dict):
        # lr flipping
        lr_flip_p = self.augm_args.lr_flip_p
        if random.random() < lr_flip_p:
            rasters_dict = {k: v.flip(-1) for k, v in rasters_dict.items()}

        return rasters_dict

    def __del__(self):
        if hasattr(self, "tar_obj") and self.tar_obj is not None:
            self.tar_obj.close()
            self.tar_obj = None


def get_pred_name(rgb_basename, name_mode, suffix=".png"):
    if DepthFileNameMode.rgb_id == name_mode:
        pred_basename = "pred_" + rgb_basename.split("_")[1]
    elif DepthFileNameMode.i_d_rgb == name_mode:
        pred_basename = rgb_basename.replace("_rgb.", "_pred.")
    elif DepthFileNameMode.id == name_mode:
        pred_basename = "pred_" + rgb_basename
    elif DepthFileNameMode.rgb_i_d == name_mode:
        pred_basename = "pred_" + "_".join(rgb_basename.split("_")[1:])
    else:
        raise NotImplementedError
    # change suffix
    pred_basename = os.path.splitext(pred_basename)[0] + suffix

    return pred_basename
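The valid-mask and far-plane handling above is the core of the training preprocessing: pixels outside (min_depth, max_depth) are marked invalid and, after normalization, pushed to the far end of the normalized range. A self-contained numeric sketch of that logic follows; the depth values, limits, and the [-1, 1] range are made up for illustration.

# Standalone sketch of the valid-mask / far-plane logic, with made-up values.
import torch

min_depth, max_depth = 1e-5, 65.0        # e.g. Hypersim-like limits (assumed here)
norm_min, norm_max = -1.0, 1.0           # normalized depth range
far_plane_at_max = True

depth = torch.tensor([[0.0, 2.5], [70.0, 10.0]])          # [H, W], metres
valid_mask = (depth > min_depth) & (depth < max_depth)    # False for 0.0 and 70.0

# Pretend depth has already been normalized to [-1, 1] by the depth transform.
depth_norm = (depth / max_depth).clamp(0, 1) * 2 - 1

# Invalid pixels are moved to the far plane so the training target is well defined.
depth_norm[~valid_mask] = norm_max if far_plane_at_max else norm_min
print(valid_mask)
print(depth_norm)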
src/dataset/base_inpaint_dataset.py
ADDED
@@ -0,0 +1,280 @@
# Last modified: 2024-04-30
#
# Copyright 2023 Bingxin Ke, ETH Zurich. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# --------------------------------------------------------------------------
# If you find this code useful, we kindly ask you to cite our paper in your work.
# Please find bibtex at: https://github.com/prs-eth/Marigold#-citation
# If you use or adapt this code, please attribute to https://github.com/prs-eth/marigold.
# More information about the method can be found at https://marigoldmonodepth.github.io
# --------------------------------------------------------------------------
import glob
import io
import json
import os
import pdb
import random
import tarfile
from enum import Enum
from typing import Union

import numpy as np
import torch
from PIL import Image
from torch.utils.data import Dataset
from torchvision.transforms import InterpolationMode, Resize, CenterCrop
import torchvision.transforms as transforms
from transformers import CLIPTextModel, CLIPTokenizer
from src.util.depth_transform import DepthNormalizerBase

from src.dataset.eval_base_dataset import DatasetMode, DepthFileNameMode
from pycocotools import mask as coco_mask
from scipy.ndimage import gaussian_filter


def read_image_from_tar(tar_obj, img_rel_path):
    image = tar_obj.extractfile("./" + img_rel_path)
    image = image.read()
    image = Image.open(io.BytesIO(image))
    return image


class BaseInpaintDataset(Dataset):
    def __init__(
        self,
        mode: DatasetMode,
        filename_ls_path: str,
        dataset_dir: str,
        disp_name: str,
        depth_transform: Union[DepthNormalizerBase, None] = None,
        tokenizer: CLIPTokenizer = None,
        augmentation_args: dict = None,
        resize_to_hw=None,
        move_invalid_to_far_plane: bool = True,
        rgb_transform=lambda x: x / 255.0 * 2 - 1,  # [0, 255] -> [-1, 1]
        **kwargs,
    ) -> None:
        super().__init__()
        self.mode = mode
        # dataset info
        self.filename_ls_path = filename_ls_path
        self.disp_name = disp_name
        # training arguments
        self.depth_transform: DepthNormalizerBase = depth_transform
        self.augm_args = augmentation_args
        self.resize_to_hw = resize_to_hw
        self.rgb_transform = rgb_transform
        self.move_invalid_to_far_plane = move_invalid_to_far_plane
        self.tokenizer = tokenizer
        # Load filenames
        self.filenames = []
        filename_paths = glob.glob(self.filename_ls_path)
        for path in filename_paths:
            with open(path, "r") as f:
                self.filenames += json.load(f)
        # Tar dataset
        self.tar_obj = None
        self.is_tar = (
            True
            if os.path.isfile(dataset_dir) and tarfile.is_tarfile(dataset_dir)
            else False
        )

    def __len__(self):
        return len(self.filenames)

    def __getitem__(self, index):
        rasters, other = self._get_data_item(index)
        if DatasetMode.TRAIN == self.mode:
            rasters = self._training_preprocess(rasters)
        # merge
        outputs = rasters
        outputs.update(other)
        return outputs

    def _get_data_item(self, index):
        rgb_path = self.filenames[index]['rgb_path']
        mask_path = None
        if 'valid_mask' in self.filenames[index]:
            mask_path = self.filenames[index]['valid_mask']
        if self.filenames[index]['caption'] is not None:
            coca_caption = self.filenames[index]['caption']['coca_caption']
            spatial_caption = self.filenames[index]['caption']['spatial_caption']
            empty_caption = ''
            caption_choices = [coca_caption, spatial_caption, empty_caption]
            probabilities = [0.4, 0.4, 0.2]
            caption = random.choices(caption_choices, probabilities)[0]
        else:
            caption = ''

        rasters = {}
        # RGB data
        rasters.update(self._load_rgb_data(rgb_path))

        try:
            anno = json.load(open(rgb_path.replace('.jpg', '.json')))['annotations']
            random.shuffle(anno)
            object_num = random.randint(5, 10)
            mask = np.array(coco_mask.decode(anno[0]['segmentation']), dtype=np.uint8)
            for single_anno in (anno[0:object_num] if len(anno) > object_num else anno):
                mask += np.array(coco_mask.decode(single_anno['segmentation']), dtype=np.uint8)
        except:
            mask = None

        a = random.random()
        if a < 0.1 or mask is None:
            mask = np.zeros(rasters['rgb_int'].shape[-2:])
            rows, cols = mask.shape
            grid_size = random.randint(5, 14)
            grid_rows, grid_cols = rows // grid_size, cols // grid_size
            for i in range(grid_rows):
                for j in range(grid_cols):
                    random_prob = np.random.rand()
                    if random_prob < 0.2:
                        row_start = i * grid_size
                        row_end = (i + 1) * grid_size
                        col_start = j * grid_size
                        col_end = (j + 1) * grid_size
                        mask[row_start:row_end, col_start:col_end] = 1

        rasters['mask'] = torch.from_numpy(mask).unsqueeze(0).to(torch.float32)

        if self.resize_to_hw is not None:
            resize_transform = transforms.Compose([
                Resize(size=max(self.resize_to_hw), interpolation=InterpolationMode.NEAREST_EXACT),
                CenterCrop(size=self.resize_to_hw)])
            rasters = {k: resize_transform(v) for k, v in rasters.items()}

        # mask = torch.zeros(rasters['rgb_int'].shape[-2:])
        # rows, cols = mask.shape
        # grid_size = random.randint(3, 10)
        # grid_rows, grid_cols = rows // grid_size, cols // grid_size
        # for i in range(grid_rows):
        #     for j in range(grid_cols):
        #         random_prob = np.random.rand()
        #         if random_prob < 0.5:
        #             row_start = i * grid_size
        #             row_end = (i + 1) * grid_size
        #             col_start = j * grid_size
        #             col_end = (j + 1) * grid_size
        #             mask[row_start:row_end, col_start:col_end] = 1

        # rasters['mask'] = mask.unsqueeze(0)

        other = {"index": index, "rgb_path": rgb_path, 'text': caption}
        return rasters, other

    def _load_rgb_data(self, rgb_path):
        # Read RGB data
        rgb = self._read_rgb_file(rgb_path)
        rgb_norm = rgb / 255.0 * 2.0 - 1.0  # [0, 255] -> [-1, 1]

        outputs = {
            "rgb_int": torch.from_numpy(rgb).int(),
            "rgb_norm": torch.from_numpy(rgb_norm).float(),
        }
        return outputs

    def _get_data_path(self, index):
        filename_line = self.filenames[index]

        # Get data path
        rgb_rel_path = filename_line[0]

        depth_rel_path, text_rel_path = None, None
        if DatasetMode.RGB_ONLY != self.mode:
            depth_rel_path = filename_line[1]
            if len(filename_line) > 2:
                text_rel_path = filename_line[2]
        return rgb_rel_path, depth_rel_path, text_rel_path

    def _read_image(self, img_path) -> np.ndarray:
        image_to_read = img_path
        image = Image.open(image_to_read)  # [H, W, rgb]
        image = np.asarray(image)
        return image

    def _read_rgb_file(self, path) -> np.ndarray:
        rgb = self._read_image(path)
        rgb = np.transpose(rgb, (2, 0, 1)).astype(int)  # [rgb, H, W]
        return rgb

    def _read_depth_file(self, path):
        depth_in = self._read_image(path)
        # Replace code below to decode depth according to dataset definition
        depth_decoded = depth_in
        return depth_decoded

    def _training_preprocess(self, rasters):
        # Augmentation
        if self.augm_args is not None:
            rasters = self._augment_data(rasters)

        # Normalization
        # rasters["depth_raw_norm"] = rasters["depth_raw_linear"] / 255.0 * 2.0 - 1.0
        # rasters["depth_filled_norm"] = rasters["depth_filled_linear"] / 255.0 * 2.0 - 1.0

        rasters["depth_raw_norm"] = self.depth_transform(
            rasters["depth_raw_linear"], rasters["valid_mask_raw"]
        ).clone()
        rasters["depth_filled_norm"] = self.depth_transform(
            rasters["depth_filled_linear"], rasters["valid_mask_filled"]
        ).clone()

        # Set invalid pixel to far plane
        if self.move_invalid_to_far_plane:
            if self.depth_transform.far_plane_at_max:
                rasters["depth_filled_norm"][~rasters["valid_mask_filled"]] = (
                    self.depth_transform.norm_max
                )
            else:
                rasters["depth_filled_norm"][~rasters["valid_mask_filled"]] = (
                    self.depth_transform.norm_min
                )

        # Resize
        if self.resize_to_hw is not None:
            resize_transform = transforms.Compose([
                Resize(size=max(self.resize_to_hw), interpolation=InterpolationMode.NEAREST_EXACT),
                CenterCrop(size=self.resize_to_hw)])
            rasters = {k: resize_transform(v) for k, v in rasters.items()}
        return rasters

    def _augment_data(self, rasters_dict):
        # lr flipping
        lr_flip_p = self.augm_args.lr_flip_p
        if random.random() < lr_flip_p:
            rasters_dict = {k: v.flip(-1) for k, v in rasters_dict.items()}

        return rasters_dict

    def __del__(self):
        if hasattr(self, "tar_obj") and self.tar_obj is not None:
            self.tar_obj.close()
            self.tar_obj = None


def get_pred_name(rgb_basename, name_mode, suffix=".png"):
    if DepthFileNameMode.rgb_id == name_mode:
        pred_basename = "pred_" + rgb_basename.split("_")[1]
    elif DepthFileNameMode.i_d_rgb == name_mode:
        pred_basename = rgb_basename.replace("_rgb.", "_pred.")
    elif DepthFileNameMode.id == name_mode:
        pred_basename = "pred_" + rgb_basename
    elif DepthFileNameMode.rgb_i_d == name_mode:
        pred_basename = "pred_" + "_".join(rgb_basename.split("_")[1:])
    else:
        raise NotImplementedError
    # change suffix
    pred_basename = os.path.splitext(pred_basename)[0] + suffix

    return pred_basename
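When no COCO-style annotations can be decoded for an image (or with 10% probability), _get_data_item above falls back to a random grid mask. A self-contained sketch of just that fallback follows; the image size is fixed here for illustration, while the cell size and drop probability match the code above.

# Standalone sketch of the random grid-mask fallback used in _get_data_item.
import random

import numpy as np

h, w = 64, 96
mask = np.zeros((h, w), dtype=np.float32)
grid_size = random.randint(5, 14)                  # square cell edge in pixels
grid_rows, grid_cols = h // grid_size, w // grid_size
for i in range(grid_rows):
    for j in range(grid_cols):
        if np.random.rand() < 0.2:                 # ~20% of cells become masked
            mask[i * grid_size:(i + 1) * grid_size,
                 j * grid_size:(j + 1) * grid_size] = 1.0

print(mask.mean())  # fraction of masked pixels, roughly 0.2 away from the borders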
src/dataset/depthanything_dataset.py
ADDED
@@ -0,0 +1,91 @@
# Last modified: 2024-02-08
#
# Copyright 2023 Bingxin Ke, ETH Zurich. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# --------------------------------------------------------------------------
# If you find this code useful, we kindly ask you to cite our paper in your work.
# Please find bibtex at: https://github.com/prs-eth/Marigold#-citation
# If you use or adapt this code, please attribute to https://github.com/prs-eth/marigold.
# More information about the method can be found at https://marigoldmonodepth.github.io
# --------------------------------------------------------------------------

from .base_depth_dataset import BaseDepthDataset, DepthFileNameMode
import torch
from torchvision.transforms import InterpolationMode, Resize, CenterCrop
import torchvision.transforms as transforms


class DepthAnythingDataset(BaseDepthDataset):
    def __init__(
        self,
        **kwargs,
    ) -> None:
        super().__init__(
            # ScanNet data parameter
            min_depth=-1,
            max_depth=256,
            has_filled_depth=False,
            name_mode=DepthFileNameMode.id,
            **kwargs,
        )

    def _read_depth_file(self, rel_path):
        depth_in = self._read_image(rel_path)
        # Decode ScanNet depth
        # depth_decoded = depth_in / 1000.0
        return depth_in

    def _training_preprocess(self, rasters):
        # Augmentation
        if self.augm_args is not None:
            rasters = self._augment_data(rasters)

        # Normalization
        rasters["depth_raw_norm"] = rasters["depth_raw_linear"] / 255.0 * 2.0 - 1.0
        rasters["depth_filled_norm"] = rasters["depth_filled_linear"] / 255.0 * 2.0 - 1.0

        # Set invalid pixel to far plane
        if self.move_invalid_to_far_plane:
            if self.depth_transform.far_plane_at_max:
                rasters["depth_filled_norm"][~rasters["valid_mask_filled"]] = (
                    self.depth_transform.norm_max
                )
            else:
                rasters["depth_filled_norm"][~rasters["valid_mask_filled"]] = (
                    self.depth_transform.norm_min
                )

        # Resize
        if self.resize_to_hw is not None:
            T = transforms.Compose([
                Resize(self.resize_to_hw[0]),
                CenterCrop(self.resize_to_hw),
            ])
            rasters = {k: T(v) for k, v in rasters.items()}
        return rasters

    # def _load_depth_data(self, depth_rel_path, filled_rel_path):
    #     # Read depth data
    #     outputs = {}
    #     depth_raw = self._read_depth_file(depth_rel_path).squeeze()
    #     depth_raw_linear = torch.from_numpy(depth_raw).float().unsqueeze(0)  # [1, H, W] [0, 255]
    #     outputs["depth_raw_linear"] = depth_raw_linear.clone()
    #
    #     if self.has_filled_depth:
    #         depth_filled = self._read_depth_file(filled_rel_path).squeeze()
    #         depth_filled_linear = torch.from_numpy(depth_filled).float().unsqueeze(0)
    #         outputs["depth_filled_linear"] = depth_filled_linear
    #     else:
    #         outputs["depth_filled_linear"] = depth_raw_linear.clone()
    #
    #     return outputs
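Unlike the base class, this subclass bypasses the DepthNormalizerBase transform and maps its stored 8-bit pseudo-depth linearly from [0, 255] to [-1, 1]; its min_depth=-1 and max_depth=256 make every 8-bit value pass the validity check. A quick, self-contained check of that mapping (values chosen only to show the endpoints and midpoint):

# Sanity check of the linear [0, 255] -> [-1, 1] mapping used in _training_preprocess.
import torch

depth_8bit = torch.tensor([0.0, 127.5, 255.0])
depth_norm = depth_8bit / 255.0 * 2.0 - 1.0
print(depth_norm)  # tensor([-1.,  0.,  1.])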
src/dataset/diode_dataset.py
ADDED
@@ -0,0 +1,91 @@
# Last modified: 2024-02-26
#
# Copyright 2023 Bingxin Ke, ETH Zurich. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# --------------------------------------------------------------------------
# If you find this code useful, we kindly ask you to cite our paper in your work.
# Please find bibtex at: https://github.com/prs-eth/Marigold#-citation
# If you use or adapt this code, please attribute to https://github.com/prs-eth/marigold.
# More information about the method can be found at https://marigoldmonodepth.github.io
# --------------------------------------------------------------------------

import os
import tarfile
from io import BytesIO

import numpy as np
import torch

from .eval_base_dataset import EvaluateBaseDataset, DepthFileNameMode, DatasetMode


class DIODEDataset(EvaluateBaseDataset):
    def __init__(
        self,
        **kwargs,
    ) -> None:
        super().__init__(
            # DIODE data parameter
            min_depth=0.6,
            max_depth=350,
            has_filled_depth=False,
            name_mode=DepthFileNameMode.id,
            **kwargs,
        )

    def _read_npy_file(self, rel_path):
        if self.is_tar:
            if self.tar_obj is None:
                self.tar_obj = tarfile.open(self.dataset_dir)
            fileobj = self.tar_obj.extractfile("./" + rel_path)
            npy_path_or_content = BytesIO(fileobj.read())
        else:
            npy_path_or_content = os.path.join(self.dataset_dir, rel_path)
        data = np.load(npy_path_or_content).squeeze()[np.newaxis, :, :]
        return data

    def _read_depth_file(self, rel_path):
        depth = self._read_npy_file(rel_path)
        return depth

    def _get_data_path(self, index):
        return self.filenames[index]

    def _get_data_item(self, index):
        # Special: depth mask is read from data

        rgb_rel_path, depth_rel_path, mask_rel_path = self._get_data_path(index=index)

        rasters = {}

        # RGB data
        rasters.update(self._load_rgb_data(rgb_rel_path=rgb_rel_path))

        # Depth data
        if DatasetMode.RGB_ONLY != self.mode:
            # load data
            depth_data = self._load_depth_data(
                depth_rel_path=depth_rel_path, filled_rel_path=None
            )
            rasters.update(depth_data)

            # valid mask
            mask = self._read_npy_file(mask_rel_path).astype(bool)
            mask = torch.from_numpy(mask).bool()
            rasters["valid_mask_raw"] = mask.clone()
            rasters["valid_mask_filled"] = mask.clone()

        other = {"index": index, "rgb_relative_path": rgb_rel_path}

        return rasters, other
src/dataset/eth3d_dataset.py
ADDED
@@ -0,0 +1,65 @@
# Last modified: 2024-02-08
#
# Copyright 2023 Bingxin Ke, ETH Zurich. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# --------------------------------------------------------------------------
# If you find this code useful, we kindly ask you to cite our paper in your work.
# Please find bibtex at: https://github.com/prs-eth/Marigold#-citation
# If you use or adapt this code, please attribute to https://github.com/prs-eth/marigold.
# More information about the method can be found at https://marigoldmonodepth.github.io
# --------------------------------------------------------------------------

import torch
import tarfile
import os
import numpy as np

from .eval_base_dataset import DepthFileNameMode, EvaluateBaseDataset


class ETH3DDataset(EvaluateBaseDataset):
    HEIGHT, WIDTH = 4032, 6048

    def __init__(
        self,
        **kwargs,
    ) -> None:
        super().__init__(
            # ETH3D data parameter
            min_depth=1e-5,
            max_depth=torch.inf,
            has_filled_depth=False,
            name_mode=DepthFileNameMode.id,
            **kwargs,
        )

    def _read_depth_file(self, rel_path):
        # Read special binary data: https://www.eth3d.net/documentation#format-of-multi-view-data-image-formats
        if self.is_tar:
            if self.tar_obj is None:
                self.tar_obj = tarfile.open(self.dataset_dir)
            binary_data = self.tar_obj.extractfile("./" + rel_path)
            binary_data = binary_data.read()

        else:
            depth_path = os.path.join(self.dataset_dir, rel_path)
            with open(depth_path, "rb") as file:
                binary_data = file.read()
        # Convert the binary data to a numpy array of 32-bit floats
        depth_decoded = np.frombuffer(binary_data, dtype=np.float32).copy()

        depth_decoded[depth_decoded == torch.inf] = 0.0

        depth_decoded = depth_decoded.reshape((self.HEIGHT, self.WIDTH))
        return depth_decoded
ADDED
@@ -0,0 +1,283 @@
|
# Last modified: 2024-04-30
#
# Copyright 2023 Bingxin Ke, ETH Zurich. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# --------------------------------------------------------------------------
# If you find this code useful, we kindly ask you to cite our paper in your work.
# Please find bibtex at: https://github.com/prs-eth/Marigold#-citation
# If you use or adapt this code, please attribute to https://github.com/prs-eth/marigold.
# More information about the method can be found at https://marigoldmonodepth.github.io
# --------------------------------------------------------------------------

import io
import os
import random
import tarfile
from enum import Enum
from typing import Union

import numpy as np
import torch
from PIL import Image
from torch.utils.data import Dataset
from torchvision.transforms import InterpolationMode, Resize

from src.util.depth_transform import DepthNormalizerBase


class DatasetMode(Enum):
    RGB_ONLY = "rgb_only"
    EVAL = "evaluate"
    TRAIN = "train"


class DepthFileNameMode(Enum):
    """Prediction file naming modes"""

    id = 1  # id.png
    rgb_id = 2  # rgb_id.png
    i_d_rgb = 3  # i_d_1_rgb.png
    rgb_i_d = 4


def read_image_from_tar(tar_obj, img_rel_path):
    image = tar_obj.extractfile("./" + img_rel_path)
    image = image.read()
    image = Image.open(io.BytesIO(image))
    return image


class EvaluateBaseDataset(Dataset):
    def __init__(
        self,
        mode: DatasetMode,
        filename_ls_path: str,
        dataset_dir: str,
        disp_name: str,
        min_depth: float,
        max_depth: float,
        has_filled_depth: bool,
        name_mode: DepthFileNameMode,
        depth_transform: Union[DepthNormalizerBase, None] = None,
        augmentation_args: dict = None,
        resize_to_hw=None,
        move_invalid_to_far_plane: bool = True,
        rgb_transform=lambda x: x / 255.0 * 2 - 1,  # [0, 255] -> [-1, 1]
        **kwargs,
    ) -> None:
        super().__init__()
        self.mode = mode
        # dataset info
        self.filename_ls_path = filename_ls_path
        self.dataset_dir = dataset_dir
        assert os.path.exists(
            self.dataset_dir
        ), f"Dataset does not exist at: {self.dataset_dir}"
        self.disp_name = disp_name
        self.has_filled_depth = has_filled_depth
        self.name_mode: DepthFileNameMode = name_mode
        self.min_depth = min_depth
        self.max_depth = max_depth

        # training arguments
        self.depth_transform: DepthNormalizerBase = depth_transform
        self.augm_args = augmentation_args
        self.resize_to_hw = resize_to_hw
        self.rgb_transform = rgb_transform
        self.move_invalid_to_far_plane = move_invalid_to_far_plane

        # Load filenames
        with open(self.filename_ls_path, "r") as f:
            self.filenames = [
                s.split() for s in f.readlines()
            ]  # [['rgb.png', 'depth.tif'], [], ...]

        # Tar dataset
        self.tar_obj = None
        self.is_tar = (
            True
            if os.path.isfile(dataset_dir) and tarfile.is_tarfile(dataset_dir)
            else False
        )

    def __len__(self):
        return len(self.filenames)

    def __getitem__(self, index):
        rasters, other = self._get_data_item(index)
        if DatasetMode.TRAIN == self.mode:
            rasters = self._training_preprocess(rasters)
        # merge
        outputs = rasters
        outputs.update(other)
        return outputs

    def _get_data_item(self, index):
        rgb_rel_path, depth_rel_path, filled_rel_path = self._get_data_path(index=index)

        rasters = {}

        # RGB data
        rasters.update(self._load_rgb_data(rgb_rel_path=rgb_rel_path))

        # Depth data
        if DatasetMode.RGB_ONLY != self.mode:
            # load data
            depth_data = self._load_depth_data(
                depth_rel_path=depth_rel_path, filled_rel_path=filled_rel_path
            )
            rasters.update(depth_data)
            # valid mask
            rasters["valid_mask_raw"] = self._get_valid_mask(
                rasters["depth_raw_linear"]
            ).clone()
            rasters["valid_mask_filled"] = self._get_valid_mask(
                rasters["depth_filled_linear"]
            ).clone()

        other = {"index": index, "rgb_relative_path": rgb_rel_path}

        return rasters, other

    def _load_rgb_data(self, rgb_rel_path):
        # Read RGB data
        rgb = self._read_rgb_file(rgb_rel_path)
        rgb_norm = rgb / 255.0 * 2.0 - 1.0  # [0, 255] -> [-1, 1]

        outputs = {
            "rgb_int": torch.from_numpy(rgb).int(),
            "rgb_norm": torch.from_numpy(rgb_norm).float(),
        }
        return outputs

    def _load_depth_data(self, depth_rel_path, filled_rel_path):
        # Read depth data
        outputs = {}
        depth_raw = self._read_depth_file(depth_rel_path).squeeze()
        depth_raw_linear = torch.from_numpy(depth_raw).float().unsqueeze(0)  # [1, H, W]
        outputs["depth_raw_linear"] = depth_raw_linear.clone()

        if self.has_filled_depth:
            depth_filled = self._read_depth_file(filled_rel_path).squeeze()
            depth_filled_linear = torch.from_numpy(depth_filled).float().unsqueeze(0)
            outputs["depth_filled_linear"] = depth_filled_linear
        else:
            outputs["depth_filled_linear"] = depth_raw_linear.clone()

        return outputs

    def _get_data_path(self, index):
        filename_line = self.filenames[index]

        # Get data path
        rgb_rel_path = filename_line[0]

        depth_rel_path, filled_rel_path = None, None
        if DatasetMode.RGB_ONLY != self.mode:
            depth_rel_path = filename_line[1]
            if self.has_filled_depth:
                filled_rel_path = filename_line[2]
        return rgb_rel_path, depth_rel_path, filled_rel_path

    def _read_image(self, img_rel_path) -> np.ndarray:
        if self.is_tar:
            if self.tar_obj is None:
                self.tar_obj = tarfile.open(self.dataset_dir)
            image_to_read = self.tar_obj.extractfile("./" + img_rel_path)
            image_to_read = image_to_read.read()
            image_to_read = io.BytesIO(image_to_read)
        else:
            image_to_read = os.path.join(self.dataset_dir, img_rel_path)
        image = Image.open(image_to_read)  # [H, W, rgb]
        image = np.asarray(image)
        return image

    def _read_rgb_file(self, rel_path) -> np.ndarray:
        rgb = self._read_image(rel_path)
        rgb = np.transpose(rgb, (2, 0, 1)).astype(int)  # [rgb, H, W]
        return rgb

    def _read_depth_file(self, rel_path):
        depth_in = self._read_image(rel_path)
        # Replace code below to decode depth according to dataset definition
        depth_decoded = depth_in

        return depth_decoded

    def _get_valid_mask(self, depth: torch.Tensor):
        valid_mask = torch.logical_and(
            (depth > self.min_depth), (depth < self.max_depth)
        ).bool()
        return valid_mask

    def _training_preprocess(self, rasters):
        # Augmentation
        if self.augm_args is not None:
            rasters = self._augment_data(rasters)

        # Normalization
        rasters["depth_raw_norm"] = self.depth_transform(
            rasters["depth_raw_linear"], rasters["valid_mask_raw"]
        ).clone()
        rasters["depth_filled_norm"] = self.depth_transform(
            rasters["depth_filled_linear"], rasters["valid_mask_filled"]
        ).clone()

        # Set invalid pixel to far plane
        if self.move_invalid_to_far_plane:
            if self.depth_transform.far_plane_at_max:
                rasters["depth_filled_norm"][~rasters["valid_mask_filled"]] = (
                    self.depth_transform.norm_max
                )
            else:
                rasters["depth_filled_norm"][~rasters["valid_mask_filled"]] = (
                    self.depth_transform.norm_min
                )

        # Resize
        if self.resize_to_hw is not None:
            resize_transform = Resize(
                size=self.resize_to_hw, interpolation=InterpolationMode.NEAREST_EXACT
            )
            rasters = {k: resize_transform(v) for k, v in rasters.items()}

        return rasters

    def _augment_data(self, rasters_dict):
        # lr flipping
        lr_flip_p = self.augm_args.lr_flip_p
        if random.random() < lr_flip_p:
            rasters_dict = {k: v.flip(-1) for k, v in rasters_dict.items()}

        return rasters_dict

    def __del__(self):
        if hasattr(self, "tar_obj") and self.tar_obj is not None:
            self.tar_obj.close()
            self.tar_obj = None


def get_pred_name(rgb_basename, name_mode, suffix=".png"):
    if DepthFileNameMode.rgb_id == name_mode:
        pred_basename = "pred_" + rgb_basename.split("_")[1]
    elif DepthFileNameMode.i_d_rgb == name_mode:
        pred_basename = rgb_basename.replace("_rgb.", "_pred.")
    elif DepthFileNameMode.id == name_mode:
        pred_basename = "pred_" + rgb_basename
    elif DepthFileNameMode.rgb_i_d == name_mode:
        pred_basename = "pred_" + "_".join(rgb_basename.split("_")[1:])
    else:
        raise NotImplementedError
    # change suffix
    pred_basename = os.path.splitext(pred_basename)[0] + suffix

    return pred_basename
src/dataset/hypersim_dataset.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Last modified: 2024-02-08
|
2 |
+
#
|
3 |
+
# Copyright 2023 Bingxin Ke, ETH Zurich. All rights reserved.
|
4 |
+
#
|
5 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
6 |
+
# you may not use this file except in compliance with the License.
|
7 |
+
# You may obtain a copy of the License at
|
8 |
+
#
|
9 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10 |
+
#
|
11 |
+
# Unless required by applicable law or agreed to in writing, software
|
12 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14 |
+
# See the License for the specific language governing permissions and
|
15 |
+
# limitations under the License.
|
16 |
+
# --------------------------------------------------------------------------
|
17 |
+
# If you find this code useful, we kindly ask you to cite our paper in your work.
|
18 |
+
# Please find bibtex at: https://github.com/prs-eth/Marigold#-citation
|
19 |
+
# If you use or adapt this code, please attribute to https://github.com/prs-eth/marigold.
|
20 |
+
# More information about the method can be found at https://marigoldmonodepth.github.io
|
21 |
+
# --------------------------------------------------------------------------
|
22 |
+
|
23 |
+
|
24 |
+
from .base_depth_dataset import BaseDepthDataset, DepthFileNameMode
|
25 |
+
|
26 |
+
class HypersimDataset(BaseDepthDataset):
|
27 |
+
def __init__(
|
28 |
+
self,
|
29 |
+
**kwargs,
|
30 |
+
) -> None:
|
31 |
+
super().__init__(
|
32 |
+
# Hypersim data parameter
|
33 |
+
min_depth=1e-5,
|
34 |
+
max_depth=65.0,
|
35 |
+
has_filled_depth=False,
|
36 |
+
name_mode=DepthFileNameMode.rgb_i_d,
|
37 |
+
**kwargs,
|
38 |
+
)
|
39 |
+
|
40 |
+
def _read_depth_file(self, rel_path):
|
41 |
+
depth_in = self._read_image(rel_path)
|
42 |
+
# Decode Hypersim depth
|
43 |
+
depth_decoded = depth_in / 1000.0
|
44 |
+
return depth_decoded
|
src/dataset/inpaint_dataset.py
ADDED
@@ -0,0 +1,286 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Last modified: 2024-04-30
|
2 |
+
#
|
3 |
+
# Copyright 2023 Bingxin Ke, ETH Zurich. All rights reserved.
|
4 |
+
#
|
5 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
6 |
+
# you may not use this file except in compliance with the License.
|
7 |
+
# You may obtain a copy of the License at
|
8 |
+
#
|
9 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10 |
+
#
|
11 |
+
# Unless required by applicable law or agreed to in writing, software
|
12 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14 |
+
# See the License for the specific language governing permissions and
|
15 |
+
# limitations under the License.
|
16 |
+
# --------------------------------------------------------------------------
|
17 |
+
# If you find this code useful, we kindly ask you to cite our paper in your work.
|
18 |
+
# Please find bibtex at: https://github.com/prs-eth/Marigold#-citation
|
19 |
+
# If you use or adapt this code, please attribute to https://github.com/prs-eth/marigold.
|
20 |
+
# More information about the method can be found at https://marigoldmonodepth.github.io
|
21 |
+
# --------------------------------------------------------------------------
|
22 |
+
import glob
|
23 |
+
import io
|
24 |
+
import json
|
25 |
+
import os
|
26 |
+
import pdb
|
27 |
+
import random
|
28 |
+
import tarfile
|
29 |
+
from enum import Enum
|
30 |
+
from typing import Union
|
31 |
+
|
32 |
+
import numpy as np
|
33 |
+
import torch
|
34 |
+
from PIL import Image
|
35 |
+
from torch.utils.data import Dataset
|
36 |
+
from torchvision.transforms import InterpolationMode, Resize, CenterCrop
|
37 |
+
import torchvision.transforms as transforms
|
38 |
+
from transformers import CLIPTextModel, CLIPTokenizer
|
39 |
+
from src.util.depth_transform import DepthNormalizerBase
|
40 |
+
import random
|
41 |
+
|
42 |
+
from src.dataset.eval_base_dataset import DatasetMode, DepthFileNameMode
|
43 |
+
|
44 |
+
|
45 |
+
def read_image_from_tar(tar_obj, img_rel_path):
|
46 |
+
image = tar_obj.extractfile("./" + img_rel_path)
|
47 |
+
image = image.read()
|
48 |
+
image = Image.open(io.BytesIO(image))
|
49 |
+
|
50 |
+
|
51 |
+
class BaseDepthDataset(Dataset):
|
52 |
+
def __init__(
|
53 |
+
self,
|
54 |
+
mode: DatasetMode,
|
55 |
+
filename_ls_path: str,
|
56 |
+
dataset_dir: str,
|
57 |
+
disp_name: str,
|
58 |
+
min_depth: float,
|
59 |
+
max_depth: float,
|
60 |
+
has_filled_depth: bool,
|
61 |
+
name_mode: DepthFileNameMode,
|
62 |
+
depth_transform: Union[DepthNormalizerBase, None] = None,
|
63 |
+
tokenizer: CLIPTokenizer = None,
|
64 |
+
augmentation_args: dict = None,
|
65 |
+
resize_to_hw=None,
|
66 |
+
move_invalid_to_far_plane: bool = True,
|
67 |
+
rgb_transform=lambda x: x / 255.0 * 2 - 1, # [0, 255] -> [-1, 1],
|
68 |
+
**kwargs,
|
69 |
+
) -> None:
|
70 |
+
super().__init__()
|
71 |
+
self.mode = mode
|
72 |
+
# dataset info
|
73 |
+
self.filename_ls_path = filename_ls_path
|
74 |
+
self.disp_name = disp_name
|
75 |
+
self.has_filled_depth = has_filled_depth
|
76 |
+
self.name_mode: DepthFileNameMode = name_mode
|
77 |
+
self.min_depth = min_depth
|
78 |
+
self.max_depth = max_depth
|
79 |
+
# training arguments
|
80 |
+
self.depth_transform: DepthNormalizerBase = depth_transform
|
81 |
+
self.augm_args = augmentation_args
|
82 |
+
self.resize_to_hw = resize_to_hw
|
83 |
+
self.rgb_transform = rgb_transform
|
84 |
+
self.move_invalid_to_far_plane = move_invalid_to_far_plane
|
85 |
+
self.tokenizer = tokenizer
|
86 |
+
# Load filenames
|
87 |
+
self.filenames = []
|
88 |
+
filename_paths = glob.glob(self.filename_ls_path)
|
89 |
+
for path in filename_paths:
|
90 |
+
with open(path, "r") as f:
|
91 |
+
self.filenames += json.load(f)
|
92 |
+
# Tar dataset
|
93 |
+
self.tar_obj = None
|
94 |
+
self.is_tar = (
|
95 |
+
True
|
96 |
+
if os.path.isfile(dataset_dir) and tarfile.is_tarfile(dataset_dir)
|
97 |
+
else False
|
98 |
+
)
|
99 |
+
|
100 |
+
def __len__(self):
|
101 |
+
return len(self.filenames)
|
102 |
+
|
103 |
+
def __getitem__(self, index):
|
104 |
+
rasters, other = self._get_data_item(index)
|
105 |
+
if DatasetMode.TRAIN == self.mode:
|
106 |
+
rasters = self._training_preprocess(rasters)
|
107 |
+
# merge
|
108 |
+
outputs = rasters
|
109 |
+
outputs.update(other)
|
110 |
+
return outputs
|
111 |
+
|
112 |
+
def _get_data_item(self, index):
|
113 |
+
rgb_path = self.filenames[index]['rgb_path']
|
114 |
+
depth_path = self.filenames[index]['depth_path']
|
115 |
+
mask_path = None
|
116 |
+
if 'valid_mask' in self.filenames[index]:
|
117 |
+
mask_path = self.filenames[index]['valid_mask']
|
118 |
+
if self.filenames[index]['caption'] is not None:
|
119 |
+
coca_caption = self.filenames[index]['caption']['coca_caption']
|
120 |
+
spatial_caption = self.filenames[index]['caption']['spatial_caption']
|
121 |
+
empty_caption = ''
|
122 |
+
caption_choices = [coca_caption, spatial_caption, empty_caption]
|
123 |
+
probabilities = [0.4, 0.4, 0.2]
|
124 |
+
caption = random.choices(caption_choices, probabilities)[0]
|
125 |
+
else:
|
126 |
+
caption = ''
|
127 |
+
|
128 |
+
rasters = {}
|
129 |
+
# RGB data
|
130 |
+
rasters.update(self._load_rgb_data(rgb_path))
|
131 |
+
|
132 |
+
# Depth data
|
133 |
+
if DatasetMode.RGB_ONLY != self.mode and depth_path is not None:
|
134 |
+
# load data
|
135 |
+
depth_data = self._load_depth_data(depth_path)
|
136 |
+
rasters.update(depth_data)
|
137 |
+
# valid mask
|
138 |
+
if mask_path is not None:
|
139 |
+
valid_mask_raw = Image.open(mask_path)
|
140 |
+
valid_mask_filled = Image.open(mask_path)
|
141 |
+
rasters["valid_mask_raw"] = torch.from_numpy(np.asarray(valid_mask_raw)).unsqueeze(0).bool()
|
142 |
+
rasters["valid_mask_filled"] = torch.from_numpy(np.asarray(valid_mask_filled)).unsqueeze(0).bool()
|
143 |
+
else:
|
144 |
+
rasters["valid_mask_raw"] = self._get_valid_mask(
|
145 |
+
rasters["depth_raw_linear"]
|
146 |
+
).clone()
|
147 |
+
rasters["valid_mask_filled"] = self._get_valid_mask(
|
148 |
+
rasters["depth_filled_linear"]
|
149 |
+
).clone()
|
150 |
+
|
151 |
+
other = {"index": index, "rgb_path": rgb_path, 'text': caption}
|
152 |
+
|
153 |
+
if self.resize_to_hw is not None:
|
154 |
+
resize_transform = transforms.Compose([
|
155 |
+
Resize(size=max(self.resize_to_hw), interpolation=InterpolationMode.NEAREST_EXACT),
|
156 |
+
CenterCrop(size=self.resize_to_hw)])
|
157 |
+
rasters = {k: resize_transform(v) for k, v in rasters.items()}
|
158 |
+
|
159 |
+
return rasters, other
|
160 |
+
|
161 |
+
def _load_rgb_data(self, rgb_path):
|
162 |
+
# Read RGB data
|
163 |
+
rgb = self._read_rgb_file(rgb_path)
|
164 |
+
rgb_norm = rgb / 255.0 * 2.0 - 1.0 # [0, 255] -> [-1, 1]
|
165 |
+
|
166 |
+
outputs = {
|
167 |
+
"rgb_int": torch.from_numpy(rgb).int(),
|
168 |
+
"rgb_norm": torch.from_numpy(rgb_norm).float(),
|
169 |
+
}
|
170 |
+
return outputs
|
171 |
+
|
172 |
+
def _load_depth_data(self, depth_path, filled_rel_path=None):
|
173 |
+
# Read depth data
|
174 |
+
outputs = {}
|
175 |
+
depth_raw = self._read_depth_file(depth_path).squeeze()
|
176 |
+
depth_raw_linear = torch.from_numpy(depth_raw.copy()).float().unsqueeze(0) # [1, H, W]
|
177 |
+
outputs["depth_raw_linear"] = depth_raw_linear.clone()
|
178 |
+
|
179 |
+
if self.has_filled_depth:
|
180 |
+
depth_filled = self._read_depth_file(filled_rel_path).squeeze()
|
181 |
+
depth_filled_linear = torch.from_numpy(depth_filled).float().unsqueeze(0)
|
182 |
+
outputs["depth_filled_linear"] = depth_filled_linear
|
183 |
+
else:
|
184 |
+
outputs["depth_filled_linear"] = depth_raw_linear.clone()
|
185 |
+
|
186 |
+
return outputs
|
187 |
+
|
188 |
+
def _get_data_path(self, index):
|
189 |
+
filename_line = self.filenames[index]
|
190 |
+
|
191 |
+
# Get data path
|
192 |
+
rgb_rel_path = filename_line[0]
|
193 |
+
|
194 |
+
depth_rel_path, text_rel_path = None, None
|
195 |
+
if DatasetMode.RGB_ONLY != self.mode:
|
196 |
+
depth_rel_path = filename_line[1]
|
197 |
+
if len(filename_line) > 2:
|
198 |
+
text_rel_path = filename_line[2]
|
199 |
+
return rgb_rel_path, depth_rel_path, text_rel_path
|
200 |
+
|
201 |
+
def _read_image(self, img_path) -> np.ndarray:
|
202 |
+
image_to_read = img_path
|
203 |
+
image = Image.open(image_to_read) # [H, W, rgb]
|
204 |
+
image = np.asarray(image)
|
205 |
+
return image
|
206 |
+
|
207 |
+
def _read_rgb_file(self, path) -> np.ndarray:
|
208 |
+
rgb = self._read_image(path)
|
209 |
+
rgb = np.transpose(rgb, (2, 0, 1)).astype(int) # [rgb, H, W]
|
210 |
+
return rgb
|
211 |
+
|
212 |
+
def _read_depth_file(self, path):
|
213 |
+
depth_in = self._read_image(path)
|
214 |
+
# Replace code below to decode depth according to dataset definition
|
215 |
+
depth_decoded = depth_in
|
216 |
+
return depth_decoded
|
217 |
+
|
218 |
+
def _get_valid_mask(self, depth: torch.Tensor):
|
219 |
+
valid_mask = torch.logical_and(
|
220 |
+
(depth > self.min_depth), (depth < self.max_depth)
|
221 |
+
).bool()
|
222 |
+
return valid_mask
|
223 |
+
|
224 |
+
def _training_preprocess(self, rasters):
|
225 |
+
# Augmentation
|
226 |
+
if self.augm_args is not None:
|
227 |
+
rasters = self._augment_data(rasters)
|
228 |
+
|
229 |
+
# Normalization
|
230 |
+
# rasters["depth_raw_norm"] = rasters["depth_raw_linear"] / 255.0 * 2.0 - 1.0
|
231 |
+
# rasters["depth_filled_norm"] = rasters["depth_filled_linear"] / 255.0 * 2.0 - 1.0
|
232 |
+
|
233 |
+
rasters["depth_raw_norm"] = self.depth_transform(
|
234 |
+
rasters["depth_raw_linear"], rasters["valid_mask_raw"]
|
235 |
+
).clone()
|
236 |
+
rasters["depth_filled_norm"] = self.depth_transform(
|
237 |
+
rasters["depth_filled_linear"], rasters["valid_mask_filled"]
|
238 |
+
).clone()
|
239 |
+
|
240 |
+
# Set invalid pixel to far plane
|
241 |
+
if self.move_invalid_to_far_plane:
|
242 |
+
if self.depth_transform.far_plane_at_max:
|
243 |
+
rasters["depth_filled_norm"][~rasters["valid_mask_filled"]] = (
|
244 |
+
self.depth_transform.norm_max
|
245 |
+
)
|
246 |
+
else:
|
247 |
+
rasters["depth_filled_norm"][~rasters["valid_mask_filled"]] = (
|
248 |
+
self.depth_transform.norm_min
|
249 |
+
)
|
250 |
+
|
251 |
+
# Resize
|
252 |
+
if self.resize_to_hw is not None:
|
253 |
+
resize_transform = transforms.Compose([
|
254 |
+
Resize(size=max(self.resize_to_hw), interpolation=InterpolationMode.NEAREST_EXACT),
|
255 |
+
CenterCrop(size=self.resize_to_hw)])
|
256 |
+
rasters = {k: resize_transform(v) for k, v in rasters.items()}
|
257 |
+
return rasters
|
258 |
+
|
259 |
+
def _augment_data(self, rasters_dict):
|
260 |
+
# lr flipping
|
261 |
+
lr_flip_p = self.augm_args.lr_flip_p
|
262 |
+
if random.random() < lr_flip_p:
|
263 |
+
rasters_dict = {k: v.flip(-1) for k, v in rasters_dict.items()}
|
264 |
+
|
265 |
+
return rasters_dict
|
266 |
+
|
267 |
+
def __del__(self):
|
268 |
+
if hasattr(self, "tar_obj") and self.tar_obj is not None:
|
269 |
+
self.tar_obj.close()
|
270 |
+
self.tar_obj = None
|
271 |
+
|
272 |
+
def get_pred_name(rgb_basename, name_mode, suffix=".png"):
|
273 |
+
if DepthFileNameMode.rgb_id == name_mode:
|
274 |
+
pred_basename = "pred_" + rgb_basename.split("_")[1]
|
275 |
+
elif DepthFileNameMode.i_d_rgb == name_mode:
|
276 |
+
pred_basename = rgb_basename.replace("_rgb.", "_pred.")
|
277 |
+
elif DepthFileNameMode.id == name_mode:
|
278 |
+
pred_basename = "pred_" + rgb_basename
|
279 |
+
elif DepthFileNameMode.rgb_i_d == name_mode:
|
280 |
+
pred_basename = "pred_" + "_".join(rgb_basename.split("_")[1:])
|
281 |
+
else:
|
282 |
+
raise NotImplementedError
|
283 |
+
# change suffix
|
284 |
+
pred_basename = os.path.splitext(pred_basename)[0] + suffix
|
285 |
+
|
286 |
+
return pred_basename
|
src/dataset/kitti_dataset.py
ADDED
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Last modified: 2024-02-08
|
2 |
+
#
|
3 |
+
# Copyright 2023 Bingxin Ke, ETH Zurich. All rights reserved.
|
4 |
+
#
|
5 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
6 |
+
# you may not use this file except in compliance with the License.
|
7 |
+
# You may obtain a copy of the License at
|
8 |
+
#
|
9 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10 |
+
#
|
11 |
+
# Unless required by applicable law or agreed to in writing, software
|
12 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14 |
+
# See the License for the specific language governing permissions and
|
15 |
+
# limitations under the License.
|
16 |
+
# --------------------------------------------------------------------------
|
17 |
+
# If you find this code useful, we kindly ask you to cite our paper in your work.
|
18 |
+
# Please find bibtex at: https://github.com/prs-eth/Marigold#-citation
|
19 |
+
# If you use or adapt this code, please attribute to https://github.com/prs-eth/marigold.
|
20 |
+
# More information about the method can be found at https://marigoldmonodepth.github.io
|
21 |
+
# --------------------------------------------------------------------------
|
22 |
+
|
23 |
+
import torch
|
24 |
+
|
25 |
+
from .eval_base_dataset import DepthFileNameMode, EvaluateBaseDataset
|
26 |
+
|
27 |
+
|
28 |
+
class KITTIDataset(EvaluateBaseDataset):
|
29 |
+
def __init__(
|
30 |
+
self,
|
31 |
+
kitti_bm_crop, # Crop to KITTI benchmark size
|
32 |
+
valid_mask_crop, # Evaluation mask. [None, garg or eigen]
|
33 |
+
**kwargs,
|
34 |
+
) -> None:
|
35 |
+
super().__init__(
|
36 |
+
# KITTI data parameter
|
37 |
+
min_depth=1e-5,
|
38 |
+
max_depth=80,
|
39 |
+
has_filled_depth=False,
|
40 |
+
name_mode=DepthFileNameMode.id,
|
41 |
+
**kwargs,
|
42 |
+
)
|
43 |
+
self.kitti_bm_crop = kitti_bm_crop
|
44 |
+
self.valid_mask_crop = valid_mask_crop
|
45 |
+
assert self.valid_mask_crop in [
|
46 |
+
None,
|
47 |
+
"garg", # set evaluation mask according to Garg ECCV16
|
48 |
+
"eigen", # set evaluation mask according to Eigen NIPS14
|
49 |
+
], f"Unknown crop type: {self.valid_mask_crop}"
|
50 |
+
|
51 |
+
# Filter out empty depth
|
52 |
+
self.filenames = [f for f in self.filenames if "None" != f[1]]
|
53 |
+
|
54 |
+
def _read_depth_file(self, rel_path):
|
55 |
+
depth_in = self._read_image(rel_path)
|
56 |
+
# Decode KITTI depth
|
57 |
+
depth_decoded = depth_in / 256.0
|
58 |
+
return depth_decoded
|
59 |
+
|
60 |
+
def _load_rgb_data(self, rgb_rel_path):
|
61 |
+
rgb_data = super()._load_rgb_data(rgb_rel_path)
|
62 |
+
if self.kitti_bm_crop:
|
63 |
+
rgb_data = {k: self.kitti_benchmark_crop(v) for k, v in rgb_data.items()}
|
64 |
+
return rgb_data
|
65 |
+
|
66 |
+
def _load_depth_data(self, depth_rel_path, filled_rel_path):
|
67 |
+
depth_data = super()._load_depth_data(depth_rel_path, filled_rel_path)
|
68 |
+
if self.kitti_bm_crop:
|
69 |
+
depth_data = {
|
70 |
+
k: self.kitti_benchmark_crop(v) for k, v in depth_data.items()
|
71 |
+
}
|
72 |
+
return depth_data
|
73 |
+
|
74 |
+
@staticmethod
|
75 |
+
def kitti_benchmark_crop(input_img):
|
76 |
+
"""
|
77 |
+
Crop images to KITTI benchmark size
|
78 |
+
Args:
|
79 |
+
`input_img` (torch.Tensor): Input image to be cropped.
|
80 |
+
|
81 |
+
Returns:
|
82 |
+
torch.Tensor:Cropped image.
|
83 |
+
"""
|
84 |
+
KB_CROP_HEIGHT = 352
|
85 |
+
KB_CROP_WIDTH = 1216
|
86 |
+
|
87 |
+
height, width = input_img.shape[-2:]
|
88 |
+
top_margin = int(height - KB_CROP_HEIGHT)
|
89 |
+
left_margin = int((width - KB_CROP_WIDTH) / 2)
|
90 |
+
if 2 == len(input_img.shape):
|
91 |
+
out = input_img[
|
92 |
+
top_margin : top_margin + KB_CROP_HEIGHT,
|
93 |
+
left_margin : left_margin + KB_CROP_WIDTH,
|
94 |
+
]
|
95 |
+
elif 3 == len(input_img.shape):
|
96 |
+
out = input_img[
|
97 |
+
:,
|
98 |
+
top_margin : top_margin + KB_CROP_HEIGHT,
|
99 |
+
left_margin : left_margin + KB_CROP_WIDTH,
|
100 |
+
]
|
101 |
+
return out
|
102 |
+
|
103 |
+
def _get_valid_mask(self, depth: torch.Tensor):
|
104 |
+
# reference: https://github.com/cleinc/bts/blob/master/pytorch/bts_eval.py
|
105 |
+
valid_mask = super()._get_valid_mask(depth) # [1, H, W]
|
106 |
+
|
107 |
+
if self.valid_mask_crop is not None:
|
108 |
+
eval_mask = torch.zeros_like(valid_mask.squeeze()).bool()
|
109 |
+
gt_height, gt_width = eval_mask.shape
|
110 |
+
|
111 |
+
if "garg" == self.valid_mask_crop:
|
112 |
+
eval_mask[
|
113 |
+
int(0.40810811 * gt_height) : int(0.99189189 * gt_height),
|
114 |
+
int(0.03594771 * gt_width) : int(0.96405229 * gt_width),
|
115 |
+
] = 1
|
116 |
+
elif "eigen" == self.valid_mask_crop:
|
117 |
+
eval_mask[
|
118 |
+
int(0.3324324 * gt_height) : int(0.91351351 * gt_height),
|
119 |
+
int(0.0359477 * gt_width) : int(0.96405229 * gt_width),
|
120 |
+
] = 1
|
121 |
+
|
122 |
+
eval_mask.reshape(valid_mask.shape)
|
123 |
+
valid_mask = torch.logical_and(valid_mask, eval_mask)
|
124 |
+
return valid_mask
|
src/dataset/mixed_sampler.py
ADDED
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Last modified: 2024-04-18
|
2 |
+
#
|
3 |
+
# Copyright 2023 Bingxin Ke, ETH Zurich. All rights reserved.
|
4 |
+
#
|
5 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
6 |
+
# you may not use this file except in compliance with the License.
|
7 |
+
# You may obtain a copy of the License at
|
8 |
+
#
|
9 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10 |
+
#
|
11 |
+
# Unless required by applicable law or agreed to in writing, software
|
12 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14 |
+
# See the License for the specific language governing permissions and
|
15 |
+
# limitations under the License.
|
16 |
+
# --------------------------------------------------------------------------
|
17 |
+
# If you find this code useful, we kindly ask you to cite our paper in your work.
|
18 |
+
# Please find bibtex at: https://github.com/prs-eth/Marigold#-citation
|
19 |
+
# If you use or adapt this code, please attribute to https://github.com/prs-eth/marigold.
|
20 |
+
# More information about the method can be found at https://marigoldmonodepth.github.io
|
21 |
+
# --------------------------------------------------------------------------
|
22 |
+
|
23 |
+
import torch
|
24 |
+
from torch.utils.data import (
|
25 |
+
BatchSampler,
|
26 |
+
RandomSampler,
|
27 |
+
SequentialSampler,
|
28 |
+
)
|
29 |
+
|
30 |
+
|
31 |
+
class MixedBatchSampler(BatchSampler):
|
32 |
+
"""Sample one batch from a selected dataset with given probability.
|
33 |
+
Compatible with datasets at different resolution
|
34 |
+
"""
|
35 |
+
|
36 |
+
def __init__(
|
37 |
+
self, src_dataset_ls, batch_size, drop_last, shuffle, prob=None, generator=None
|
38 |
+
):
|
39 |
+
self.base_sampler = None
|
40 |
+
self.batch_size = batch_size
|
41 |
+
self.shuffle = shuffle
|
42 |
+
self.drop_last = drop_last
|
43 |
+
self.generator = generator
|
44 |
+
|
45 |
+
self.src_dataset_ls = src_dataset_ls
|
46 |
+
self.n_dataset = len(self.src_dataset_ls)
|
47 |
+
|
48 |
+
# Dataset length
|
49 |
+
self.dataset_length = [len(ds) for ds in self.src_dataset_ls]
|
50 |
+
self.cum_dataset_length = [
|
51 |
+
sum(self.dataset_length[:i]) for i in range(self.n_dataset)
|
52 |
+
] # cumulative dataset length
|
53 |
+
|
54 |
+
# BatchSamplers for each source dataset
|
55 |
+
if self.shuffle:
|
56 |
+
self.src_batch_samplers = [
|
57 |
+
BatchSampler(
|
58 |
+
sampler=RandomSampler(
|
59 |
+
ds, replacement=False, generator=self.generator
|
60 |
+
),
|
61 |
+
batch_size=self.batch_size,
|
62 |
+
drop_last=self.drop_last,
|
63 |
+
)
|
64 |
+
for ds in self.src_dataset_ls
|
65 |
+
]
|
66 |
+
else:
|
67 |
+
self.src_batch_samplers = [
|
68 |
+
BatchSampler(
|
69 |
+
sampler=SequentialSampler(ds),
|
70 |
+
batch_size=self.batch_size,
|
71 |
+
drop_last=self.drop_last,
|
72 |
+
)
|
73 |
+
for ds in self.src_dataset_ls
|
74 |
+
]
|
75 |
+
self.raw_batches = [
|
76 |
+
list(bs) for bs in self.src_batch_samplers
|
77 |
+
] # index in original dataset
|
78 |
+
self.n_batches = [len(b) for b in self.raw_batches]
|
79 |
+
self.n_total_batch = sum(self.n_batches)
|
80 |
+
|
81 |
+
# sampling probability
|
82 |
+
if prob is None:
|
83 |
+
# if not given, decide by dataset length
|
84 |
+
self.prob = torch.tensor(self.n_batches) / self.n_total_batch
|
85 |
+
else:
|
86 |
+
self.prob = torch.as_tensor(prob)
|
87 |
+
|
88 |
+
def __iter__(self):
|
89 |
+
"""_summary_
|
90 |
+
|
91 |
+
Yields:
|
92 |
+
list(int): a batch of indics, corresponding to ConcatDataset of src_dataset_ls
|
93 |
+
"""
|
94 |
+
for _ in range(self.n_total_batch):
|
95 |
+
idx_ds = torch.multinomial(
|
96 |
+
self.prob, 1, replacement=True, generator=self.generator
|
97 |
+
).item()
|
98 |
+
# if batch list is empty, generate new list
|
99 |
+
if 0 == len(self.raw_batches[idx_ds]):
|
100 |
+
self.raw_batches[idx_ds] = list(self.src_batch_samplers[idx_ds])
|
101 |
+
# get a batch from list
|
102 |
+
batch_raw = self.raw_batches[idx_ds].pop()
|
103 |
+
# shift by cumulative dataset length
|
104 |
+
shift = self.cum_dataset_length[idx_ds]
|
105 |
+
batch = [n + shift for n in batch_raw]
|
106 |
+
|
107 |
+
yield batch
|
108 |
+
|
109 |
+
def __len__(self):
|
110 |
+
return self.n_total_batch
|
111 |
+
|
112 |
+
|
113 |
+
# Unit test
|
114 |
+
if "__main__" == __name__:
|
115 |
+
from torch.utils.data import ConcatDataset, DataLoader, Dataset
|
116 |
+
|
117 |
+
class SimpleDataset(Dataset):
|
118 |
+
def __init__(self, start, len) -> None:
|
119 |
+
super().__init__()
|
120 |
+
self.start = start
|
121 |
+
self.len = len
|
122 |
+
|
123 |
+
def __len__(self):
|
124 |
+
return self.len
|
125 |
+
|
126 |
+
def __getitem__(self, index):
|
127 |
+
return self.start + index
|
128 |
+
|
129 |
+
dataset_1 = SimpleDataset(0, 10)
|
130 |
+
dataset_2 = SimpleDataset(200, 20)
|
131 |
+
dataset_3 = SimpleDataset(1000, 50)
|
132 |
+
|
133 |
+
concat_dataset = ConcatDataset(
|
134 |
+
[dataset_1, dataset_2, dataset_3]
|
135 |
+
) # will directly concatenate
|
136 |
+
|
137 |
+
mixed_sampler = MixedBatchSampler(
|
138 |
+
src_dataset_ls=[dataset_1, dataset_2, dataset_3],
|
139 |
+
batch_size=4,
|
140 |
+
drop_last=True,
|
141 |
+
shuffle=False,
|
142 |
+
prob=[0.6, 0.3, 0.1],
|
143 |
+
generator=torch.Generator().manual_seed(0),
|
144 |
+
)
|
145 |
+
|
146 |
+
loader = DataLoader(concat_dataset, batch_sampler=mixed_sampler)
|
147 |
+
|
148 |
+
for d in loader:
|
149 |
+
print(d)
|
src/dataset/nyu_dataset.py
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Last modified: 2024-02-08
|
2 |
+
#
|
3 |
+
# Copyright 2023 Bingxin Ke, ETH Zurich. All rights reserved.
|
4 |
+
#
|
5 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
6 |
+
# you may not use this file except in compliance with the License.
|
7 |
+
# You may obtain a copy of the License at
|
8 |
+
#
|
9 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10 |
+
#
|
11 |
+
# Unless required by applicable law or agreed to in writing, software
|
12 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14 |
+
# See the License for the specific language governing permissions and
|
15 |
+
# limitations under the License.
|
16 |
+
# --------------------------------------------------------------------------
|
17 |
+
# If you find this code useful, we kindly ask you to cite our paper in your work.
|
18 |
+
# Please find bibtex at: https://github.com/prs-eth/Marigold#-citation
|
19 |
+
# If you use or adapt this code, please attribute to https://github.com/prs-eth/marigold.
|
20 |
+
# More information about the method can be found at https://marigoldmonodepth.github.io
|
21 |
+
# --------------------------------------------------------------------------
|
22 |
+
|
23 |
+
import torch
|
24 |
+
|
25 |
+
from .eval_base_dataset import DepthFileNameMode, EvaluateBaseDataset
|
26 |
+
|
27 |
+
|
28 |
+
class NYUDataset(EvaluateBaseDataset):
|
29 |
+
def __init__(
|
30 |
+
self,
|
31 |
+
eigen_valid_mask: bool,
|
32 |
+
**kwargs,
|
33 |
+
) -> None:
|
34 |
+
super().__init__(
|
35 |
+
# NYUv2 dataset parameter
|
36 |
+
min_depth=1e-3,
|
37 |
+
max_depth=10.0,
|
38 |
+
has_filled_depth=True,
|
39 |
+
name_mode=DepthFileNameMode.rgb_id,
|
40 |
+
**kwargs,
|
41 |
+
)
|
42 |
+
|
43 |
+
self.eigen_valid_mask = eigen_valid_mask
|
44 |
+
|
45 |
+
def _read_depth_file(self, rel_path):
|
46 |
+
depth_in = self._read_image(rel_path)
|
47 |
+
# Decode NYU depth
|
48 |
+
depth_decoded = depth_in / 1000.0
|
49 |
+
return depth_decoded
|
50 |
+
|
51 |
+
def _get_valid_mask(self, depth: torch.Tensor):
|
52 |
+
valid_mask = super()._get_valid_mask(depth)
|
53 |
+
|
54 |
+
# Eigen crop for evaluation
|
55 |
+
if self.eigen_valid_mask:
|
56 |
+
eval_mask = torch.zeros_like(valid_mask.squeeze()).bool()
|
57 |
+
eval_mask[45:471, 41:601] = 1
|
58 |
+
eval_mask.reshape(valid_mask.shape)
|
59 |
+
valid_mask = torch.logical_and(valid_mask, eval_mask)
|
60 |
+
|
61 |
+
return valid_mask
|
src/dataset/scannet_dataset.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Last modified: 2024-02-08
|
2 |
+
#
|
3 |
+
# Copyright 2023 Bingxin Ke, ETH Zurich. All rights reserved.
|
4 |
+
#
|
5 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
6 |
+
# you may not use this file except in compliance with the License.
|
7 |
+
# You may obtain a copy of the License at
|
8 |
+
#
|
9 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10 |
+
#
|
11 |
+
# Unless required by applicable law or agreed to in writing, software
|
12 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14 |
+
# See the License for the specific language governing permissions and
|
15 |
+
# limitations under the License.
|
16 |
+
# --------------------------------------------------------------------------
|
17 |
+
# If you find this code useful, we kindly ask you to cite our paper in your work.
|
18 |
+
# Please find bibtex at: https://github.com/prs-eth/Marigold#-citation
|
19 |
+
# If you use or adapt this code, please attribute to https://github.com/prs-eth/marigold.
|
20 |
+
# More information about the method can be found at https://marigoldmonodepth.github.io
|
21 |
+
# --------------------------------------------------------------------------
|
22 |
+
|
23 |
+
from .eval_base_dataset import DepthFileNameMode, EvaluateBaseDataset
|
24 |
+
|
25 |
+
|
26 |
+
class ScanNetDataset(EvaluateBaseDataset):
|
27 |
+
def __init__(
|
28 |
+
self,
|
29 |
+
**kwargs,
|
30 |
+
) -> None:
|
31 |
+
super().__init__(
|
32 |
+
# ScanNet data parameter
|
33 |
+
min_depth=1e-3,
|
34 |
+
max_depth=10,
|
35 |
+
has_filled_depth=False,
|
36 |
+
name_mode=DepthFileNameMode.id,
|
37 |
+
**kwargs,
|
38 |
+
)
|
39 |
+
|
40 |
+
def _read_depth_file(self, rel_path):
|
41 |
+
depth_in = self._read_image(rel_path)
|
42 |
+
# Decode ScanNet depth
|
43 |
+
depth_decoded = depth_in / 1000.0
|
44 |
+
return depth_decoded
|
src/dataset/vkitti_dataset.py
ADDED
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Last modified: 2024-02-08
|
2 |
+
#
|
3 |
+
# Copyright 2023 Bingxin Ke, ETH Zurich. All rights reserved.
|
4 |
+
#
|
5 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
6 |
+
# you may not use this file except in compliance with the License.
|
7 |
+
# You may obtain a copy of the License at
|
8 |
+
#
|
9 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10 |
+
#
|
11 |
+
# Unless required by applicable law or agreed to in writing, software
|
12 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14 |
+
# See the License for the specific language governing permissions and
|
15 |
+
# limitations under the License.
|
16 |
+
# --------------------------------------------------------------------------
|
17 |
+
# If you find this code useful, we kindly ask you to cite our paper in your work.
|
18 |
+
# Please find bibtex at: https://github.com/prs-eth/Marigold#-citation
|
19 |
+
# If you use or adapt this code, please attribute to https://github.com/prs-eth/marigold.
|
20 |
+
# More information about the method can be found at https://marigoldmonodepth.github.io
|
21 |
+
# --------------------------------------------------------------------------
|
22 |
+
|
23 |
+
import torch
|
24 |
+
|
25 |
+
from .base_depth_dataset import BaseDepthDataset, DepthFileNameMode
|
26 |
+
from .kitti_dataset import KITTIDataset
|
27 |
+
|
28 |
+
class VirtualKITTIDataset(BaseDepthDataset):
|
29 |
+
def __init__(
|
30 |
+
self,
|
31 |
+
kitti_bm_crop, # Crop to KITTI benchmark size
|
32 |
+
valid_mask_crop, # Evaluation mask. [None, garg or eigen]
|
33 |
+
**kwargs,
|
34 |
+
) -> None:
|
35 |
+
super().__init__(
|
36 |
+
# virtual KITTI data parameter
|
37 |
+
min_depth=1e-5,
|
38 |
+
max_depth=80, # 655.35
|
39 |
+
has_filled_depth=False,
|
40 |
+
name_mode=DepthFileNameMode.id,
|
41 |
+
**kwargs,
|
42 |
+
)
|
43 |
+
self.kitti_bm_crop = kitti_bm_crop
|
44 |
+
self.valid_mask_crop = valid_mask_crop
|
45 |
+
assert self.valid_mask_crop in [
|
46 |
+
None,
|
47 |
+
"garg", # set evaluation mask according to Garg ECCV16
|
48 |
+
"eigen", # set evaluation mask according to Eigen NIPS14
|
49 |
+
], f"Unknown crop type: {self.valid_mask_crop}"
|
50 |
+
|
51 |
+
# Filter out empty depth
|
52 |
+
self.filenames = self.filenames
|
53 |
+
|
54 |
+
def _read_depth_file(self, rel_path):
|
55 |
+
depth_in = self._read_image(rel_path)
|
56 |
+
# Decode vKITTI depth
|
57 |
+
depth_decoded = depth_in / 100.0
|
58 |
+
return depth_decoded
|
59 |
+
|
60 |
+
def _load_rgb_data(self, rgb_rel_path):
|
61 |
+
rgb_data = super()._load_rgb_data(rgb_rel_path)
|
62 |
+
if self.kitti_bm_crop:
|
63 |
+
rgb_data = {
|
64 |
+
k: KITTIDataset.kitti_benchmark_crop(v) for k, v in rgb_data.items()
|
65 |
+
}
|
66 |
+
return rgb_data
|
67 |
+
|
68 |
+
def _load_depth_data(self, depth_rel_path, filled_rel_path=None):
|
69 |
+
depth_data = super()._load_depth_data(depth_rel_path, filled_rel_path)
|
70 |
+
if self.kitti_bm_crop:
|
71 |
+
depth_data = {
|
72 |
+
k: KITTIDataset.kitti_benchmark_crop(v) for k, v in depth_data.items()
|
73 |
+
}
|
74 |
+
return depth_data
|
75 |
+
|
76 |
+
def _get_valid_mask(self, depth: torch.Tensor):
|
77 |
+
# reference: https://github.com/cleinc/bts/blob/master/pytorch/bts_eval.py
|
78 |
+
valid_mask = super()._get_valid_mask(depth) # [1, H, W]
|
79 |
+
|
80 |
+
if self.valid_mask_crop is not None:
|
81 |
+
eval_mask = torch.zeros_like(valid_mask.squeeze()).bool()
|
82 |
+
gt_height, gt_width = eval_mask.shape
|
83 |
+
|
84 |
+
if "garg" == self.valid_mask_crop:
|
85 |
+
eval_mask[
|
86 |
+
int(0.40810811 * gt_height) : int(0.99189189 * gt_height),
|
87 |
+
int(0.03594771 * gt_width) : int(0.96405229 * gt_width),
|
88 |
+
] = 1
|
89 |
+
elif "eigen" == self.valid_mask_crop:
|
90 |
+
eval_mask[
|
91 |
+
int(0.3324324 * gt_height) : int(0.91351351 * gt_height),
|
92 |
+
int(0.0359477 * gt_width) : int(0.96405229 * gt_width),
|
93 |
+
] = 1
|
94 |
+
|
95 |
+
eval_mask.reshape(valid_mask.shape)
|
96 |
+
valid_mask = torch.logical_and(valid_mask, eval_mask)
|
97 |
+
return valid_mask
|
src/trainer/__init__.py
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Author: Bingxin Ke
|
2 |
+
# Last modified: 2024-05-17
|
3 |
+
|
4 |
+
from .marigold_trainer import MarigoldTrainer
|
5 |
+
from .marigold_xl_trainer import MarigoldXLTrainer
|
6 |
+
from .marigold_inpaint_trainer import MarigoldInpaintTrainer
|
7 |
+
|
8 |
+
trainer_cls_name_dict = {
|
9 |
+
"MarigoldTrainer": MarigoldTrainer,
|
10 |
+
"MarigoldXLTrainer": MarigoldXLTrainer,
|
11 |
+
"MarigoldInpaintTrainer": MarigoldInpaintTrainer
|
12 |
+
}
|
13 |
+
|
14 |
+
|
15 |
+
def get_trainer_cls(trainer_name):
|
16 |
+
return trainer_cls_name_dict[trainer_name]
|
src/trainer/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (519 Bytes). View file
|
|
src/trainer/__pycache__/marigold_inpaint_trainer.cpython-310.pyc
ADDED
Binary file (17.2 kB). View file
|
|
src/trainer/__pycache__/marigold_trainer.cpython-310.pyc
ADDED
Binary file (22.5 kB). View file
|
|
src/trainer/__pycache__/marigold_xl_trainer.cpython-310.pyc
ADDED
Binary file (22.4 kB). View file
|
|
src/trainer/marigold_inpaint_trainer.py
ADDED
@@ -0,0 +1,665 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# An official reimplemented version of Marigold training script.
|
2 |
+
# Last modified: 2024-04-29
|
3 |
+
#
|
4 |
+
# Copyright 2023 Bingxin Ke, ETH Zurich. All rights reserved.
|
5 |
+
#
|
6 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
7 |
+
# you may not use this file except in compliance with the License.
|
8 |
+
# You may obtain a copy of the License at
|
9 |
+
#
|
10 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
11 |
+
#
|
12 |
+
# Unless required by applicable law or agreed to in writing, software
|
13 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
14 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
15 |
+
# See the License for the specific language governing permissions and
|
16 |
+
# limitations under the License.
|
17 |
+
# --------------------------------------------------------------------------
|
18 |
+
# If you find this code useful, we kindly ask you to cite our paper in your work.
|
19 |
+
# Please find bibtex at: https://github.com/prs-eth/Marigold#-citation
|
20 |
+
# If you use or adapt this code, please attribute to https://github.com/prs-eth/marigold.
|
21 |
+
# More information about the method can be found at https://marigoldmonodepth.github.io
|
22 |
+
# --------------------------------------------------------------------------
|
23 |
+
from diffusers import StableDiffusionInpaintPipeline
|
24 |
+
import logging
|
25 |
+
import os
|
26 |
+
import pdb
|
27 |
+
import cv2
|
28 |
+
import shutil
|
29 |
+
import json
|
30 |
+
from pycocotools import mask as coco_mask
|
31 |
+
from datetime import datetime
|
32 |
+
from typing import List, Union
|
33 |
+
import random
|
34 |
+
import safetensors
|
35 |
+
import numpy as np
|
36 |
+
import torch
|
37 |
+
from diffusers import DDPMScheduler
|
38 |
+
from omegaconf import OmegaConf
|
39 |
+
from torch.nn import Conv2d
|
40 |
+
from torch.nn.parameter import Parameter
|
41 |
+
from torch.optim import Adam
|
42 |
+
from torch.optim.lr_scheduler import LambdaLR
|
43 |
+
from torch.utils.data import DataLoader, Dataset
|
44 |
+
from tqdm import tqdm
|
45 |
+
from PIL import Image
|
46 |
+
# import torch.optim.lr_scheduler
|
47 |
+
|
48 |
+
from diffusers.schedulers import PNDMScheduler
|
49 |
+
from torchvision.transforms.functional import pil_to_tensor
|
50 |
+
from marigold.marigold_pipeline import MarigoldPipeline, MarigoldDepthOutput
|
51 |
+
from src.util import metric
|
52 |
+
from src.util.data_loader import skip_first_batches
|
53 |
+
from src.util.logging_util import tb_logger, eval_dic_to_text
|
54 |
+
from src.util.loss import get_loss
|
55 |
+
from src.util.lr_scheduler import IterExponential
|
56 |
+
from src.util.metric import MetricTracker
|
57 |
+
from src.util.multi_res_noise import multi_res_noise_like
|
58 |
+
from src.util.alignment import align_depth_least_square, depth2disparity, disparity2depth
|
59 |
+
from src.util.seeding import generate_seed_sequence
|
60 |
+
from accelerate import Accelerator
|
61 |
+
import os
|
62 |
+
from torchvision.transforms import InterpolationMode, Resize, CenterCrop
|
63 |
+
import torchvision.transforms as transforms
|
64 |
+
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
|
65 |
+
|
66 |
+
class MarigoldInpaintTrainer:
|
67 |
+
def __init__(
|
68 |
+
self,
|
69 |
+
cfg: OmegaConf,
|
70 |
+
model: MarigoldPipeline,
|
71 |
+
train_dataloader: DataLoader,
|
72 |
+
device,
|
73 |
+
base_ckpt_dir,
|
74 |
+
out_dir_ckpt,
|
75 |
+
out_dir_eval,
|
76 |
+
out_dir_vis,
|
77 |
+
accumulation_steps: int,
|
78 |
+
depth_model = None,
|
79 |
+
separate_list: List = None,
|
80 |
+
val_dataloaders: List[DataLoader] = None,
|
81 |
+
vis_dataloaders: List[DataLoader] = None,
|
82 |
+
train_dataset: Dataset = None,
|
83 |
+
timestep_method: str = 'unidiffuser',
|
84 |
+
connection: bool = False
|
85 |
+
):
|
86 |
+
self.cfg: OmegaConf = cfg
|
87 |
+
self.model: MarigoldPipeline = model
|
88 |
+
self.depth_model = depth_model
|
89 |
+
self.device = device
|
90 |
+
self.seed: Union[int, None] = (
|
91 |
+
self.cfg.trainer.init_seed
|
92 |
+
) # used to generate seed sequence, set to `None` to train w/o seeding
|
93 |
+
self.out_dir_ckpt = out_dir_ckpt
|
94 |
+
self.out_dir_eval = out_dir_eval
|
95 |
+
self.out_dir_vis = out_dir_vis
|
96 |
+
self.train_loader: DataLoader = train_dataloader
|
97 |
+
self.val_loaders: List[DataLoader] = val_dataloaders
|
98 |
+
self.vis_loaders: List[DataLoader] = vis_dataloaders
|
99 |
+
self.accumulation_steps: int = accumulation_steps
|
100 |
+
self.separate_list = separate_list
|
101 |
+
self.timestep_method = timestep_method
|
102 |
+
self.train_dataset = train_dataset
|
103 |
+
self.connection = connection
|
104 |
+
# Adapt input layers
|
105 |
+
# if 8 != self.model.unet.config["in_channels"]:
|
106 |
+
# self._replace_unet_conv_in()
|
107 |
+
# if 8 != self.model.unet.config["out_channels"]:
|
108 |
+
# self._replace_unet_conv_out()
|
109 |
+
|
110 |
+
self.train_metrics = MetricTracker(*["loss", 'rgb_loss', 'depth_loss'])
|
111 |
+
# self.generator = torch.Generator('cuda:0').manual_seed(1024)
|
112 |
+
|
113 |
+
# Encode empty text prompt
|
114 |
+
self.model.encode_empty_text()
|
115 |
+
self.empty_text_embed = self.model.empty_text_embed.detach().clone().to(device)
|
116 |
+
|
117 |
+
self.model.unet.enable_xformers_memory_efficient_attention()
|
118 |
+
|
119 |
+
# Trainability
|
120 |
+
self.model.text_encoder.requires_grad_(False)
|
121 |
+
# self.model.unet.requires_grad_(True)
|
122 |
+
|
123 |
+
grad_part = filter(lambda p: p.requires_grad, self.model.unet.parameters())
|
124 |
+
|
125 |
+
# Optimizer !should be defined after input layer is adapted
|
126 |
+
lr = self.cfg.lr
|
127 |
+
self.optimizer = Adam(grad_part, lr=lr)
|
128 |
+
|
129 |
+
total_params = sum(p.numel() for p in self.model.unet.parameters())
|
130 |
+
total_params_m = total_params / 1_000_000
|
131 |
+
print(f"Total parameters: {total_params_m:.2f}M")
|
132 |
+
trainable_params = sum(p.numel() for p in self.model.unet.parameters() if p.requires_grad)
|
133 |
+
trainable_params_m = trainable_params / 1_000_000
|
134 |
+
print(f"Trainable parameters: {trainable_params_m:.2f}M")
|
135 |
+
|
136 |
+
# LR scheduler
|
137 |
+
lr_func = IterExponential(
|
138 |
+
total_iter_length=self.cfg.lr_scheduler.kwargs.total_iter,
|
139 |
+
final_ratio=self.cfg.lr_scheduler.kwargs.final_ratio,
|
140 |
+
warmup_steps=self.cfg.lr_scheduler.kwargs.warmup_steps,
|
141 |
+
)
|
142 |
+
self.lr_scheduler = LambdaLR(optimizer=self.optimizer, lr_lambda=lr_func)
|
143 |
+
|
144 |
+
# Loss
|
145 |
+
self.loss = get_loss(loss_name=self.cfg.loss.name, **self.cfg.loss.kwargs)
|
146 |
+
|
147 |
+
# Training noise scheduler
|
148 |
+
# self.rgb_training_noise_scheduler: PNDMScheduler = PNDMScheduler.from_pretrained(
|
149 |
+
# os.path.join(
|
150 |
+
# cfg.trainer.rgb_training_noise_scheduler.pretrained_path,
|
151 |
+
# "scheduler",
|
152 |
+
# )
|
153 |
+
# )
|
154 |
+
|
155 |
+
self.rgb_training_noise_scheduler: DDPMScheduler = DDPMScheduler.from_pretrained(
|
156 |
+
cfg.trainer.depth_training_noise_scheduler.pretrained_path, subfolder="scheduler")
|
157 |
+
self.depth_training_noise_scheduler: DDPMScheduler = DDPMScheduler.from_pretrained(
|
158 |
+
cfg.trainer.depth_training_noise_scheduler.pretrained_path, subfolder="scheduler")
|
159 |
+
|
160 |
+
self.rgb_prediction_type = self.rgb_training_noise_scheduler.config.prediction_type
|
161 |
+
# assert (
|
162 |
+
# self.rgb_prediction_type == self.model.rgb_scheduler.config.prediction_type
|
163 |
+
# ), "Different prediction types"
|
164 |
+
self.depth_prediction_type = self.depth_training_noise_scheduler.config.prediction_type
|
165 |
+
assert (
|
166 |
+
self.depth_prediction_type == self.model.depth_scheduler.config.prediction_type
|
167 |
+
), "Different prediction types"
|
168 |
+
self.scheduler_timesteps = (
|
169 |
+
self.rgb_training_noise_scheduler.config.num_train_timesteps
|
170 |
+
)
|
171 |
+
|
172 |
+
# Settings
|
173 |
+
self.max_epoch = self.cfg.max_epoch
|
174 |
+
self.max_iter = self.cfg.max_iter
|
175 |
+
self.gradient_accumulation_steps = accumulation_steps
|
176 |
+
self.gt_depth_type = self.cfg.gt_depth_type
|
177 |
+
self.gt_mask_type = self.cfg.gt_mask_type
|
178 |
+
self.save_period = self.cfg.trainer.save_period
|
179 |
+
self.backup_period = self.cfg.trainer.backup_period
|
180 |
+
self.val_period = self.cfg.trainer.validation_period
|
181 |
+
self.vis_period = self.cfg.trainer.visualization_period
|
182 |
+
|
183 |
+
# Multi-resolution noise
|
184 |
+
self.apply_multi_res_noise = self.cfg.multi_res_noise is not None
|
185 |
+
if self.apply_multi_res_noise:
|
186 |
+
self.mr_noise_strength = self.cfg.multi_res_noise.strength
|
187 |
+
self.annealed_mr_noise = self.cfg.multi_res_noise.annealed
|
188 |
+
self.mr_noise_downscale_strategy = (
|
189 |
+
self.cfg.multi_res_noise.downscale_strategy
|
190 |
+
)
|
191 |
+
|
192 |
+
# Internal variables
|
193 |
+
self.epoch = 0
|
194 |
+
self.n_batch_in_epoch = 0 # batch index in the epoch, used when resume training
|
195 |
+
self.effective_iter = 0 # how many times optimizer.step() is called
|
196 |
+
self.in_evaluation = False
|
197 |
+
self.global_seed_sequence: List = [] # consistent global seed sequence, used to seed random generator, to ensure consistency when resuming
|
198 |
+
|
199 |
+
def _replace_unet_conv_in(self):
|
200 |
+
# replace the first layer to accept 8 in_channels
|
201 |
+
_weight = self.model.unet.conv_in.weight.clone() # [320, 4, 3, 3]
|
202 |
+
_bias = self.model.unet.conv_in.bias.clone() # [320]
|
203 |
+
zero_weight = torch.zeros(_weight.shape).to(_weight.device)
|
204 |
+
_weight = torch.cat([_weight, zero_weight], dim=1)
|
205 |
+
# _weight = _weight.repeat((1, 2, 1, 1)) # Keep selected channel(s)
|
206 |
+
# half the activation magnitude
|
207 |
+
# _weight *= 0.5
|
208 |
+
# new conv_in channel
|
209 |
+
_n_convin_out_channel = self.model.unet.conv_in.out_channels
|
210 |
+
_new_conv_in = Conv2d(
|
211 |
+
8, _n_convin_out_channel, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)
|
212 |
+
)
|
213 |
+
_new_conv_in.weight = Parameter(_weight)
|
214 |
+
_new_conv_in.bias = Parameter(_bias)
|
215 |
+
self.model.unet.conv_in = _new_conv_in
|
216 |
+
logging.info("Unet conv_in layer is replaced")
|
217 |
+
# replace config
|
218 |
+
self.model.unet.config["in_channels"] = 8
|
219 |
+
logging.info("Unet config is updated")
|
220 |
+
return
|
221 |
+
|
222 |
+
def parallel_train(self, t_end=None, accelerator=None):
|
223 |
+
logging.info("Start training")
|
224 |
+
self.model, self.optimizer, self.train_loader, self.lr_scheduler = accelerator.prepare(
|
225 |
+
self.model, self.optimizer, self.train_loader, self.lr_scheduler
|
226 |
+
)
|
227 |
+
self.depth_model = accelerator.prepare(self.depth_model)
|
228 |
+
|
229 |
+
self.accelerator = accelerator
|
230 |
+
if os.path.exists(os.path.join(self.out_dir_ckpt, 'latest')):
|
231 |
+
accelerator.load_state(os.path.join(self.out_dir_ckpt, 'latest'))
|
232 |
+
self.load_miscs(os.path.join(self.out_dir_ckpt, 'latest'))
|
233 |
+
|
234 |
+
# if accelerator.is_main_process:
|
235 |
+
# self._inpaint_rgbd()
|
236 |
+
|
237 |
+
self.train_metrics.reset()
|
238 |
+
accumulated_step = 0
|
239 |
+
for epoch in range(self.epoch, self.max_epoch + 1):
|
240 |
+
self.epoch = epoch
|
241 |
+
logging.debug(f"epoch: {self.epoch}")
|
242 |
+
|
243 |
+
# Skip previous batches when resume
|
244 |
+
for batch in skip_first_batches(self.train_loader, self.n_batch_in_epoch):
|
245 |
+
self.model.unet.train()
|
246 |
+
|
247 |
+
# globally consistent random generators
|
248 |
+
if self.seed is not None:
|
249 |
+
local_seed = self._get_next_seed()
|
250 |
+
rand_num_generator = torch.Generator(device=self.model.device)
|
251 |
+
rand_num_generator.manual_seed(local_seed)
|
252 |
+
else:
|
253 |
+
rand_num_generator = None
|
254 |
+
|
255 |
+
# >>> With gradient accumulation >>>
|
256 |
+
|
257 |
+
# Get data
|
258 |
+
rgb = batch["rgb_norm"].to(self.model.device)
|
259 |
+
with torch.no_grad():
|
260 |
+
disparities = self.depth_model(batch["rgb_int"].numpy().astype(np.uint8), 518, device=self.model.device)
|
261 |
+
|
262 |
+
if len(disparities.shape) == 2:
|
263 |
+
disparities = disparities.unsqueeze(0)
|
264 |
+
|
265 |
+
depth_gt_for_latent = []
|
266 |
+
for disparity_map in disparities:
|
267 |
+
depth_map = ((disparity_map - disparity_map.min()) / (disparity_map.max() - disparity_map.min())) * 2 - 1
|
268 |
+
depth_gt_for_latent.append(depth_map)
|
269 |
+
depth_gt_for_latent = torch.stack(depth_gt_for_latent, dim=0)
|
270 |
+
|
271 |
+
batch_size = rgb.shape[0]
|
272 |
+
|
273 |
+
mask = self.model.mask_processor.preprocess(batch['mask'] * 255).to(self.model.device)
|
274 |
+
|
275 |
+
rgb_timesteps = torch.randint(
|
276 |
+
0,
|
277 |
+
self.scheduler_timesteps,
|
278 |
+
(batch_size,),
|
279 |
+
device=self.model.device,
|
280 |
+
generator=rand_num_generator,
|
281 |
+
).long() # [B]
|
282 |
+
depth_timesteps = rgb_timesteps
|
283 |
+
|
284 |
+
rgb_flag = 1
|
285 |
+
depth_flag = 1
|
286 |
+
|
287 |
+
if self.timestep_method == 'joint':
|
288 |
+
rgb_mask = mask
|
289 |
+
depth_mask = mask
|
290 |
+
|
291 |
+
elif self.timestep_method == 'partition':
|
292 |
+
rand_num = random.random()
|
293 |
+
if rand_num < 0.5: # joint prediction
|
294 |
+
rgb_mask = mask
|
295 |
+
depth_mask = mask
|
296 |
+
elif rand_num < 0.75: # full rgb; depth prediction
|
297 |
+
rgb_flag = 0
|
298 |
+
rgb_mask = torch.zeros_like(mask)
|
299 |
+
depth_mask = mask
|
300 |
+
else:
|
301 |
+
depth_flag = 0
|
302 |
+
rgb_mask = mask
|
303 |
+
if random.random() < 0.5:
|
304 |
+
depth_mask = torch.zeros_like(mask) # full depth; rgb prediction
|
305 |
+
else:
|
306 |
+
depth_mask = mask # partial depth; rgb prediction
|
307 |
+
|
308 |
+
masked_rgb = rgb * (rgb_mask < 0.5)
|
309 |
+
masked_depth = depth_gt_for_latent * (depth_mask.squeeze() < 0.5)
|
310 |
+
with torch.no_grad():
|
311 |
+
# Encode image
|
312 |
+
rgb_latent = self.model.encode_rgb(rgb) # [B, 4, h, w]
|
313 |
+
mask_rgb_latent = self.model.encode_rgb(masked_rgb)
|
314 |
+
|
315 |
+
if depth_timesteps.sum() == 0:
|
316 |
+
gt_depth_latent = self.encode_depth(masked_depth)
|
317 |
+
else:
|
318 |
+
gt_depth_latent = self.encode_depth(depth_gt_for_latent)
|
319 |
+
mask_depth_latent = self.encode_depth(masked_depth)
|
320 |
+
|
321 |
+
rgb_mask = torch.nn.functional.interpolate(rgb_mask, size=rgb_latent.shape[-2:])
|
322 |
+
depth_mask = torch.nn.functional.interpolate(depth_mask, size=gt_depth_latent.shape[-2:])
|
323 |
+
|
324 |
+
# Sample noise
|
325 |
+
rgb_noise = torch.randn(
|
326 |
+
rgb_latent.shape,
|
327 |
+
device=self.model.device,
|
328 |
+
generator=rand_num_generator,
|
329 |
+
) # [B, 4, h, w]
|
330 |
+
depth_noise = torch.randn(
|
331 |
+
gt_depth_latent.shape,
|
332 |
+
device=self.model.device,
|
333 |
+
generator=rand_num_generator,
|
334 |
+
) # [B, 4, h, w]
|
335 |
+
|
336 |
+
if rgb_timesteps.sum() == 0:
|
337 |
+
noisy_rgb_latents = rgb_latent
|
338 |
+
else:
|
339 |
+
noisy_rgb_latents = self.rgb_training_noise_scheduler.add_noise(
|
340 |
+
rgb_latent, rgb_noise, rgb_timesteps
|
341 |
+
) # [B, 4, h, w]
|
342 |
+
if depth_timesteps.sum() == 0:
|
343 |
+
noisy_depth_latents = gt_depth_latent
|
344 |
+
else:
|
345 |
+
noisy_depth_latents = self.depth_training_noise_scheduler.add_noise(
|
346 |
+
gt_depth_latent, depth_noise, depth_timesteps
|
347 |
+
) # [B, 4, h, w]
|
348 |
+
|
349 |
+
noisy_latents = torch.cat(
|
350 |
+
[noisy_rgb_latents, rgb_mask, mask_rgb_latent, mask_depth_latent, noisy_depth_latents, depth_mask, mask_rgb_latent, mask_depth_latent], dim=1
|
351 |
+
).float() # [B, 9*2, h, w]
|
352 |
+
|
353 |
+
# Text embedding
|
354 |
+
input_ids = self.model.tokenizer(
|
355 |
+
batch['text'],
|
356 |
+
padding="max_length",
|
357 |
+
max_length=self.model.tokenizer.model_max_length,
|
358 |
+
truncation=True,
|
359 |
+
return_tensors="pt",
|
360 |
+
)
|
361 |
+
input_ids = {k: v.to(self.model.device) for k, v in input_ids.items()}
|
362 |
+
text_embed = self.model.text_encoder(**input_ids)[0]
|
363 |
+
|
364 |
+
model_pred = self.model.unet(
|
365 |
+
noisy_latents, rgb_timesteps, depth_timesteps, text_embed, controlnet_connection=self.connection
|
366 |
+
).sample # [B, 8, h, w]
|
367 |
+
|
368 |
+
if torch.isnan(model_pred).any():
|
369 |
+
logging.warning("model_pred contains NaN.")
|
370 |
+
|
371 |
+
# Get the target for loss depending on the prediction type
|
372 |
+
if "sample" == self.rgb_prediction_type:
|
373 |
+
rgb_target = rgb_latent
|
374 |
+
elif "epsilon" == self.rgb_prediction_type:
|
375 |
+
rgb_target = rgb_latent
|
376 |
+
elif "v_prediction" == self.rgb_prediction_type:
|
377 |
+
rgb_target = self.rgb_training_noise_scheduler.get_velocity(
|
378 |
+
rgb_latent, rgb_noise, rgb_timesteps
|
379 |
+
) # [B, 4, h, w]
|
380 |
+
else:
|
381 |
+
raise ValueError(f"Unknown rgb prediction type {self.prediction_type}")
|
382 |
+
|
383 |
+
if "sample" == self.depth_prediction_type:
|
384 |
+
depth_target = gt_depth_latent
|
385 |
+
elif "epsilon" == self.depth_prediction_type:
|
386 |
+
depth_target = gt_depth_latent
|
387 |
+
elif "v_prediction" == self.depth_prediction_type:
|
388 |
+
depth_target = self.depth_training_noise_scheduler.get_velocity(
|
389 |
+
gt_depth_latent, depth_noise, depth_timesteps
|
390 |
+
) # [B, 4, h, w]
|
391 |
+
else:
|
392 |
+
raise ValueError(f"Unknown depth prediction type {self.prediction_type}")
|
393 |
+
# Masked latent loss
|
394 |
+
with accelerator.accumulate(self.model):
|
395 |
+
|
396 |
+
rgb_loss = self.loss(model_pred[:, 0:4, :, :].float(), rgb_target.float())
|
397 |
+
depth_loss = self.loss(model_pred[:, 4:, :, :].float(), depth_target.float())
|
398 |
+
|
399 |
+
if rgb_flag == 0:
|
400 |
+
loss = depth_loss
|
401 |
+
elif depth_flag == 0:
|
402 |
+
loss = rgb_loss
|
403 |
+
else:
|
404 |
+
loss = self.cfg.loss.depth_factor * depth_loss + (1 - self.cfg.loss.depth_factor) * rgb_loss
|
405 |
+
|
406 |
+
self.train_metrics.update("loss", loss.item())
|
407 |
+
self.train_metrics.update("rgb_loss", rgb_loss.item())
|
408 |
+
self.train_metrics.update("depth_loss", depth_loss.item())
|
409 |
+
# loss = loss / self.gradient_accumulation_steps
|
410 |
+
accelerator.backward(loss)
|
411 |
+
self.optimizer.step()
|
412 |
+
self.optimizer.zero_grad()
|
413 |
+
# loss.backward()
|
414 |
+
self.n_batch_in_epoch += 1
|
415 |
+
# print(accelerator.process_index, self.lr_scheduler.get_last_lr())
|
416 |
+
self.lr_scheduler.step(self.effective_iter)
|
417 |
+
|
418 |
+
if accelerator.sync_gradients:
|
419 |
+
accumulated_step += 1
|
420 |
+
|
421 |
+
if accumulated_step >= self.gradient_accumulation_steps:
|
422 |
+
accumulated_step = 0
|
423 |
+
self.effective_iter += 1
|
424 |
+
|
425 |
+
if accelerator.is_main_process:
|
426 |
+
# Log to tensorboard
|
427 |
+
if self.effective_iter == 1:
|
428 |
+
self._inpaint_rgbd()
|
429 |
+
|
430 |
+
accumulated_loss = self.train_metrics.result()["loss"]
|
431 |
+
rgb_loss = self.train_metrics.result()["rgb_loss"]
|
432 |
+
depth_loss = self.train_metrics.result()["depth_loss"]
|
433 |
+
tb_logger.log_dic(
|
434 |
+
{
|
435 |
+
f"train/{k}": v
|
436 |
+
for k, v in self.train_metrics.result().items()
|
437 |
+
},
|
438 |
+
global_step=self.effective_iter,
|
439 |
+
)
|
440 |
+
tb_logger.writer.add_scalar(
|
441 |
+
"lr",
|
442 |
+
self.lr_scheduler.get_last_lr()[0],
|
443 |
+
global_step=self.effective_iter,
|
444 |
+
)
|
445 |
+
tb_logger.writer.add_scalar(
|
446 |
+
"n_batch_in_epoch",
|
447 |
+
self.n_batch_in_epoch,
|
448 |
+
global_step=self.effective_iter,
|
449 |
+
)
|
450 |
+
logging.info(
|
451 |
+
f"iter {self.effective_iter:5d} (epoch {epoch:2d}): loss={accumulated_loss:.5f}, rgb_loss={rgb_loss:.5f}, depth_loss={depth_loss:.5f}"
|
452 |
+
)
|
453 |
+
accelerator.wait_for_everyone()
|
454 |
+
|
455 |
+
if self.save_period > 0 and 0 == self.effective_iter % self.save_period:
|
456 |
+
accelerator.save_state(output_dir=os.path.join(self.out_dir_ckpt, 'latest'))
|
457 |
+
unwrapped_model = accelerator.unwrap_model(self.model)
|
458 |
+
if accelerator.is_main_process:
|
459 |
+
accelerator.save_model(unwrapped_model.unet,
|
460 |
+
os.path.join(self.out_dir_ckpt, 'latest'), safe_serialization=False)
|
461 |
+
self.save_miscs('latest')
|
462 |
+
self._inpaint_rgbd()
|
463 |
+
accelerator.wait_for_everyone()
|
464 |
+
|
465 |
+
if self.backup_period > 0 and 0 == self.effective_iter % self.backup_period:
|
466 |
+
unwrapped_model = accelerator.unwrap_model(self.model)
|
467 |
+
if accelerator.is_main_process:
|
468 |
+
accelerator.save_model(unwrapped_model.unet,
|
469 |
+
os.path.join(self.out_dir_ckpt, self._get_backup_ckpt_name()), safe_serialization=False)
|
470 |
+
accelerator.wait_for_everyone()
|
471 |
+
|
472 |
+
# End of training
|
473 |
+
if self.max_iter > 0 and self.effective_iter >= self.max_iter:
|
474 |
+
unwrapped_model = accelerator.unwrap_model(self.model)
|
475 |
+
if accelerator.is_main_process:
|
476 |
+
unwrapped_model.unet.save_pretrained(
|
477 |
+
os.path.join(self.out_dir_ckpt, self._get_backup_ckpt_name()))
|
478 |
+
accelerator.wait_for_everyone()
|
479 |
+
return
|
480 |
+
|
481 |
+
torch.cuda.empty_cache()
|
482 |
+
# <<< Effective batch end <<<
|
483 |
+
|
484 |
+
# Epoch end
|
485 |
+
self.n_batch_in_epoch = 0
|
486 |
+
|
487 |
+
def _inpaint_rgbd(self):
|
488 |
+
image_path = ['/dataset/~sa-1b/data/sa_001000/sa_10000335.jpg',
|
489 |
+
'/dataset/~sa-1b/data/sa_000357/sa_3572319.jpg',
|
490 |
+
'/dataset/~sa-1b/data/sa_000045/sa_457934.jpg']
|
491 |
+
prompt = ['A white car is parked in front of the factory',
|
492 |
+
'church with cemetery next to it',
|
493 |
+
'A house with a red brick roof']
|
494 |
+
|
495 |
+
imgs = [pil_to_tensor(Image.open(p)) for p in image_path]
|
496 |
+
depth_imgs = [self.depth_model(img.unsqueeze(0).cpu().numpy()) for img in imgs]
|
497 |
+
|
498 |
+
masks = []
|
499 |
+
for rgb_path in image_path:
|
500 |
+
anno = json.load(open(rgb_path.replace('.jpg', '.json')))['annotations']
|
501 |
+
random.shuffle(anno)
|
502 |
+
object_num = random.randint(5, 10)
|
503 |
+
mask = np.array(coco_mask.decode(anno[0]['segmentation']), dtype=np.uint8)
|
504 |
+
for single_anno in (anno[0:object_num] if len(anno)>object_num else anno):
|
505 |
+
mask += np.array(coco_mask.decode(single_anno['segmentation']), dtype=np.uint8)
|
506 |
+
masks.append(torch.from_numpy(mask))
|
507 |
+
|
508 |
+
resize_transform = transforms.Compose([
|
509 |
+
Resize(size=512, interpolation=InterpolationMode.NEAREST_EXACT),
|
510 |
+
CenterCrop(size=[512, 512])])
|
511 |
+
imgs = [resize_transform(img) for img in imgs]
|
512 |
+
depth_imgs = [resize_transform(depth_img.unsqueeze(0)) for depth_img in depth_imgs]
|
513 |
+
masks = [resize_transform(mask.unsqueeze(0)) for mask in masks]
|
514 |
+
# pdb.set_trace()
|
515 |
+
|
516 |
+
for i in range(len(imgs)):
|
517 |
+
output_image = self.model._rgbd_inpaint(imgs[i], depth_imgs[i], masks[i], [prompt[i]], processing_res=512, mode='joint_inpaint')
|
518 |
+
tb_logger.writer.add_image(f'{prompt[i]}', pil_to_tensor(output_image), self.effective_iter)
|
519 |
+
|
520 |
+
def encode_depth(self, depth_in):
|
521 |
+
# stack depth into 3-channel
|
522 |
+
stacked = self.stack_depth_images(depth_in)
|
523 |
+
# encode using VAE encoder
|
524 |
+
depth_latent = self.model.encode_rgb(stacked)
|
525 |
+
return depth_latent
|
526 |
+
|
527 |
+
@staticmethod
|
528 |
+
def stack_depth_images(depth_in):
|
529 |
+
if 4 == len(depth_in.shape):
|
530 |
+
stacked = depth_in.repeat(1, 3, 1, 1)
|
531 |
+
elif 3 == len(depth_in.shape):
|
532 |
+
stacked = depth_in.unsqueeze(1)
|
533 |
+
stacked = stacked.repeat(1, 3, 1, 1)
|
534 |
+
elif 2 == len(depth_in.shape):
|
535 |
+
stacked = depth_in.unsqueeze(0).unsqueeze(0)
|
536 |
+
stacked = stacked.repeat(1, 3, 1, 1)
|
537 |
+
return stacked
|
538 |
+
|
539 |
+
def visualize(self):
|
540 |
+
for val_loader in self.vis_loaders:
|
541 |
+
vis_dataset_name = val_loader.dataset.disp_name
|
542 |
+
vis_out_dir = os.path.join(
|
543 |
+
self.out_dir_vis, self._get_backup_ckpt_name(), vis_dataset_name
|
544 |
+
)
|
545 |
+
os.makedirs(vis_out_dir, exist_ok=True)
|
546 |
+
_ = self.validate_single_dataset(
|
547 |
+
data_loader=val_loader,
|
548 |
+
metric_tracker=self.val_metrics,
|
549 |
+
save_to_dir=vis_out_dir,
|
550 |
+
)
|
551 |
+
|
552 |
+
def _get_next_seed(self):
|
553 |
+
if 0 == len(self.global_seed_sequence):
|
554 |
+
self.global_seed_sequence = generate_seed_sequence(
|
555 |
+
initial_seed=self.seed,
|
556 |
+
length=self.max_iter * self.gradient_accumulation_steps,
|
557 |
+
)
|
558 |
+
logging.info(
|
559 |
+
f"Global seed sequence is generated, length={len(self.global_seed_sequence)}"
|
560 |
+
)
|
561 |
+
return self.global_seed_sequence.pop()
|
562 |
+
|
563 |
+
def save_miscs(self, ckpt_name):
|
564 |
+
ckpt_dir = os.path.join(self.out_dir_ckpt, ckpt_name)
|
565 |
+
state = {
|
566 |
+
"config": self.cfg,
|
567 |
+
"effective_iter": self.effective_iter,
|
568 |
+
"epoch": self.epoch,
|
569 |
+
"n_batch_in_epoch": self.n_batch_in_epoch,
|
570 |
+
"global_seed_sequence": self.global_seed_sequence,
|
571 |
+
}
|
572 |
+
train_state_path = os.path.join(ckpt_dir, "trainer.ckpt")
|
573 |
+
torch.save(state, train_state_path)
|
574 |
+
|
575 |
+
logging.info(f"Misc state is saved to: {train_state_path}")
|
576 |
+
|
577 |
+
def load_miscs(self, ckpt_path):
|
578 |
+
checkpoint = torch.load(os.path.join(ckpt_path, "trainer.ckpt"))
|
579 |
+
self.effective_iter = checkpoint["effective_iter"]
|
580 |
+
self.epoch = checkpoint["epoch"]
|
581 |
+
self.n_batch_in_epoch = checkpoint["n_batch_in_epoch"]
|
582 |
+
self.global_seed_sequence = checkpoint["global_seed_sequence"]
|
583 |
+
|
584 |
+
logging.info(f"Misc state is loaded from {ckpt_path}")
|
585 |
+
|
586 |
+
|
587 |
+
def save_checkpoint(self, ckpt_name, save_train_state):
|
588 |
+
ckpt_dir = os.path.join(self.out_dir_ckpt, ckpt_name)
|
589 |
+
logging.info(f"Saving checkpoint to: {ckpt_dir}")
|
590 |
+
# Backup previous checkpoint
|
591 |
+
temp_ckpt_dir = None
|
592 |
+
if os.path.exists(ckpt_dir) and os.path.isdir(ckpt_dir):
|
593 |
+
temp_ckpt_dir = os.path.join(
|
594 |
+
os.path.dirname(ckpt_dir), f"_old_{os.path.basename(ckpt_dir)}"
|
595 |
+
)
|
596 |
+
if os.path.exists(temp_ckpt_dir):
|
597 |
+
shutil.rmtree(temp_ckpt_dir, ignore_errors=True)
|
598 |
+
os.rename(ckpt_dir, temp_ckpt_dir)
|
599 |
+
logging.debug(f"Old checkpoint is backed up at: {temp_ckpt_dir}")
|
600 |
+
|
601 |
+
# Save UNet
|
602 |
+
unet_path = os.path.join(ckpt_dir, "unet")
|
603 |
+
self.model.unet.save_pretrained(unet_path, safe_serialization=False)
|
604 |
+
logging.info(f"UNet is saved to: {unet_path}")
|
605 |
+
|
606 |
+
if save_train_state:
|
607 |
+
state = {
|
608 |
+
"config": self.cfg,
|
609 |
+
"effective_iter": self.effective_iter,
|
610 |
+
"epoch": self.epoch,
|
611 |
+
"n_batch_in_epoch": self.n_batch_in_epoch,
|
612 |
+
"best_metric": self.best_metric,
|
613 |
+
"in_evaluation": self.in_evaluation,
|
614 |
+
"global_seed_sequence": self.global_seed_sequence,
|
615 |
+
}
|
616 |
+
train_state_path = os.path.join(ckpt_dir, "trainer.ckpt")
|
617 |
+
torch.save(state, train_state_path)
|
618 |
+
# iteration indicator
|
619 |
+
f = open(os.path.join(ckpt_dir, self._get_backup_ckpt_name()), "w")
|
620 |
+
f.close()
|
621 |
+
|
622 |
+
logging.info(f"Trainer state is saved to: {train_state_path}")
|
623 |
+
|
624 |
+
# Remove temp ckpt
|
625 |
+
if temp_ckpt_dir is not None and os.path.exists(temp_ckpt_dir):
|
626 |
+
shutil.rmtree(temp_ckpt_dir, ignore_errors=True)
|
627 |
+
logging.debug("Old checkpoint backup is removed.")
|
628 |
+
|
629 |
+
def load_checkpoint(
|
630 |
+
self, ckpt_path, load_trainer_state=True, resume_lr_scheduler=True
|
631 |
+
):
|
632 |
+
logging.info(f"Loading checkpoint from: {ckpt_path}")
|
633 |
+
# Load UNet
|
634 |
+
_model_path = os.path.join(ckpt_path, "unet", "diffusion_pytorch_model.bin")
|
635 |
+
self.model.unet.load_state_dict(
|
636 |
+
torch.load(_model_path, map_location=self.device)
|
637 |
+
)
|
638 |
+
self.model.unet.to(self.device)
|
639 |
+
logging.info(f"UNet parameters are loaded from {_model_path}")
|
640 |
+
|
641 |
+
# Load training states
|
642 |
+
if load_trainer_state:
|
643 |
+
checkpoint = torch.load(os.path.join(ckpt_path, "trainer.ckpt"))
|
644 |
+
self.effective_iter = checkpoint["effective_iter"]
|
645 |
+
self.epoch = checkpoint["epoch"]
|
646 |
+
self.n_batch_in_epoch = checkpoint["n_batch_in_epoch"]
|
647 |
+
self.in_evaluation = checkpoint["in_evaluation"]
|
648 |
+
self.global_seed_sequence = checkpoint["global_seed_sequence"]
|
649 |
+
|
650 |
+
self.best_metric = checkpoint["best_metric"]
|
651 |
+
|
652 |
+
self.optimizer.load_state_dict(checkpoint["optimizer"])
|
653 |
+
logging.info(f"optimizer state is loaded from {ckpt_path}")
|
654 |
+
|
655 |
+
if resume_lr_scheduler:
|
656 |
+
self.lr_scheduler.load_state_dict(checkpoint["lr_scheduler"])
|
657 |
+
logging.info(f"LR scheduler state is loaded from {ckpt_path}")
|
658 |
+
|
659 |
+
logging.info(
|
660 |
+
f"Checkpoint loaded from: {ckpt_path}. Resume from iteration {self.effective_iter} (epoch {self.epoch})"
|
661 |
+
)
|
662 |
+
return
|
663 |
+
|
664 |
+
def _get_backup_ckpt_name(self):
|
665 |
+
return f"iter_{self.effective_iter:06d}"
|
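For reference, the depth branch of the inpaint trainer above never uses a dedicated depth encoder: each disparity map is min-max normalized to [-1, 1], repeated to three channels, and pushed through the same frozen RGB VAE (see the normalization loop around lines 266-269 and the `stack_depth_images` / `encode_depth` helpers). The following is a minimal, self-contained sketch of that preprocessing, not part of the uploaded files; the `vae_encode` callable in the usage line is a hypothetical stand-in for `self.model.encode_rgb`.

import torch

def disparity_to_vae_input(disparity: torch.Tensor) -> torch.Tensor:
    # Min-max normalize to [-1, 1], matching the trainer's per-map normalization.
    d_min, d_max = disparity.min(), disparity.max()
    normalized = (disparity - d_min) / (d_max - d_min) * 2 - 1
    # Bring to [B, 1, H, W] regardless of input rank, then stack to 3 channels
    # so the RGB VAE can encode it (mirrors stack_depth_images).
    if normalized.dim() == 2:        # [H, W]
        normalized = normalized[None, None]
    elif normalized.dim() == 3:      # [B, H, W]
        normalized = normalized.unsqueeze(1)
    return normalized.repeat(1, 3, 1, 1)

# Usage sketch (hypothetical vae_encode standing in for self.model.encode_rgb):
# depth_latent = vae_encode(disparity_to_vae_input(disparity))  # [B, 4, h, w]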
src/trainer/marigold_trainer.py
ADDED
@@ -0,0 +1,968 @@
1 |
+
# An official reimplementation of the Marigold training script.
|
2 |
+
# Last modified: 2024-04-29
|
3 |
+
#
|
4 |
+
# Copyright 2023 Bingxin Ke, ETH Zurich. All rights reserved.
|
5 |
+
#
|
6 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
7 |
+
# you may not use this file except in compliance with the License.
|
8 |
+
# You may obtain a copy of the License at
|
9 |
+
#
|
10 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
11 |
+
#
|
12 |
+
# Unless required by applicable law or agreed to in writing, software
|
13 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
14 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
15 |
+
# See the License for the specific language governing permissions and
|
16 |
+
# limitations under the License.
|
17 |
+
# --------------------------------------------------------------------------
|
18 |
+
# If you find this code useful, we kindly ask you to cite our paper in your work.
|
19 |
+
# Please find bibtex at: https://github.com/prs-eth/Marigold#-citation
|
20 |
+
# If you use or adapt this code, please attribute to https://github.com/prs-eth/marigold.
|
21 |
+
# More information about the method can be found at https://marigoldmonodepth.github.io
|
22 |
+
# --------------------------------------------------------------------------
|
23 |
+
|
24 |
+
|
25 |
+
import logging
|
26 |
+
import os
|
27 |
+
import pdb
|
28 |
+
import shutil
|
29 |
+
from datetime import datetime
|
30 |
+
from typing import List, Union
|
31 |
+
import random
|
32 |
+
import safetensors
|
33 |
+
import numpy as np
|
34 |
+
import torch
|
35 |
+
from diffusers import DDPMScheduler
|
36 |
+
from omegaconf import OmegaConf
|
37 |
+
from torch.nn import Conv2d
|
38 |
+
from torch.nn.parameter import Parameter
|
39 |
+
from torch.optim import Adam
|
40 |
+
from torch.optim.lr_scheduler import LambdaLR
|
41 |
+
from torch.utils.data import DataLoader
|
42 |
+
from tqdm import tqdm
|
43 |
+
from PIL import Image
|
44 |
+
# import torch.optim.lr_scheduler
|
45 |
+
|
46 |
+
from marigold.marigold_pipeline import MarigoldPipeline, MarigoldDepthOutput
|
47 |
+
from src.util import metric
|
48 |
+
from src.util.data_loader import skip_first_batches
|
49 |
+
from src.util.logging_util import tb_logger, eval_dic_to_text
|
50 |
+
from src.util.loss import get_loss
|
51 |
+
from src.util.lr_scheduler import IterExponential
|
52 |
+
from src.util.metric import MetricTracker
|
53 |
+
from src.util.multi_res_noise import multi_res_noise_like
|
54 |
+
from src.util.alignment import align_depth_least_square, depth2disparity, disparity2depth
|
55 |
+
from src.util.seeding import generate_seed_sequence
|
56 |
+
from accelerate import Accelerator
|
57 |
+
import os
|
58 |
+
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
|
59 |
+
|
60 |
+
class MarigoldTrainer:
|
61 |
+
def __init__(
|
62 |
+
self,
|
63 |
+
cfg: OmegaConf,
|
64 |
+
model: MarigoldPipeline,
|
65 |
+
train_dataloader: DataLoader,
|
66 |
+
device,
|
67 |
+
base_ckpt_dir,
|
68 |
+
out_dir_ckpt,
|
69 |
+
out_dir_eval,
|
70 |
+
out_dir_vis,
|
71 |
+
accumulation_steps: int,
|
72 |
+
depth_model = None,
|
73 |
+
separate_list: List = None,
|
74 |
+
val_dataloaders: List[DataLoader] = None,
|
75 |
+
vis_dataloaders: List[DataLoader] = None,
|
76 |
+
timestep_method: str = 'unidiffuser'
|
77 |
+
):
|
78 |
+
self.cfg: OmegaConf = cfg
|
79 |
+
self.model: MarigoldPipeline = model
|
80 |
+
self.depth_model = depth_model
|
81 |
+
self.device = device
|
82 |
+
self.seed: Union[int, None] = (
|
83 |
+
self.cfg.trainer.init_seed
|
84 |
+
) # used to generate seed sequence, set to `None` to train w/o seeding
|
85 |
+
self.out_dir_ckpt = out_dir_ckpt
|
86 |
+
self.out_dir_eval = out_dir_eval
|
87 |
+
self.out_dir_vis = out_dir_vis
|
88 |
+
self.train_loader: DataLoader = train_dataloader
|
89 |
+
self.val_loaders: List[DataLoader] = val_dataloaders
|
90 |
+
self.vis_loaders: List[DataLoader] = vis_dataloaders
|
91 |
+
self.accumulation_steps: int = accumulation_steps
|
92 |
+
self.separate_list = separate_list
|
93 |
+
self.timestep_method = timestep_method
|
94 |
+
# Adapt input layers
|
95 |
+
# if 8 != self.model.unet.config["in_channels"]:
|
96 |
+
# self._replace_unet_conv_in()
|
97 |
+
# if 8 != self.model.unet.config["out_channels"]:
|
98 |
+
# self._replace_unet_conv_out()
|
99 |
+
|
100 |
+
self.prompt = ['a view of a city skyline from a bridge',
|
101 |
+
'a man and a woman sitting on a couch',
|
102 |
+
'a black car parked in a parking lot next to the water',
|
103 |
+
'Enchanted forest with glowing plants, fairies, and ancient castle.',
|
104 |
+
'Futuristic city with skyscrapers, neon lights, and hovering vehicles.',
|
105 |
+
'Fantasy mountain landscape with waterfalls, dragons, and mythical creatures.']
|
106 |
+
# self.generator = torch.Generator('cuda:0').manual_seed(1024)
|
107 |
+
|
108 |
+
# Encode empty text prompt
|
109 |
+
self.model.encode_empty_text()
|
110 |
+
self.empty_text_embed = self.model.empty_text_embed.detach().clone().to(device)
|
111 |
+
|
112 |
+
self.model.unet.enable_xformers_memory_efficient_attention()
|
113 |
+
|
114 |
+
# Trainability
|
115 |
+
self.model.text_encoder.requires_grad_(False)
|
116 |
+
# self.model.unet.requires_grad_(True)
|
117 |
+
|
118 |
+
grad_part = filter(lambda p: p.requires_grad, self.model.unet.parameters())
|
119 |
+
|
120 |
+
# Optimizer !should be defined after input layer is adapted
|
121 |
+
lr = self.cfg.lr
|
122 |
+
self.optimizer = Adam(grad_part, lr=lr)
|
123 |
+
|
124 |
+
total_params = sum(p.numel() for p in self.model.unet.parameters())
|
125 |
+
total_params_m = total_params / 1_000_000
|
126 |
+
print(f"Total parameters: {total_params_m:.2f}M")
|
127 |
+
trainable_params = sum(p.numel() for p in self.model.unet.parameters() if p.requires_grad)
|
128 |
+
trainable_params_m = trainable_params / 1_000_000
|
129 |
+
print(f"Trainable parameters: {trainable_params_m:.2f}M")
|
130 |
+
|
131 |
+
# LR scheduler
|
132 |
+
lr_func = IterExponential(
|
133 |
+
total_iter_length=self.cfg.lr_scheduler.kwargs.total_iter,
|
134 |
+
final_ratio=self.cfg.lr_scheduler.kwargs.final_ratio,
|
135 |
+
warmup_steps=self.cfg.lr_scheduler.kwargs.warmup_steps,
|
136 |
+
)
|
137 |
+
self.lr_scheduler = LambdaLR(optimizer=self.optimizer, lr_lambda=lr_func)
|
138 |
+
|
139 |
+
# Loss
|
140 |
+
self.loss = get_loss(loss_name=self.cfg.loss.name, **self.cfg.loss.kwargs)
|
141 |
+
|
142 |
+
# Training noise scheduler
|
143 |
+
self.training_noise_scheduler: DDPMScheduler = DDPMScheduler.from_pretrained(
|
144 |
+
os.path.join(
|
145 |
+
cfg.trainer.training_noise_scheduler.pretrained_path,
|
146 |
+
"scheduler",
|
147 |
+
)
|
148 |
+
)
|
149 |
+
# pdb.set_trace()
|
150 |
+
self.prediction_type = self.training_noise_scheduler.config.prediction_type
|
151 |
+
assert (
|
152 |
+
self.prediction_type == self.model.scheduler.config.prediction_type
|
153 |
+
), "Different prediction types"
|
154 |
+
self.scheduler_timesteps = (
|
155 |
+
self.training_noise_scheduler.config.num_train_timesteps
|
156 |
+
)
|
157 |
+
|
158 |
+
# Eval metrics
|
159 |
+
self.metric_funcs = [getattr(metric, _met) for _met in cfg.eval.eval_metrics]
|
160 |
+
self.train_metrics = MetricTracker(*["loss", 'rgb_loss', 'depth_loss'])
|
161 |
+
self.val_metrics = MetricTracker(*[m.__name__ for m in self.metric_funcs])
|
162 |
+
# main metric for best checkpoint saving
|
163 |
+
self.main_val_metric = cfg.validation.main_val_metric
|
164 |
+
self.main_val_metric_goal = cfg.validation.main_val_metric_goal
|
165 |
+
assert (
|
166 |
+
self.main_val_metric in cfg.eval.eval_metrics
|
167 |
+
), f"Main eval metric `{self.main_val_metric}` not found in evaluation metrics."
|
168 |
+
self.best_metric = 1e8 if "minimize" == self.main_val_metric_goal else -1e8
|
169 |
+
|
170 |
+
# Settings
|
171 |
+
self.max_epoch = self.cfg.max_epoch
|
172 |
+
self.max_iter = self.cfg.max_iter
|
173 |
+
self.gradient_accumulation_steps = accumulation_steps
|
174 |
+
self.gt_depth_type = self.cfg.gt_depth_type
|
175 |
+
self.gt_mask_type = self.cfg.gt_mask_type
|
176 |
+
self.save_period = self.cfg.trainer.save_period
|
177 |
+
self.backup_period = self.cfg.trainer.backup_period
|
178 |
+
self.val_period = self.cfg.trainer.validation_period
|
179 |
+
self.vis_period = self.cfg.trainer.visualization_period
|
180 |
+
|
181 |
+
# Multi-resolution noise
|
182 |
+
self.apply_multi_res_noise = self.cfg.multi_res_noise is not None
|
183 |
+
if self.apply_multi_res_noise:
|
184 |
+
self.mr_noise_strength = self.cfg.multi_res_noise.strength
|
185 |
+
self.annealed_mr_noise = self.cfg.multi_res_noise.annealed
|
186 |
+
self.mr_noise_downscale_strategy = (
|
187 |
+
self.cfg.multi_res_noise.downscale_strategy
|
188 |
+
)
|
189 |
+
|
190 |
+
# Internal variables
|
191 |
+
self.epoch = 0
|
192 |
+
self.n_batch_in_epoch = 0 # batch index in the epoch, used when resume training
|
193 |
+
self.effective_iter = 0 # how many times optimizer.step() is called
|
194 |
+
self.in_evaluation = False
|
195 |
+
self.global_seed_sequence: List = [] # consistent global seed sequence, used to seed random generator, to ensure consistency when resuming
|
196 |
+
|
197 |
+
def _replace_unet_conv_in(self):
|
198 |
+
# replace the first layer to accept 8 in_channels
|
199 |
+
_weight = self.model.unet.conv_in.weight.clone() # [320, 4, 3, 3]
|
200 |
+
_bias = self.model.unet.conv_in.bias.clone() # [320]
|
201 |
+
zero_weight = torch.zeros(_weight.shape).to(_weight.device)
|
202 |
+
_weight = torch.cat([_weight, zero_weight], dim=1)
|
203 |
+
# _weight = _weight.repeat((1, 2, 1, 1)) # Keep selected channel(s)
|
204 |
+
# half the activation magnitude
|
205 |
+
# _weight *= 0.5
|
206 |
+
# new conv_in channel
|
207 |
+
_n_convin_out_channel = self.model.unet.conv_in.out_channels
|
208 |
+
_new_conv_in = Conv2d(
|
209 |
+
8, _n_convin_out_channel, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)
|
210 |
+
)
|
211 |
+
_new_conv_in.weight = Parameter(_weight)
|
212 |
+
_new_conv_in.bias = Parameter(_bias)
|
213 |
+
self.model.unet.conv_in = _new_conv_in
|
214 |
+
logging.info("Unet conv_in layer is replaced")
|
215 |
+
# replace config
|
216 |
+
self.model.unet.config["in_channels"] = 8
|
217 |
+
logging.info("Unet config is updated")
|
218 |
+
return
|
219 |
+
|
220 |
+
def _replace_unet_conv_out(self):
|
221 |
+
# replace the output layer to produce 8 out_channels
|
222 |
+
_weight = self.model.unet.conv_out.weight.clone()  # [4, 320, 3, 3]
|
223 |
+
_bias = self.model.unet.conv_out.bias.clone()  # [4]
|
224 |
+
_weight = _weight.repeat((2, 1, 1, 1)) # Keep selected channel(s)
|
225 |
+
_bias = _bias.repeat((2))
|
226 |
+
# half the activation magnitude
|
227 |
+
|
228 |
+
# new conv_in channel
|
229 |
+
_n_convin_out_channel = self.model.unet.conv_out.out_channels
|
230 |
+
_new_conv_out = Conv2d(
|
231 |
+
_n_convin_out_channel, 8, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)
|
232 |
+
)
|
233 |
+
_new_conv_out.weight = Parameter(_weight)
|
234 |
+
_new_conv_out.bias = Parameter(_bias)
|
235 |
+
self.model.unet.conv_out = _new_conv_out
|
236 |
+
logging.info("Unet conv_out layer is replaced")
|
237 |
+
# replace config
|
238 |
+
self.model.unet.config["out_channels"] = 8
|
239 |
+
logging.info("Unet config is updated")
|
240 |
+
return
|
241 |
+
|
242 |
+
def parallel_train(self, t_end=None, accelerator=None):
|
243 |
+
logging.info("Start training")
|
244 |
+
# pdb.set_trace()
|
245 |
+
self.model, self.optimizer, self.train_loader, self.lr_scheduler = accelerator.prepare(
|
246 |
+
self.model, self.optimizer, self.train_loader, self.lr_scheduler
|
247 |
+
)
|
248 |
+
self.depth_model = accelerator.prepare(self.depth_model)
|
249 |
+
|
250 |
+
self.accelerator = accelerator
|
251 |
+
if self.val_loaders is not None:
|
252 |
+
for idx, loader in enumerate(self.val_loaders):
|
253 |
+
self.val_loaders[idx] = accelerator.prepare(loader)
|
254 |
+
|
255 |
+
if os.path.exists(os.path.join(self.out_dir_ckpt, 'latest')):
|
256 |
+
accelerator.load_state(os.path.join(self.out_dir_ckpt, 'latest'))
|
257 |
+
self.load_miscs(os.path.join(self.out_dir_ckpt, 'latest'))
|
258 |
+
|
259 |
+
self.train_metrics.reset()
|
260 |
+
accumulated_step = 0
|
261 |
+
for epoch in range(self.epoch, self.max_epoch + 1):
|
262 |
+
self.epoch = epoch
|
263 |
+
logging.debug(f"epoch: {self.epoch}")
|
264 |
+
|
265 |
+
# Skip previous batches when resume
|
266 |
+
for batch in skip_first_batches(self.train_loader, self.n_batch_in_epoch):
|
267 |
+
self.model.unet.train()
|
268 |
+
|
269 |
+
# globally consistent random generators
|
270 |
+
if self.seed is not None:
|
271 |
+
local_seed = self._get_next_seed()
|
272 |
+
rand_num_generator = torch.Generator(device=self.model.device)
|
273 |
+
rand_num_generator.manual_seed(local_seed)
|
274 |
+
else:
|
275 |
+
rand_num_generator = None
|
276 |
+
|
277 |
+
# >>> With gradient accumulation >>>
|
278 |
+
|
279 |
+
# Get data
|
280 |
+
rgb = batch["rgb_norm"].to(self.model.device)
|
281 |
+
if self.gt_depth_type not in batch:
|
282 |
+
with torch.no_grad():
|
283 |
+
disparities = self.depth_model(batch["rgb_int"].numpy().astype(np.uint8), 518, device=self.model.device)
|
284 |
+
depth_gt_for_latent = []
|
285 |
+
for disparity_map in disparities:
|
286 |
+
depth_map = ((disparity_map - disparity_map.min()) / (disparity_map.max() - disparity_map.min())) * 2 - 1
|
287 |
+
depth_gt_for_latent.append(depth_map)
|
288 |
+
depth_gt_for_latent = torch.stack(depth_gt_for_latent, dim=0)
|
289 |
+
else:
|
290 |
+
if "least_square_disparity" == self.cfg.eval.alignment:
|
291 |
+
# convert GT depth -> GT disparity
|
292 |
+
depth_raw_ts = batch["depth_raw_linear"].squeeze()
|
293 |
+
depth_raw = depth_raw_ts.cpu().numpy()
|
294 |
+
# pdb.set_trace()
|
295 |
+
disparities = depth2disparity(
|
296 |
+
depth=depth_raw
|
297 |
+
)
|
298 |
+
depth_gt_for_latent = []
|
299 |
+
for disparity_map in disparities:
|
300 |
+
depth_map = ((disparity_map - disparity_map.min()) / (
|
301 |
+
disparity_map.max() - disparity_map.min())) * 2 - 1
|
302 |
+
depth_gt_for_latent.append(torch.from_numpy(depth_map))
|
303 |
+
depth_gt_for_latent = torch.stack(depth_gt_for_latent, dim=0).to(self.model.device)
|
304 |
+
else:
|
305 |
+
depth_gt_for_latent = batch[self.gt_depth_type].to(self.model.device)
|
306 |
+
|
307 |
+
batch_size = rgb.shape[0]
|
308 |
+
|
309 |
+
if self.gt_mask_type is not None:
|
310 |
+
valid_mask_for_latent = batch[self.gt_mask_type].to(self.model.device)
|
311 |
+
invalid_mask = ~valid_mask_for_latent
|
312 |
+
valid_mask_down = ~torch.max_pool2d(
|
313 |
+
invalid_mask.float(), 8, 8
|
314 |
+
).bool()
|
315 |
+
valid_mask_down = valid_mask_down.repeat((1, 4, 1, 1))
|
316 |
+
|
317 |
+
with torch.no_grad():
|
318 |
+
# Encode image
|
319 |
+
rgb_latent = self.model.encode_rgb(rgb) # [B, 4, h, w]
|
320 |
+
# Encode GT depth
|
321 |
+
gt_depth_latent = self.encode_depth(
|
322 |
+
depth_gt_for_latent
|
323 |
+
) # [B, 4, h, w]
|
324 |
+
# Sample a random timestep for each image
|
325 |
+
if self.cfg.loss.depth_factor == 1:
|
326 |
+
rgb_timesteps = torch.zeros(
|
327 |
+
(batch_size),
|
328 |
+
device=self.model.device
|
329 |
+
).long() # [B]
|
330 |
+
depth_timesteps = torch.randint(
|
331 |
+
0,
|
332 |
+
self.scheduler_timesteps,
|
333 |
+
(batch_size,),
|
334 |
+
device=self.model.device,
|
335 |
+
generator=rand_num_generator,
|
336 |
+
).long() # [B]
|
337 |
+
elif self.timestep_method == 'unidiffuser':
|
338 |
+
rgb_timesteps = torch.randint(
|
339 |
+
0,
|
340 |
+
self.scheduler_timesteps,
|
341 |
+
(batch_size,),
|
342 |
+
device=self.model.device,
|
343 |
+
generator=rand_num_generator,
|
344 |
+
).long() # [B]
|
345 |
+
depth_timesteps = torch.randint(
|
346 |
+
0,
|
347 |
+
self.scheduler_timesteps,
|
348 |
+
(batch_size,),
|
349 |
+
device=self.model.device,
|
350 |
+
generator=rand_num_generator,
|
351 |
+
).long() # [B]
|
352 |
+
elif self.timestep_method == 'joint':
|
353 |
+
rgb_timesteps = torch.randint(
|
354 |
+
0,
|
355 |
+
self.scheduler_timesteps,
|
356 |
+
(batch_size,),
|
357 |
+
device=self.model.device,
|
358 |
+
generator=rand_num_generator,
|
359 |
+
).long() # [B]
|
360 |
+
depth_timesteps = rgb_timesteps # [B]
|
361 |
+
elif self.timestep_method == 'partition':
|
362 |
+
rand_num = random.random()
|
363 |
+
if rand_num < 0.3333:
|
364 |
+
# joint generation
|
365 |
+
rgb_timesteps = torch.randint(
|
366 |
+
0,
|
367 |
+
self.scheduler_timesteps,
|
368 |
+
(batch_size,),
|
369 |
+
device=self.model.device,
|
370 |
+
generator=rand_num_generator,
|
371 |
+
).long() # [B]
|
372 |
+
depth_timesteps = rgb_timesteps
|
373 |
+
elif rand_num < 0.6666:
|
374 |
+
# image2depth generation
|
375 |
+
rgb_timesteps = torch.zeros(
|
376 |
+
(batch_size),
|
377 |
+
device=self.model.device
|
378 |
+
).long() # [B]
|
379 |
+
depth_timesteps = torch.randint(
|
380 |
+
0,
|
381 |
+
self.scheduler_timesteps,
|
382 |
+
(batch_size,),
|
383 |
+
device=self.model.device,
|
384 |
+
generator=rand_num_generator,
|
385 |
+
).long() # [B]
|
386 |
+
else:
|
387 |
+
# depth2image generation
|
388 |
+
rgb_timesteps = torch.randint(
|
389 |
+
0,
|
390 |
+
self.scheduler_timesteps,
|
391 |
+
(batch_size,),
|
392 |
+
device=self.model.device,
|
393 |
+
generator=rand_num_generator,
|
394 |
+
).long() # [B]
|
395 |
+
depth_timesteps = torch.zeros(
|
396 |
+
(batch_size),
|
397 |
+
device=self.model.device
|
398 |
+
).long() # [B]
|
399 |
+
|
400 |
+
# Sample noise
|
401 |
+
if self.apply_multi_res_noise:
|
402 |
+
rgb_strength = self.mr_noise_strength
|
403 |
+
if self.annealed_mr_noise:
|
404 |
+
# calculate strength depending on t
|
405 |
+
rgb_strength = rgb_strength * (rgb_timesteps / self.scheduler_timesteps)
|
406 |
+
rgb_noise = multi_res_noise_like(
|
407 |
+
rgb_latent,
|
408 |
+
strength=rgb_strength,
|
409 |
+
downscale_strategy=self.mr_noise_downscale_strategy,
|
410 |
+
generator=rand_num_generator,
|
411 |
+
device=self.model.device,
|
412 |
+
)
|
413 |
+
|
414 |
+
depth_strength = self.mr_noise_strength
|
415 |
+
if self.annealed_mr_noise:
|
416 |
+
# calculate strength depending on t
|
417 |
+
depth_strength = depth_strength * (depth_timesteps / self.scheduler_timesteps)
|
418 |
+
depth_noise = multi_res_noise_like(
|
419 |
+
gt_depth_latent,
|
420 |
+
strength=depth_strength,
|
421 |
+
downscale_strategy=self.mr_noise_downscale_strategy,
|
422 |
+
generator=rand_num_generator,
|
423 |
+
device=self.model.device,
|
424 |
+
)
|
425 |
+
else:
|
426 |
+
rgb_noise = torch.randn(
|
427 |
+
rgb_latent.shape,
|
428 |
+
device=self.model.device,
|
429 |
+
generator=rand_num_generator,
|
430 |
+
) # [B, 8, h, w]
|
431 |
+
|
432 |
+
depth_noise = torch.randn(
|
433 |
+
gt_depth_latent.shape,
|
434 |
+
device=self.model.device,
|
435 |
+
generator=rand_num_generator,
|
436 |
+
) # [B, 8, h, w]
|
437 |
+
# Add noise to the latents (diffusion forward process)
|
438 |
+
|
439 |
+
if rgb_timesteps.sum() == 0:  # keep the RGB latent clean when it is the conditioning input (image2depth)
|
440 |
+
noisy_rgb_latents = rgb_latent
|
441 |
+
else:
|
442 |
+
noisy_rgb_latents = self.training_noise_scheduler.add_noise(
|
443 |
+
rgb_latent, rgb_noise, rgb_timesteps
|
444 |
+
) # [B, 4, h, w]
|
445 |
+
|
446 |
+
noisy_depth_latents = self.training_noise_scheduler.add_noise(
|
447 |
+
gt_depth_latent, depth_noise, depth_timesteps
|
448 |
+
) # [B, 4, h, w]
|
449 |
+
|
450 |
+
noisy_latents = torch.cat(
|
451 |
+
[noisy_rgb_latents, noisy_depth_latents], dim=1
|
452 |
+
).float() # [B, 8, h, w]
|
453 |
+
|
454 |
+
# Text embedding
|
455 |
+
input_ids = self.model.tokenizer(
|
456 |
+
batch['text'],
|
457 |
+
padding="max_length",
|
458 |
+
max_length=self.model.tokenizer.model_max_length,
|
459 |
+
truncation=True,
|
460 |
+
return_tensors="pt",
|
461 |
+
)
|
462 |
+
input_ids = {k: v.to(self.model.device) for k, v in input_ids.items()}
|
463 |
+
text_embed = self.model.text_encoder(**input_ids)[0]
|
464 |
+
# text_embed = self.empty_text_embed.to(device).repeat(
|
465 |
+
# (batch_size, 1, 1)
|
466 |
+
# ) # [B, 77, 1024]
|
467 |
+
model_pred = self.model.unet(
|
468 |
+
noisy_latents, rgb_timesteps, depth_timesteps, text_embed
|
469 |
+
).sample # [B, 4, h, w]
|
470 |
+
if torch.isnan(model_pred).any():
|
471 |
+
logging.warning("model_pred contains NaN.")
|
472 |
+
|
473 |
+
# Get the target for loss depending on the prediction type
|
474 |
+
if "sample" == self.prediction_type:
|
475 |
+
rgb_target = rgb_latent
|
476 |
+
depth_target = gt_depth_latent
|
477 |
+
elif "epsilon" == self.prediction_type:
|
478 |
+
rgb_target = rgb_latent
|
479 |
+
depth_target = gt_depth_latent
|
480 |
+
elif "v_prediction" == self.prediction_type:
|
481 |
+
rgb_target = self.training_noise_scheduler.get_velocity(
|
482 |
+
rgb_latent, rgb_noise, rgb_timesteps
|
483 |
+
) # [B, 4, h, w]
|
484 |
+
depth_target = self.training_noise_scheduler.get_velocity(
|
485 |
+
gt_depth_latent, depth_noise, depth_timesteps
|
486 |
+
) # [B, 4, h, w]
|
487 |
+
else:
|
488 |
+
raise ValueError(f"Unknown prediction type {self.prediction_type}")
|
489 |
+
# Masked latent loss
|
490 |
+
with accelerator.accumulate(self.model):
|
491 |
+
if self.gt_mask_type is not None:
|
492 |
+
depth_loss = self.loss(
|
493 |
+
model_pred[:, 4:, :, :][valid_mask_down].float(),
|
494 |
+
depth_target[valid_mask_down].float(),
|
495 |
+
)
|
496 |
+
else:
|
497 |
+
depth_loss = self.loss(model_pred[:, 4:, :, :].float(),depth_target.float())
|
498 |
+
|
499 |
+
rgb_loss = self.loss(model_pred[:, 0:4, :, :].float(), rgb_target.float())
|
500 |
+
|
501 |
+
if torch.sum(rgb_timesteps) == 0 or torch.sum(rgb_timesteps) == len(rgb_timesteps) * self.scheduler_timesteps:
|
502 |
+
loss = depth_loss
|
503 |
+
elif torch.sum(depth_timesteps) == 0 or torch.sum(depth_timesteps) == len(depth_timesteps) * self.scheduler_timesteps:
|
504 |
+
loss = rgb_loss
|
505 |
+
else:
|
506 |
+
loss = self.cfg.loss.depth_factor * depth_loss + (1 - self.cfg.loss.depth_factor) * rgb_loss
|
507 |
+
|
508 |
+
self.train_metrics.update("loss", loss.item())
|
509 |
+
self.train_metrics.update("rgb_loss", rgb_loss.item())
|
510 |
+
self.train_metrics.update("depth_loss", depth_loss.item())
|
511 |
+
# loss = loss / self.gradient_accumulation_steps
|
512 |
+
accelerator.backward(loss)
|
513 |
+
self.optimizer.step()
|
514 |
+
self.optimizer.zero_grad()
|
515 |
+
# loss.backward()
|
516 |
+
self.n_batch_in_epoch += 1
|
517 |
+
# print(accelerator.process_index, self.lr_scheduler.get_last_lr())
|
518 |
+
self.lr_scheduler.step(self.effective_iter)
|
519 |
+
|
520 |
+
if accelerator.sync_gradients:
|
521 |
+
accumulated_step += 1
|
522 |
+
|
523 |
+
if accumulated_step >= self.gradient_accumulation_steps:
|
524 |
+
accumulated_step = 0
|
525 |
+
self.effective_iter += 1
|
526 |
+
|
527 |
+
if accelerator.is_main_process:
|
528 |
+
# Log to tensorboard
|
529 |
+
if self.effective_iter == 1:
|
530 |
+
generator = torch.Generator(self.model.device).manual_seed(1024)
|
531 |
+
img = self.model.generate_rgbd(self.prompt, num_inference_steps=50, generator=generator,
|
532 |
+
show_pbar=True)
|
533 |
+
for idx in range(len(self.prompt)):
|
534 |
+
tb_logger.writer.add_image(f'image/{self.prompt[idx]}', img[idx], self.effective_iter)
|
535 |
+
self._depth2image()
|
536 |
+
self._image2depth()
|
537 |
+
|
538 |
+
accumulated_loss = self.train_metrics.result()["loss"]
|
539 |
+
rgb_loss = self.train_metrics.result()["rgb_loss"]
|
540 |
+
depth_loss = self.train_metrics.result()["depth_loss"]
|
541 |
+
tb_logger.log_dic(
|
542 |
+
{
|
543 |
+
f"train/{k}": v
|
544 |
+
for k, v in self.train_metrics.result().items()
|
545 |
+
},
|
546 |
+
global_step=self.effective_iter,
|
547 |
+
)
|
548 |
+
tb_logger.writer.add_scalar(
|
549 |
+
"lr",
|
550 |
+
self.lr_scheduler.get_last_lr()[0],
|
551 |
+
global_step=self.effective_iter,
|
552 |
+
)
|
553 |
+
tb_logger.writer.add_scalar(
|
554 |
+
"n_batch_in_epoch",
|
555 |
+
self.n_batch_in_epoch,
|
556 |
+
global_step=self.effective_iter,
|
557 |
+
)
|
558 |
+
logging.info(
|
559 |
+
f"iter {self.effective_iter:5d} (epoch {epoch:2d}): loss={accumulated_loss:.5f}, rgb_loss={rgb_loss:.5f}, depth_loss={depth_loss:.5f}"
|
560 |
+
)
|
561 |
+
accelerator.wait_for_everyone()
|
562 |
+
|
563 |
+
if self.save_period > 0 and 0 == self.effective_iter % self.save_period:
|
564 |
+
accelerator.save_state(output_dir=os.path.join(self.out_dir_ckpt, 'latest'))
|
565 |
+
unwrapped_model = accelerator.unwrap_model(self.model)
|
566 |
+
if accelerator.is_main_process:
|
567 |
+
accelerator.save_model(unwrapped_model.unet,
|
568 |
+
os.path.join(self.out_dir_ckpt, 'latest'), safe_serialization=False)
|
569 |
+
self.save_miscs('latest')
|
570 |
+
|
571 |
+
# RGB-D joint generation
|
572 |
+
generator = torch.Generator(self.model.device).manual_seed(1024)
|
573 |
+
img = self.model.generate_rgbd(self.prompt, num_inference_steps=50, generator=generator, show_pbar=False, height=64, width=64)
|
574 |
+
for idx in range(len(self.prompt)):
|
575 |
+
tb_logger.writer.add_image(f'image/{self.prompt[idx]}', img[idx], self.effective_iter)
|
576 |
+
|
577 |
+
# depth to RGB generation
|
578 |
+
self._depth2image()
|
579 |
+
# # RGB to depth generation
|
580 |
+
self._image2depth()
|
581 |
+
|
582 |
+
accelerator.wait_for_everyone()
|
583 |
+
|
584 |
+
if self.backup_period > 0 and 0 == self.effective_iter % self.backup_period:
|
585 |
+
unwrapped_model = accelerator.unwrap_model(self.model)
|
586 |
+
if accelerator.is_main_process:
|
587 |
+
unwrapped_model.unet.save_pretrained(
|
588 |
+
os.path.join(self.out_dir_ckpt, self._get_backup_ckpt_name()))
|
589 |
+
accelerator.wait_for_everyone()
|
590 |
+
|
591 |
+
if self.val_period > 0 and 0 == self.effective_iter % self.val_period:
|
592 |
+
self.validate()
|
593 |
+
|
594 |
+
# End of training
|
595 |
+
if self.max_iter > 0 and self.effective_iter >= self.max_iter:
|
596 |
+
unwrapped_model = accelerator.unwrap_model(self.model)
|
597 |
+
if accelerator.is_main_process:
|
598 |
+
unwrapped_model.unet.save_pretrained(
|
599 |
+
os.path.join(self.out_dir_ckpt, self._get_backup_ckpt_name()))
|
600 |
+
accelerator.wait_for_everyone()
|
601 |
+
return
|
602 |
+
|
603 |
+
torch.cuda.empty_cache()
|
604 |
+
# <<< Effective batch end <<<
|
605 |
+
|
606 |
+
# Epoch end
|
607 |
+
self.n_batch_in_epoch = 0
|
608 |
+
|
609 |
+
def _image2depth(self):
|
610 |
+
generator = torch.Generator(self.model.device).manual_seed(1024)
|
611 |
+
image2dept_paths = ['/home/aiops/wangzh/data/scannet/scene0593_00/color/000100.jpg',
|
612 |
+
'/home/aiops/wangzh/data/scannet/scene0593_00/color/000700.jpg',
|
613 |
+
'/home/aiops/wangzh/data/scannet/scene0591_01/color/000600.jpg',
|
614 |
+
'/home/aiops/wangzh/data/scannet/scene0591_01/color/001500.jpg']
|
615 |
+
for img_idx, image_path in enumerate(image2dept_paths):
|
616 |
+
rgb_input = Image.open(image_path)
|
617 |
+
depth_pred: MarigoldDepthOutput = self.model.image2depth(
|
618 |
+
rgb_input,
|
619 |
+
denoising_steps=self.cfg.validation.denoising_steps,
|
620 |
+
ensemble_size=self.cfg.validation.ensemble_size,
|
621 |
+
processing_res=self.cfg.validation.processing_res,
|
622 |
+
match_input_res=self.cfg.validation.match_input_res,
|
623 |
+
generator=generator,
|
624 |
+
batch_size=self.cfg.validation.ensemble_size,
|
625 |
+
# use batch size 1 to increase reproducibility
|
626 |
+
color_map="Spectral",
|
627 |
+
show_progress_bar=False,
|
628 |
+
resample_method=self.cfg.validation.resample_method,
|
629 |
+
)
|
630 |
+
img = self.model.post_process_rgbd(['None'], [rgb_input], [depth_pred['depth_colored']])
|
631 |
+
tb_logger.writer.add_image(f'image2depth_{img_idx}', img[0], self.effective_iter)
|
632 |
+
|
633 |
+
def _depth2image(self):
|
634 |
+
generator = torch.Generator(self.model.device).manual_seed(1024)
|
635 |
+
if "least_square_disparity" == self.cfg.eval.alignment:
|
636 |
+
depth2image_path = ['/home/aiops/wangzh/data/ori_depth_part0-0/sa_10000335.jpg',
|
637 |
+
'/home/aiops/wangzh/data/ori_depth_part0-0/sa_3572319.jpg',
|
638 |
+
'/home/aiops/wangzh/data/ori_depth_part0-0/sa_457934.jpg']
|
639 |
+
else:
|
640 |
+
depth2image_path = ['/home/aiops/wangzh/data/sa_001000/sa_10000335.jpg',
|
641 |
+
'/home/aiops/wangzh/data/sa_000357/sa_3572319.jpg',
|
642 |
+
'/home/aiops/wangzh/data/sa_000045/sa_457934.jpg']
|
643 |
+
prompts = ['Red car parked in the factory',
|
644 |
+
'White gothic church with cemetery next to it',
|
645 |
+
'House with red roof and starry sky in the background']
|
646 |
+
for img_idx, depth_path in enumerate(depth2image_path):
|
647 |
+
depth_input = Image.open(depth_path)
|
648 |
+
image_pred = self.model.single_depth2image(
|
649 |
+
depth_input,
|
650 |
+
prompts[img_idx],
|
651 |
+
num_inference_steps=50,
|
652 |
+
processing_res=self.cfg.validation.processing_res,
|
653 |
+
generator=generator,
|
654 |
+
show_pbar=False,
|
655 |
+
resample_method=self.cfg.validation.resample_method,
|
656 |
+
)
|
657 |
+
img = self.model.post_process_rgbd([prompts[img_idx]], [image_pred], [depth_input])
|
658 |
+
tb_logger.writer.add_image(f'depth2image_{img_idx}', img[0], self.effective_iter)
|
659 |
+
|
660 |
+
def encode_depth(self, depth_in):
|
661 |
+
# stack depth into 3-channel
|
662 |
+
stacked = self.stack_depth_images(depth_in)
|
663 |
+
# encode using VAE encoder
|
664 |
+
depth_latent = self.model.encode_rgb(stacked)
|
665 |
+
return depth_latent
|
666 |
+
|
667 |
+
@staticmethod
|
668 |
+
def stack_depth_images(depth_in):
|
669 |
+
if 4 == len(depth_in.shape):
|
670 |
+
stacked = depth_in.repeat(1, 3, 1, 1)
|
671 |
+
elif 3 == len(depth_in.shape):
|
672 |
+
stacked = depth_in.unsqueeze(1)
|
673 |
+
stacked = stacked.repeat(1, 3, 1, 1)
|
674 |
+
return stacked
|
675 |
+
|
676 |
+
def validate(self):
|
677 |
+
for i, val_loader in enumerate(self.val_loaders):
|
678 |
+
val_dataset_name = val_loader.dataset.disp_name
|
679 |
+
val_metric_dic = self.validate_single_dataset(
|
680 |
+
data_loader=val_loader, metric_tracker=self.val_metrics
|
681 |
+
)
|
682 |
+
|
683 |
+
if self.accelerator.is_main_process:
|
684 |
+
val_metric_dic = {k:torch.tensor(v).cuda() for k,v in val_metric_dic.items()}
|
685 |
+
|
686 |
+
tb_logger.log_dic(
|
687 |
+
{f"val/{val_dataset_name}/{k}": v for k, v in val_metric_dic.items()},
|
688 |
+
global_step=self.effective_iter,
|
689 |
+
)
|
690 |
+
# save to file
|
691 |
+
eval_text = eval_dic_to_text(
|
692 |
+
val_metrics=val_metric_dic,
|
693 |
+
dataset_name=val_dataset_name,
|
694 |
+
sample_list_path=val_loader.dataset.filename_ls_path,
|
695 |
+
)
|
696 |
+
_save_to = os.path.join(
|
697 |
+
self.out_dir_eval,
|
698 |
+
f"eval-{val_dataset_name}-iter{self.effective_iter:06d}.txt",
|
699 |
+
)
|
700 |
+
with open(_save_to, "w+") as f:
|
701 |
+
f.write(eval_text)
|
702 |
+
|
703 |
+
# Update main eval metric
|
704 |
+
if 0 == i:
|
705 |
+
main_eval_metric = val_metric_dic[self.main_val_metric]
|
706 |
+
if (
|
707 |
+
"minimize" == self.main_val_metric_goal
|
708 |
+
and main_eval_metric < self.best_metric
|
709 |
+
or "maximize" == self.main_val_metric_goal
|
710 |
+
and main_eval_metric > self.best_metric
|
711 |
+
):
|
712 |
+
self.best_metric = main_eval_metric
|
713 |
+
logging.info(
|
714 |
+
f"Best metric: {self.main_val_metric} = {self.best_metric} at iteration {self.effective_iter}"
|
715 |
+
)
|
716 |
+
# Save a checkpoint
|
717 |
+
self.save_checkpoint(
|
718 |
+
ckpt_name='best', save_train_state=False
|
719 |
+
)
|
720 |
+
|
721 |
+
self.accelerator.wait_for_everyone()
|
722 |
+
|
723 |
+
def visualize(self):
|
724 |
+
for val_loader in self.vis_loaders:
|
725 |
+
vis_dataset_name = val_loader.dataset.disp_name
|
726 |
+
vis_out_dir = os.path.join(
|
727 |
+
self.out_dir_vis, self._get_backup_ckpt_name(), vis_dataset_name
|
728 |
+
)
|
729 |
+
os.makedirs(vis_out_dir, exist_ok=True)
|
730 |
+
_ = self.validate_single_dataset(
|
731 |
+
data_loader=val_loader,
|
732 |
+
metric_tracker=self.val_metrics,
|
733 |
+
save_to_dir=vis_out_dir,
|
734 |
+
)
|
735 |
+
|
736 |
+
@torch.no_grad()
|
737 |
+
def validate_single_dataset(
|
738 |
+
self,
|
739 |
+
data_loader: DataLoader,
|
740 |
+
metric_tracker: MetricTracker,
|
741 |
+
save_to_dir: str = None,
|
742 |
+
):
|
743 |
+
self.model.to(self.device)
|
744 |
+
metric_tracker.reset()
|
745 |
+
|
746 |
+
# Generate seed sequence for consistent evaluation
|
747 |
+
val_init_seed = self.cfg.validation.init_seed
|
748 |
+
val_seed_ls = generate_seed_sequence(val_init_seed, len(data_loader))
|
749 |
+
|
750 |
+
for i, batch in enumerate(
|
751 |
+
tqdm(data_loader, desc=f"evaluating on {data_loader.dataset.disp_name}"),
|
752 |
+
start=1,
|
753 |
+
):
|
754 |
+
|
755 |
+
rgb_int = batch["rgb_int"] # [3, H, W]
|
756 |
+
# GT depth
|
757 |
+
depth_raw_ts = batch["depth_raw_linear"].squeeze()
|
758 |
+
depth_raw = depth_raw_ts.cpu().numpy()
|
759 |
+
depth_raw_ts = depth_raw_ts.to(self.device)
|
760 |
+
valid_mask_ts = batch["valid_mask_raw"].squeeze()
|
761 |
+
valid_mask = valid_mask_ts.cpu().numpy()
|
762 |
+
valid_mask_ts = valid_mask_ts.to(self.device)
|
763 |
+
|
764 |
+
# Random number generator
|
765 |
+
seed = val_seed_ls.pop()
|
766 |
+
if seed is None:
|
767 |
+
generator = None
|
768 |
+
else:
|
769 |
+
generator = torch.Generator(device=self.device)
|
770 |
+
generator.manual_seed(seed)
|
771 |
+
|
772 |
+
# Predict depth
|
773 |
+
pipe_out: MarigoldDepthOutput = self.model.image2depth(
|
774 |
+
rgb_int,
|
775 |
+
denoising_steps=self.cfg.validation.denoising_steps,
|
776 |
+
ensemble_size=self.cfg.validation.ensemble_size,
|
777 |
+
processing_res=self.cfg.validation.processing_res,
|
778 |
+
match_input_res=self.cfg.validation.match_input_res,
|
779 |
+
generator=generator,
|
780 |
+
batch_size=self.cfg.validation.ensemble_size, # use batch size 1 to increase reproducibility
|
781 |
+
color_map=None,
|
782 |
+
show_progress_bar=False,
|
783 |
+
resample_method=self.cfg.validation.resample_method,
|
784 |
+
)
|
785 |
+
|
786 |
+
depth_pred: np.ndarray = pipe_out.depth_np
|
787 |
+
|
788 |
+
if "least_square" == self.cfg.eval.alignment:
|
789 |
+
depth_pred, scale, shift = align_depth_least_square(
|
790 |
+
gt_arr=depth_raw,
|
791 |
+
pred_arr=depth_pred,
|
792 |
+
valid_mask_arr=valid_mask,
|
793 |
+
return_scale_shift=True,
|
794 |
+
max_resolution=self.cfg.eval.align_max_res,
|
795 |
+
)
|
796 |
+
elif "least_square_disparity" == self.cfg.eval.alignment:
|
797 |
+
# convert GT depth -> GT disparity
|
798 |
+
gt_disparity, gt_non_neg_mask = depth2disparity(
|
799 |
+
depth=depth_raw, return_mask=True
|
800 |
+
)
|
801 |
+
|
802 |
+
pred_non_neg_mask = depth_pred > 0
|
803 |
+
valid_nonnegative_mask = valid_mask & gt_non_neg_mask & pred_non_neg_mask
|
804 |
+
|
805 |
+
disparity_pred, scale, shift = align_depth_least_square(
|
806 |
+
gt_arr=gt_disparity,
|
807 |
+
pred_arr=depth_pred,
|
808 |
+
valid_mask_arr=valid_nonnegative_mask,
|
809 |
+
return_scale_shift=True,
|
810 |
+
max_resolution=self.cfg.eval.align_max_res,
|
811 |
+
)
|
812 |
+
# convert to depth
|
813 |
+
disparity_pred = np.clip(
|
814 |
+
disparity_pred, a_min=1e-3, a_max=None
|
815 |
+
) # avoid 0 disparity
|
816 |
+
depth_pred = disparity2depth(disparity_pred)
|
817 |
+
|
818 |
+
# Clip to dataset min max
|
819 |
+
depth_pred = np.clip(
|
820 |
+
depth_pred,
|
821 |
+
a_min=data_loader.dataset.min_depth,
|
822 |
+
a_max=data_loader.dataset.max_depth,
|
823 |
+
)
|
824 |
+
|
825 |
+
# clip to d > 0 for evaluation
|
826 |
+
depth_pred = np.clip(depth_pred, a_min=1e-6, a_max=None)
|
827 |
+
|
828 |
+
# Evaluate
|
829 |
+
sample_metric = []
|
830 |
+
depth_pred_ts = torch.from_numpy(depth_pred).to(self.device)
|
831 |
+
|
832 |
+
for met_func in self.metric_funcs:
|
833 |
+
_metric_name = met_func.__name__
|
834 |
+
_metric = met_func(depth_pred_ts, depth_raw_ts, valid_mask_ts).cuda(self.accelerator.process_index)
|
835 |
+
self.accelerator.wait_for_everyone()
|
836 |
+
_metric = self.accelerator.gather_for_metrics(_metric.unsqueeze(0)).mean().item()
|
837 |
+
sample_metric.append(_metric.__str__())
|
838 |
+
metric_tracker.update(_metric_name, _metric)
|
839 |
+
|
840 |
+
self.accelerator.wait_for_everyone()
|
841 |
+
# Save as 16-bit uint png
|
842 |
+
if save_to_dir is not None:
|
843 |
+
img_name = batch["rgb_relative_path"][0].replace("/", "_")
|
844 |
+
png_save_path = os.path.join(save_to_dir, f"{img_name}.png")
|
845 |
+
depth_to_save = (pipe_out.depth_np * 65535.0).astype(np.uint16)
|
846 |
+
Image.fromarray(depth_to_save).save(png_save_path, mode="I;16")
|
847 |
+
|
848 |
+
return metric_tracker.result()
|
849 |
+
|
850 |
+
def _get_next_seed(self):
|
851 |
+
if 0 == len(self.global_seed_sequence):
|
852 |
+
self.global_seed_sequence = generate_seed_sequence(
|
853 |
+
initial_seed=self.seed,
|
854 |
+
length=self.max_iter * self.gradient_accumulation_steps,
|
855 |
+
)
|
856 |
+
logging.info(
|
857 |
+
f"Global seed sequence is generated, length={len(self.global_seed_sequence)}"
|
858 |
+
)
|
859 |
+
return self.global_seed_sequence.pop()
|
860 |
+
|
861 |
+
def save_miscs(self, ckpt_name):
|
862 |
+
ckpt_dir = os.path.join(self.out_dir_ckpt, ckpt_name)
|
863 |
+
state = {
|
864 |
+
"config": self.cfg,
|
865 |
+
"effective_iter": self.effective_iter,
|
866 |
+
"epoch": self.epoch,
|
867 |
+
"n_batch_in_epoch": self.n_batch_in_epoch,
|
868 |
+
"best_metric": self.best_metric,
|
869 |
+
"in_evaluation": self.in_evaluation,
|
870 |
+
"global_seed_sequence": self.global_seed_sequence,
|
871 |
+
}
|
872 |
+
train_state_path = os.path.join(ckpt_dir, "trainer.ckpt")
|
873 |
+
torch.save(state, train_state_path)
|
874 |
+
|
875 |
+
logging.info(f"Misc state is saved to: {train_state_path}")
|
876 |
+
|
877 |
+
def load_miscs(self, ckpt_path):
|
878 |
+
checkpoint = torch.load(os.path.join(ckpt_path, "trainer.ckpt"))
|
879 |
+
self.effective_iter = checkpoint["effective_iter"]
|
880 |
+
self.epoch = checkpoint["epoch"]
|
881 |
+
self.n_batch_in_epoch = checkpoint["n_batch_in_epoch"]
|
882 |
+
self.in_evaluation = checkpoint["in_evaluation"]
|
883 |
+
self.global_seed_sequence = checkpoint["global_seed_sequence"]
|
884 |
+
|
885 |
+
self.best_metric = checkpoint["best_metric"]
|
886 |
+
|
887 |
+
logging.info(f"Misc state is loaded from {ckpt_path}")
|
888 |
+
|
889 |
+
|
890 |
+
def save_checkpoint(self, ckpt_name, save_train_state):
|
891 |
+
ckpt_dir = os.path.join(self.out_dir_ckpt, ckpt_name)
|
892 |
+
logging.info(f"Saving checkpoint to: {ckpt_dir}")
|
893 |
+
# Backup previous checkpoint
|
894 |
+
temp_ckpt_dir = None
|
895 |
+
if os.path.exists(ckpt_dir) and os.path.isdir(ckpt_dir):
|
896 |
+
temp_ckpt_dir = os.path.join(
|
897 |
+
os.path.dirname(ckpt_dir), f"_old_{os.path.basename(ckpt_dir)}"
|
898 |
+
)
|
899 |
+
if os.path.exists(temp_ckpt_dir):
|
900 |
+
shutil.rmtree(temp_ckpt_dir, ignore_errors=True)
|
901 |
+
os.rename(ckpt_dir, temp_ckpt_dir)
|
902 |
+
logging.debug(f"Old checkpoint is backed up at: {temp_ckpt_dir}")
|
903 |
+
|
904 |
+
# Save UNet
|
905 |
+
unet_path = os.path.join(ckpt_dir, "unet")
|
906 |
+
self.model.unet.save_pretrained(unet_path, safe_serialization=False)
|
907 |
+
logging.info(f"UNet is saved to: {unet_path}")
|
908 |
+
|
909 |
+
if save_train_state:
|
910 |
+
state = {
|
911 |
+
"config": self.cfg,
|
912 |
+
"effective_iter": self.effective_iter,
|
913 |
+
"epoch": self.epoch,
|
914 |
+
"n_batch_in_epoch": self.n_batch_in_epoch,
|
915 |
+
"best_metric": self.best_metric,
|
916 |
+
"in_evaluation": self.in_evaluation,
|
917 |
+
"global_seed_sequence": self.global_seed_sequence,
|
918 |
+
}
|
919 |
+
train_state_path = os.path.join(ckpt_dir, "trainer.ckpt")
|
920 |
+
torch.save(state, train_state_path)
|
921 |
+
# iteration indicator
|
922 |
+
f = open(os.path.join(ckpt_dir, self._get_backup_ckpt_name()), "w")
|
923 |
+
f.close()
|
924 |
+
|
925 |
+
logging.info(f"Trainer state is saved to: {train_state_path}")
|
926 |
+
|
927 |
+
# Remove temp ckpt
|
928 |
+
if temp_ckpt_dir is not None and os.path.exists(temp_ckpt_dir):
|
929 |
+
shutil.rmtree(temp_ckpt_dir, ignore_errors=True)
|
930 |
+
logging.debug("Old checkpoint backup is removed.")
|
931 |
+
|
932 |
+
def load_checkpoint(
|
933 |
+
self, ckpt_path, load_trainer_state=True, resume_lr_scheduler=True
|
934 |
+
):
|
935 |
+
logging.info(f"Loading checkpoint from: {ckpt_path}")
|
936 |
+
# Load UNet
|
937 |
+
_model_path = os.path.join(ckpt_path, "unet", "diffusion_pytorch_model.bin")
|
938 |
+
self.model.unet.load_state_dict(
|
939 |
+
torch.load(_model_path, map_location=self.device)
|
940 |
+
)
|
941 |
+
self.model.unet.to(self.device)
|
942 |
+
logging.info(f"UNet parameters are loaded from {_model_path}")
|
943 |
+
|
944 |
+
# Load training states
|
945 |
+
if load_trainer_state:
|
946 |
+
checkpoint = torch.load(os.path.join(ckpt_path, "trainer.ckpt"))
|
947 |
+
self.effective_iter = checkpoint["effective_iter"]
|
948 |
+
self.epoch = checkpoint["epoch"]
|
949 |
+
self.n_batch_in_epoch = checkpoint["n_batch_in_epoch"]
|
950 |
+
self.in_evaluation = checkpoint["in_evaluation"]
|
951 |
+
self.global_seed_sequence = checkpoint["global_seed_sequence"]
|
952 |
+
|
953 |
+
self.best_metric = checkpoint["best_metric"]
|
954 |
+
|
955 |
+
self.optimizer.load_state_dict(checkpoint["optimizer"])
|
956 |
+
logging.info(f"optimizer state is loaded from {ckpt_path}")
|
957 |
+
|
958 |
+
if resume_lr_scheduler:
|
959 |
+
self.lr_scheduler.load_state_dict(checkpoint["lr_scheduler"])
|
960 |
+
logging.info(f"LR scheduler state is loaded from {ckpt_path}")
|
961 |
+
|
962 |
+
logging.info(
|
963 |
+
f"Checkpoint loaded from: {ckpt_path}. Resume from iteration {self.effective_iter} (epoch {self.epoch})"
|
964 |
+
)
|
965 |
+
return
|
966 |
+
|
967 |
+
def _get_backup_ckpt_name(self):
|
968 |
+
return f"iter_{self.effective_iter:06d}"
|
src/trainer/marigold_xl_trainer.py
ADDED
@@ -0,0 +1,948 @@
1 |
+
# An official reimplemented version of Marigold training script.
|
2 |
+
# Last modified: 2024-04-29
|
3 |
+
#
|
4 |
+
# Copyright 2023 Bingxin Ke, ETH Zurich. All rights reserved.
|
5 |
+
#
|
6 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
7 |
+
# you may not use this file except in compliance with the License.
|
8 |
+
# You may obtain a copy of the License at
|
9 |
+
#
|
10 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
11 |
+
#
|
12 |
+
# Unless required by applicable law or agreed to in writing, software
|
13 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
14 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
15 |
+
# See the License for the specific language governing permissions and
|
16 |
+
# limitations under the License.
|
17 |
+
# --------------------------------------------------------------------------
|
18 |
+
# If you find this code useful, we kindly ask you to cite our paper in your work.
|
19 |
+
# Please find bibtex at: https://github.com/prs-eth/Marigold#-citation
|
20 |
+
# If you use or adapt this code, please attribute to https://github.com/prs-eth/marigold.
|
21 |
+
# More information about the method can be found at https://marigoldmonodepth.github.io
|
22 |
+
# --------------------------------------------------------------------------
|
23 |
+
|
24 |
+
|
25 |
+
import logging
|
26 |
+
import os
|
27 |
+
import pdb
|
28 |
+
import shutil
|
29 |
+
from datetime import datetime
|
30 |
+
from typing import List, Union
|
31 |
+
import safetensors
|
32 |
+
import numpy as np
|
33 |
+
import torch
|
34 |
+
from diffusers import DDPMScheduler
|
35 |
+
from omegaconf import OmegaConf
|
36 |
+
from torch.nn import Conv2d
|
37 |
+
from torch.nn.parameter import Parameter
|
38 |
+
from torch.optim import Adam
|
39 |
+
from torch.optim.lr_scheduler import LambdaLR
|
40 |
+
from torch.utils.data import DataLoader
|
41 |
+
from tqdm import tqdm
|
42 |
+
from PIL import Image
|
43 |
+
# import torch.optim.lr_scheduler
|
44 |
+
|
45 |
+
from marigold.marigold_pipeline import MarigoldPipeline, MarigoldDepthOutput
|
46 |
+
from src.util import metric
|
47 |
+
from src.util.data_loader import skip_first_batches
|
48 |
+
from src.util.logging_util import tb_logger, eval_dic_to_text
|
49 |
+
from src.util.loss import get_loss
|
50 |
+
from src.util.lr_scheduler import IterExponential
|
51 |
+
from src.util.metric import MetricTracker
|
52 |
+
from src.util.multi_res_noise import multi_res_noise_like
|
53 |
+
from src.util.alignment import align_depth_least_square
|
54 |
+
from src.util.seeding import generate_seed_sequence
|
55 |
+
from accelerate import Accelerator
|
56 |
+
import random
|
57 |
+
|
58 |
+
class MarigoldXLTrainer:
|
59 |
+
def __init__(
|
60 |
+
self,
|
61 |
+
cfg: OmegaConf,
|
62 |
+
model: MarigoldPipeline,
|
63 |
+
train_dataloader: DataLoader,
|
64 |
+
device,
|
65 |
+
base_ckpt_dir,
|
66 |
+
out_dir_ckpt,
|
67 |
+
out_dir_eval,
|
68 |
+
out_dir_vis,
|
69 |
+
accumulation_steps: int,
|
70 |
+
separate_list: List = None,
|
71 |
+
val_dataloaders: List[DataLoader] = None,
|
72 |
+
vis_dataloaders: List[DataLoader] = None,
|
73 |
+
timestep_method: str = 'unidiffuser'
|
74 |
+
):
|
75 |
+
self.cfg: OmegaConf = cfg
|
76 |
+
self.model: MarigoldPipeline = model
|
77 |
+
self.device = device
|
78 |
+
self.seed: Union[int, None] = (
|
79 |
+
self.cfg.trainer.init_seed
|
80 |
+
) # used to generate seed sequence, set to `None` to train w/o seeding
|
81 |
+
self.out_dir_ckpt = out_dir_ckpt
|
82 |
+
self.out_dir_eval = out_dir_eval
|
83 |
+
self.out_dir_vis = out_dir_vis
|
84 |
+
self.train_loader: DataLoader = train_dataloader
|
85 |
+
self.val_loaders: List[DataLoader] = val_dataloaders
|
86 |
+
self.vis_loaders: List[DataLoader] = vis_dataloaders
|
87 |
+
self.accumulation_steps: int = accumulation_steps
|
88 |
+
self.separate_list = separate_list
|
89 |
+
self.timestep_method = timestep_method
|
90 |
+
# Adapt input layers
|
91 |
+
# if 8 != self.model.unet.config["in_channels"]:
|
92 |
+
# self._replace_unet_conv_in()
|
93 |
+
# if 8 != self.model.unet.config["out_channels"]:
|
94 |
+
# self._replace_unet_conv_out()
|
95 |
+
|
96 |
+
self.prompt = ['a view of a city skyline from a bridge',
|
97 |
+
'a man and a woman sitting on a couch',
|
98 |
+
'a black car parked in a parking lot next to the water',
|
99 |
+
'Enchanted forest with glowing plants, fairies, and ancient castle.',
|
100 |
+
'Futuristic city with skyscrapers, neon lights, and hovering vehicles.',
|
101 |
+
'Fantasy mountain landscape with waterfalls, dragons, and mythical creatures.']
|
102 |
+
# self.generator = torch.Generator('cuda:0').manual_seed(1024)
|
103 |
+
|
104 |
+
# Encode empty text prompt
|
105 |
+
# self.model.encode_empty_text()
|
106 |
+
# self.empty_text_embed = self.model.empty_text_embed.detach().clone().to(device)
|
107 |
+
|
108 |
+
self.model.unet.enable_xformers_memory_efficient_attention()
|
109 |
+
|
110 |
+
# Trainability
|
111 |
+
self.model.vae.requires_grad_(False)
|
112 |
+
self.model.text_encoder.requires_grad_(False)
|
113 |
+
# self.model.unet.requires_grad_(True)
|
114 |
+
|
115 |
+
grad_part = filter(lambda p: p.requires_grad, self.model.unet.parameters())
|
116 |
+
|
117 |
+
# Optimizer (must be defined after the input layer is adapted)
|
118 |
+
lr = self.cfg.lr
|
119 |
+
self.optimizer = Adam(grad_part, lr=lr)
|
120 |
+
|
121 |
+
total_params = sum(p.numel() for p in self.model.unet.parameters())
|
122 |
+
total_params_m = total_params / 1_000_000
|
123 |
+
print(f"Total parameters: {total_params_m:.2f}M")
|
124 |
+
trainable_params = sum(p.numel() for p in self.model.unet.parameters() if p.requires_grad)
|
125 |
+
trainable_params_m = trainable_params / 1_000_000
|
126 |
+
print(f"Trainable parameters: {trainable_params_m:.2f}M")
|
127 |
+
|
128 |
+
# LR scheduler
|
129 |
+
lr_func = IterExponential(
|
130 |
+
total_iter_length=self.cfg.lr_scheduler.kwargs.total_iter,
|
131 |
+
final_ratio=self.cfg.lr_scheduler.kwargs.final_ratio,
|
132 |
+
warmup_steps=self.cfg.lr_scheduler.kwargs.warmup_steps,
|
133 |
+
)
|
134 |
+
self.lr_scheduler = LambdaLR(optimizer=self.optimizer, lr_lambda=lr_func)
|
135 |
+
|
136 |
+
# Loss
|
137 |
+
self.loss = get_loss(loss_name=self.cfg.loss.name, **self.cfg.loss.kwargs)
|
138 |
+
|
139 |
+
# Training noise scheduler
|
140 |
+
self.training_noise_scheduler: DDPMScheduler = DDPMScheduler.from_pretrained(
|
141 |
+
os.path.join(
|
142 |
+
cfg.trainer.training_noise_scheduler.pretrained_path,
|
143 |
+
"scheduler",
|
144 |
+
)
|
145 |
+
)
|
146 |
+
self.prediction_type = self.training_noise_scheduler.config.prediction_type
|
147 |
+
assert (
|
148 |
+
self.prediction_type == self.model.scheduler.config.prediction_type
|
149 |
+
), "Different prediction types"
|
150 |
+
self.scheduler_timesteps = (
|
151 |
+
self.training_noise_scheduler.config.num_train_timesteps
|
152 |
+
)
|
153 |
+
|
154 |
+
# Eval metrics
|
155 |
+
self.metric_funcs = [getattr(metric, _met) for _met in cfg.eval.eval_metrics]
|
156 |
+
self.train_metrics = MetricTracker(*["loss", 'rgb_loss', 'depth_loss'])
|
157 |
+
self.val_metrics = MetricTracker(*[m.__name__ for m in self.metric_funcs])
|
158 |
+
# main metric for best checkpoint saving
|
159 |
+
self.main_val_metric = cfg.validation.main_val_metric
|
160 |
+
self.main_val_metric_goal = cfg.validation.main_val_metric_goal
|
161 |
+
assert (
|
162 |
+
self.main_val_metric in cfg.eval.eval_metrics
|
163 |
+
), f"Main eval metric `{self.main_val_metric}` not found in evaluation metrics."
|
164 |
+
self.best_metric = 1e8 if "minimize" == self.main_val_metric_goal else -1e8
|
165 |
+
|
166 |
+
# Settings
|
167 |
+
self.max_epoch = self.cfg.max_epoch
|
168 |
+
self.max_iter = self.cfg.max_iter
|
169 |
+
self.gradient_accumulation_steps = accumulation_steps
|
170 |
+
self.gt_depth_type = self.cfg.gt_depth_type
|
171 |
+
self.gt_mask_type = self.cfg.gt_mask_type
|
172 |
+
self.save_period = self.cfg.trainer.save_period
|
173 |
+
self.backup_period = self.cfg.trainer.backup_period
|
174 |
+
self.val_period = self.cfg.trainer.validation_period
|
175 |
+
self.vis_period = self.cfg.trainer.visualization_period
|
176 |
+
|
177 |
+
# Multi-resolution noise
|
178 |
+
self.apply_multi_res_noise = self.cfg.multi_res_noise is not None
|
179 |
+
if self.apply_multi_res_noise:
|
180 |
+
self.mr_noise_strength = self.cfg.multi_res_noise.strength
|
181 |
+
self.annealed_mr_noise = self.cfg.multi_res_noise.annealed
|
182 |
+
self.mr_noise_downscale_strategy = (
|
183 |
+
self.cfg.multi_res_noise.downscale_strategy
|
184 |
+
)
|
185 |
+
|
186 |
+
# Internal variables
|
187 |
+
self.epoch = 0
|
188 |
+
self.n_batch_in_epoch = 0 # batch index in the epoch, used when resuming training
|
189 |
+
self.effective_iter = 0 # how many times optimizer.step() is called
|
190 |
+
self.in_evaluation = False
|
191 |
+
self.global_seed_sequence: List = [] # consistent global seed sequence, used to seed random generator, to ensure consistency when resuming
|
192 |
+
|
193 |
+
def _replace_unet_conv_in(self):
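# The pretrained 4-channel conv_in weights are kept for the RGB latent; the 4 extra
# depth-latent channels are zero-initialized, so the adapted UNet initially ignores them.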
|
194 |
+
# replace the first layer to accept 8 in_channels
|
195 |
+
_weight = self.model.unet.conv_in.weight.clone() # [320, 4, 3, 3]
|
196 |
+
_bias = self.model.unet.conv_in.bias.clone() # [320]
|
197 |
+
zero_weight = torch.zeros(_weight.shape).to(_weight.device)
|
198 |
+
_weight = torch.cat([_weight, zero_weight], dim=1)
|
199 |
+
# _weight = _weight.repeat((1, 2, 1, 1)) # Keep selected channel(s)
|
200 |
+
# half the activation magnitude
|
201 |
+
# _weight *= 0.5
|
202 |
+
# new conv_in channel
|
203 |
+
_n_convin_out_channel = self.model.unet.conv_in.out_channels
|
204 |
+
_new_conv_in = Conv2d(
|
205 |
+
8, _n_convin_out_channel, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)
|
206 |
+
)
|
207 |
+
_new_conv_in.weight = Parameter(_weight)
|
208 |
+
_new_conv_in.bias = Parameter(_bias)
|
209 |
+
self.model.unet.conv_in = _new_conv_in
|
210 |
+
logging.info("Unet conv_in layer is replaced")
|
211 |
+
# replace config
|
212 |
+
self.model.unet.config["in_channels"] = 8
|
213 |
+
logging.info("Unet config is updated")
|
214 |
+
return
|
215 |
+
|
216 |
+
def _replace_unet_conv_out(self):
|
217 |
+
# replace the output layer to produce 8 out_channels
|
218 |
+
_weight = self.model.unet.conv_out.weight.clone() # [4, 320, 3, 3]
|
219 |
+
_bias = self.model.unet.conv_out.bias.clone() # [4]
|
220 |
+
_weight = _weight.repeat((2, 1, 1, 1)) # Keep selected channel(s)
|
221 |
+
_bias = _bias.repeat((2))
|
222 |
+
# half the activation magnitude
|
223 |
+
|
224 |
+
# new conv_out layer with doubled output channels
|
225 |
+
_n_convin_out_channel = self.model.unet.conv_out.out_channels
|
226 |
+
_new_conv_out = Conv2d(
|
227 |
+
_n_convin_out_channel, 8, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)
|
228 |
+
)
|
229 |
+
_new_conv_out.weight = Parameter(_weight)
|
230 |
+
_new_conv_out.bias = Parameter(_bias)
|
231 |
+
self.model.unet.conv_out = _new_conv_out
|
232 |
+
logging.info("Unet conv_out layer is replaced")
|
233 |
+
# replace config
|
234 |
+
self.model.unet.config["out_channels"] = 8
|
235 |
+
logging.info("Unet config is updated")
|
236 |
+
return
|
237 |
+
|
238 |
+
def parallel_train(self, t_end=None, accelerator=None):
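# Accelerate-based training loop: prepares model/optimizer/dataloaders, resumes from the
# 'latest' checkpoint if present, then iterates epochs with gradient accumulation,
# periodic checkpointing, RGB-D sample logging and validation.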
|
239 |
+
logging.info("Start training")
|
240 |
+
|
241 |
+
self.model, self.optimizer, self.train_loader, self.lr_scheduler = accelerator.prepare(
|
242 |
+
self.model, self.optimizer, self.train_loader, self.lr_scheduler
|
243 |
+
)
|
244 |
+
self.accelerator = accelerator
|
245 |
+
if self.val_loaders is not None:
|
246 |
+
for idx, loader in enumerate(self.val_loaders):
|
247 |
+
self.val_loaders[idx] = accelerator.prepare(loader)
|
248 |
+
|
249 |
+
if os.path.exists(os.path.join(self.out_dir_ckpt, 'latest')):
|
250 |
+
accelerator.load_state(os.path.join(self.out_dir_ckpt, 'latest'))
|
251 |
+
self.load_miscs(os.path.join(self.out_dir_ckpt, 'latest'))
|
252 |
+
|
253 |
+
self.train_metrics.reset()
|
254 |
+
accumulated_step = 0
|
255 |
+
for epoch in range(self.epoch, self.max_epoch + 1):
|
256 |
+
self.epoch = epoch
|
257 |
+
logging.debug(f"epoch: {self.epoch}")
|
258 |
+
|
259 |
+
# Skip previous batches when resume
|
260 |
+
for batch in skip_first_batches(self.train_loader, self.n_batch_in_epoch):
|
261 |
+
self.model.unet.train()
|
262 |
+
|
263 |
+
# globally consistent random generators
|
264 |
+
if self.seed is not None:
|
265 |
+
local_seed = self._get_next_seed()
|
266 |
+
rand_num_generator = torch.Generator(device=self.model.device)
|
267 |
+
rand_num_generator.manual_seed(local_seed)
|
268 |
+
else:
|
269 |
+
rand_num_generator = None
|
270 |
+
|
271 |
+
# >>> With gradient accumulation >>>
|
272 |
+
|
273 |
+
# Get data
|
274 |
+
rgb = batch["rgb_norm"].to(self.model.device)
|
275 |
+
depth_gt_for_latent = batch[self.gt_depth_type].to(self.model.device)
|
276 |
+
batch_size = rgb.shape[0]
|
277 |
+
|
278 |
+
if self.gt_mask_type is not None:
|
279 |
+
valid_mask_for_latent = batch[self.gt_mask_type].to(self.model.device)
|
280 |
+
invalid_mask = ~valid_mask_for_latent
|
281 |
+
valid_mask_down = ~torch.max_pool2d(
|
282 |
+
invalid_mask.float(), 8, 8
|
283 |
+
).bool()
|
284 |
+
valid_mask_down = valid_mask_down.repeat((1, 4, 1, 1))
|
285 |
+
|
286 |
+
with torch.no_grad():
|
287 |
+
# Encode image
|
288 |
+
rgb_latent = self.model.encode_rgb(rgb) # [B, 4, h, w]
|
289 |
+
# Encode GT depth
|
290 |
+
gt_depth_latent = self.encode_depth(
|
291 |
+
depth_gt_for_latent
|
292 |
+
) # [B, 4, h, w]
|
293 |
+
|
294 |
+
# Sample a random timestep for each image
|
295 |
+
if self.cfg.loss.depth_factor == 1:
|
296 |
+
rgb_timesteps = torch.zeros(
|
297 |
+
(batch_size),
|
298 |
+
device=self.model.device
|
299 |
+
).long() # [B]
|
300 |
+
depth_timesteps = torch.randint(
|
301 |
+
0,
|
302 |
+
self.scheduler_timesteps,
|
303 |
+
(batch_size,),
|
304 |
+
device=self.model.device,
|
305 |
+
generator=rand_num_generator,
|
306 |
+
).long() # [B]
|
307 |
+
elif self.timestep_method == 'unidiffuser':
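# 'unidiffuser' strategy: sample independent timesteps for the RGB and depth branches,
# so joint, image-to-depth and depth-to-image generation are all covered implicitly.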
|
308 |
+
rgb_timesteps = torch.randint(
|
309 |
+
0,
|
310 |
+
self.scheduler_timesteps,
|
311 |
+
(batch_size,),
|
312 |
+
device=self.model.device,
|
313 |
+
generator=rand_num_generator,
|
314 |
+
).long() # [B]
|
315 |
+
depth_timesteps = torch.randint(
|
316 |
+
0,
|
317 |
+
self.scheduler_timesteps,
|
318 |
+
(batch_size,),
|
319 |
+
device=self.model.device,
|
320 |
+
generator=rand_num_generator,
|
321 |
+
).long() # [B]
|
322 |
+
elif self.timestep_method == 'partition':
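# 'partition' strategy: with roughly equal probability use a shared timestep (joint
# generation), a clean RGB latent (image-to-depth), or a clean depth latent (depth-to-image).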
|
323 |
+
rand_num = random.random()
|
324 |
+
if rand_num < 0.3333:
|
325 |
+
# joint generation
|
326 |
+
rgb_timesteps = torch.randint(
|
327 |
+
0,
|
328 |
+
self.scheduler_timesteps,
|
329 |
+
(batch_size,),
|
330 |
+
device=self.model.device,
|
331 |
+
generator=rand_num_generator,
|
332 |
+
).long() # [B]
|
333 |
+
depth_timesteps = rgb_timesteps
|
334 |
+
elif rand_num < 0.6666:
|
335 |
+
# image2depth generation
|
336 |
+
rgb_timesteps = torch.zeros(
|
337 |
+
(batch_size),
|
338 |
+
device=self.model.device
|
339 |
+
).long() # [B]
|
340 |
+
depth_timesteps = torch.randint(
|
341 |
+
0,
|
342 |
+
self.scheduler_timesteps,
|
343 |
+
(batch_size,),
|
344 |
+
device=self.model.device,
|
345 |
+
generator=rand_num_generator,
|
346 |
+
).long() # [B]
|
347 |
+
else:
|
348 |
+
# depth2image generation
|
349 |
+
rgb_timesteps = torch.randint(
|
350 |
+
0,
|
351 |
+
self.scheduler_timesteps,
|
352 |
+
(batch_size,),
|
353 |
+
device=self.model.device,
|
354 |
+
generator=rand_num_generator,
|
355 |
+
).long() # [B]
|
356 |
+
depth_timesteps = torch.zeros(
|
357 |
+
(batch_size),
|
358 |
+
device=self.model.device
|
359 |
+
).long() # [B]
|
360 |
+
|
361 |
+
# Sample noise
|
362 |
+
if self.apply_multi_res_noise:
|
363 |
+
rgb_strength = self.mr_noise_strength
|
364 |
+
if self.annealed_mr_noise:
|
365 |
+
# calculate strength depending on t
|
366 |
+
rgb_strength = rgb_strength * (rgb_timesteps / self.scheduler_timesteps)
|
367 |
+
rgb_noise = multi_res_noise_like(
|
368 |
+
rgb_latent,
|
369 |
+
strength=rgb_strength,
|
370 |
+
downscale_strategy=self.mr_noise_downscale_strategy,
|
371 |
+
generator=rand_num_generator,
|
372 |
+
device=self.model.device,
|
373 |
+
)
|
374 |
+
|
375 |
+
depth_strength = self.mr_noise_strength
|
376 |
+
if self.annealed_mr_noise:
|
377 |
+
# calculate strength depending on t
|
378 |
+
depth_strength = depth_strength * (depth_timesteps / self.scheduler_timesteps)
|
379 |
+
depth_noise = multi_res_noise_like(
|
380 |
+
gt_depth_latent,
|
381 |
+
strength=depth_strength,
|
382 |
+
downscale_strategy=self.mr_noise_downscale_strategy,
|
383 |
+
generator=rand_num_generator,
|
384 |
+
device=self.model.device,
|
385 |
+
)
|
386 |
+
else:
|
387 |
+
rgb_noise = torch.randn(
|
388 |
+
rgb_latent.shape,
|
389 |
+
device=self.model.device,
|
390 |
+
generator=rand_num_generator,
|
391 |
+
) # [B, 8, h, w]
|
392 |
+
|
393 |
+
depth_noise = torch.randn(
|
394 |
+
gt_depth_latent.shape,
|
395 |
+
device=self.model.device,
|
396 |
+
generator=rand_num_generator,
|
397 |
+
) # [B, 8, h, w]
|
398 |
+
# Add noise to the latents (diffusion forward process)
|
399 |
+
|
400 |
+
noisy_rgb_latents = self.training_noise_scheduler.add_noise(
|
401 |
+
rgb_latent, rgb_noise, rgb_timesteps
|
402 |
+
) # [B, 4, h, w]
|
403 |
+
noisy_depth_latents = self.training_noise_scheduler.add_noise(
|
404 |
+
gt_depth_latent, depth_noise, depth_timesteps
|
405 |
+
) # [B, 4, h, w]
|
406 |
+
|
407 |
+
noisy_latents = torch.cat(
|
408 |
+
[noisy_rgb_latents, noisy_depth_latents], dim=1
|
409 |
+
).float() # [B, 8, h, w]
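# The 8-channel UNet input stacks both modalities: channels 0-3 hold the noisy RGB latent,
# channels 4-7 the noisy depth latent (matching the model_pred slicing below).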
|
410 |
+
|
411 |
+
# Text embedding
|
412 |
+
batch_text_embed = []
|
413 |
+
batch_pooled_text_embed = []
|
414 |
+
for p in batch['text']:
|
415 |
+
prompt_embed, pooled_prompt_embed = self.model.encode_text(p)
|
416 |
+
batch_text_embed.append(prompt_embed)
|
417 |
+
batch_pooled_text_embed.append(pooled_prompt_embed)
|
418 |
+
batch_text_embed = torch.cat(batch_text_embed, dim=0)
|
419 |
+
batch_pooled_text_embed = torch.cat(batch_pooled_text_embed, dim=0)
|
420 |
+
# input_ids = {k:v.squeeze().to(self.model.device) for k,v in batch['text'].items()}
|
421 |
+
# prompt_embed, pooled_prompt_embed = self.model.encode_text(batch['text'])
|
422 |
+
# text_embed = self.empty_text_embed.to(device).repeat(
|
423 |
+
# (batch_size, 1, 1)
|
424 |
+
# ) # [B, 77, 1024]
|
425 |
+
# Predict the noise residual
|
426 |
+
add_time_ids = self.model._get_add_time_ids(
|
427 |
+
(batch['rgb_int'].shape[-2], batch['rgb_int'].shape[-1]), (0, 0), (batch['rgb_int'].shape[-2], batch['rgb_int'].shape[-1]), dtype=batch_text_embed.dtype
|
428 |
+
)
|
429 |
+
# pdb.set_trace()  # leftover debug breakpoint disabled: it would halt every training step
|
430 |
+
dtype = self.model.unet.dtype
|
431 |
+
added_cond_kwargs = {"text_embeds": batch_pooled_text_embed.to(self.model.device).to(dtype), "time_ids": add_time_ids.to(self.model.device).to(dtype)}
|
432 |
+
model_pred = self.model.unet(
|
433 |
+
noisy_latents.to(self.model.unet.dtype), rgb_timesteps, depth_timesteps, encoder_hidden_states=batch_text_embed.to(dtype),
|
434 |
+
added_cond_kwargs=added_cond_kwargs, separate_list=self.separate_list
|
435 |
+
).sample # [B, 4, h, w]
|
436 |
+
if torch.isnan(model_pred).any():
|
437 |
+
logging.warning("model_pred contains NaN.")
|
438 |
+
|
439 |
+
# Get the target for loss depending on the prediction type
|
440 |
+
if "sample" == self.prediction_type:
|
441 |
+
rgb_target = rgb_latent
|
442 |
+
depth_target = gt_depth_latent
|
443 |
+
elif "epsilon" == self.prediction_type:
|
444 |
+
rgb_target = rgb_noise  # epsilon prediction targets the added noise
|
445 |
+
depth_target = depth_noise
|
446 |
+
elif "v_prediction" == self.prediction_type:
|
447 |
+
rgb_target = self.training_noise_scheduler.get_velocity(
|
448 |
+
rgb_latent, rgb_noise, rgb_timesteps
|
449 |
+
) # [B, 4, h, w]
|
450 |
+
depth_target = self.training_noise_scheduler.get_velocity(
|
451 |
+
gt_depth_latent, depth_noise, depth_timesteps
|
452 |
+
) # [B, 4, h, w]
|
453 |
+
else:
|
454 |
+
raise ValueError(f"Unknown prediction type {self.prediction_type}")
|
455 |
+
# Masked latent loss
|
456 |
+
with accelerator.accumulate(self.model):
|
457 |
+
if self.gt_mask_type is not None:
|
458 |
+
depth_loss = self.loss(
|
459 |
+
model_pred[:, 4:, :, :][valid_mask_down].float(),
|
460 |
+
depth_target[valid_mask_down].float(),
|
461 |
+
)
|
462 |
+
else:
|
463 |
+
depth_loss = self.cfg.loss.depth_factor * self.loss(model_pred[:, 4:, :, :].float(),depth_target.float())
|
464 |
+
|
465 |
+
rgb_loss = (1 - self.cfg.loss.depth_factor) * self.loss(model_pred[:, 0:4, :, :].float(), rgb_target.float())
|
466 |
+
if self.cfg.loss.depth_factor == 1:
|
467 |
+
loss = depth_loss
|
468 |
+
else:
|
469 |
+
loss = rgb_loss + depth_loss
|
470 |
+
|
471 |
+
self.train_metrics.update("loss", loss.item())
|
472 |
+
self.train_metrics.update("rgb_loss", rgb_loss.item())
|
473 |
+
self.train_metrics.update("depth_loss", depth_loss.item())
|
474 |
+
# loss = loss / self.gradient_accumulation_steps
|
475 |
+
accelerator.backward(loss)
|
476 |
+
self.optimizer.step()
|
477 |
+
self.optimizer.zero_grad()
|
478 |
+
# loss.backward()
|
479 |
+
self.n_batch_in_epoch += 1
|
480 |
+
# print(accelerator.process_index, self.lr_scheduler.get_last_lr())
|
481 |
+
self.lr_scheduler.step(self.effective_iter)
|
482 |
+
|
483 |
+
if accelerator.sync_gradients:
|
484 |
+
accumulated_step += 1
|
485 |
+
|
486 |
+
if accumulated_step >= self.gradient_accumulation_steps:
|
487 |
+
accumulated_step = 0
|
488 |
+
self.effective_iter += 1
|
489 |
+
|
490 |
+
if accelerator.is_main_process:
|
491 |
+
# Log to tensorboard
|
492 |
+
if self.effective_iter == 1:
|
493 |
+
generator = torch.Generator(self.model.device).manual_seed(1024)
|
494 |
+
img = self.model.generate_rgbd(self.prompt, num_inference_steps=50, generator=generator,
|
495 |
+
show_pbar=True)
|
496 |
+
for idx in range(len(self.prompt)):
|
497 |
+
tb_logger.writer.add_image(f'image/{self.prompt[idx]}', img[idx], self.effective_iter)
|
498 |
+
|
499 |
+
accumulated_loss = self.train_metrics.result()["loss"]
|
500 |
+
rgb_loss = self.train_metrics.result()["rgb_loss"]
|
501 |
+
depth_loss = self.train_metrics.result()["depth_loss"]
|
502 |
+
tb_logger.log_dic(
|
503 |
+
{
|
504 |
+
f"train/{k}": v
|
505 |
+
for k, v in self.train_metrics.result().items()
|
506 |
+
},
|
507 |
+
global_step=self.effective_iter,
|
508 |
+
)
|
509 |
+
tb_logger.writer.add_scalar(
|
510 |
+
"lr",
|
511 |
+
self.lr_scheduler.get_last_lr()[0],
|
512 |
+
global_step=self.effective_iter,
|
513 |
+
)
|
514 |
+
tb_logger.writer.add_scalar(
|
515 |
+
"n_batch_in_epoch",
|
516 |
+
self.n_batch_in_epoch,
|
517 |
+
global_step=self.effective_iter,
|
518 |
+
)
|
519 |
+
logging.info(
|
520 |
+
f"iter {self.effective_iter:5d} (epoch {epoch:2d}): loss={accumulated_loss:.5f}, rgb_loss={rgb_loss:.5f}, depth_loss={depth_loss:.5f}"
|
521 |
+
)
|
522 |
+
accelerator.wait_for_everyone()
|
523 |
+
|
524 |
+
if self.save_period > 0 and 0 == self.effective_iter % self.save_period:
|
525 |
+
accelerator.save_state(output_dir=os.path.join(self.out_dir_ckpt, 'latest'))
|
526 |
+
unwrapped_model = accelerator.unwrap_model(self.model)
|
527 |
+
if accelerator.is_main_process:
|
528 |
+
accelerator.save_model(unwrapped_model.unet,
|
529 |
+
os.path.join(self.out_dir_ckpt, 'latest'), safe_serialization=False)
|
530 |
+
self.save_miscs('latest')
|
531 |
+
|
532 |
+
# RGB-D joint generation
|
533 |
+
generator = torch.Generator(self.model.device).manual_seed(1024)
|
534 |
+
img = self.model.generate_rgbd(self.prompt, num_inference_steps=50, generator=generator,show_pbar=False)
|
535 |
+
for idx in range(len(self.prompt)):
|
536 |
+
tb_logger.writer.add_image(f'image/{self.prompt[idx]}', img[idx], self.effective_iter)
|
537 |
+
|
538 |
+
# depth to RGB generation
|
539 |
+
self._depth2image()
|
540 |
+
# from diffusers import StableDiffusionControlNetInpaintPipeline  # stray unused import
|
541 |
+
# RGB to depth generation
|
542 |
+
self._image2depth()
|
543 |
+
|
544 |
+
accelerator.wait_for_everyone()
|
545 |
+
|
546 |
+
accelerator.wait_for_everyone()
|
547 |
+
|
548 |
+
if self.backup_period > 0 and 0 == self.effective_iter % self.backup_period:
|
549 |
+
unwrapped_model = accelerator.unwrap_model(self.model)
|
550 |
+
if accelerator.is_main_process:
|
551 |
+
unwrapped_model.unet.save_pretrained(
|
552 |
+
os.path.join(self.out_dir_ckpt, self._get_backup_ckpt_name()))
|
553 |
+
accelerator.wait_for_everyone()
|
554 |
+
|
555 |
+
if self.val_period > 0 and 0 == self.effective_iter % self.val_period:
|
556 |
+
self.validate()
|
557 |
+
|
558 |
+
# End of training
|
559 |
+
if self.max_iter > 0 and self.effective_iter >= self.max_iter:
|
560 |
+
unwrapped_model = accelerator.unwrap_model(self.model)
|
561 |
+
if accelerator.is_main_process:
|
562 |
+
unwrapped_model.unet.save_pretrained(
|
563 |
+
os.path.join(self.out_dir_ckpt, self._get_backup_ckpt_name()))
|
564 |
+
accelerator.wait_for_everyone()
|
565 |
+
return
|
566 |
+
|
567 |
+
torch.cuda.empty_cache()
|
568 |
+
# <<< Effective batch end <<<
|
569 |
+
|
570 |
+
# Epoch end
|
571 |
+
self.n_batch_in_epoch = 0
|
572 |
+
|
573 |
+
def _image2depth(self):
|
574 |
+
generator = torch.Generator(self.model.device).manual_seed(1024)
|
575 |
+
image2dept_paths = ['/home/aiops/wangzh/data/scannet/scene0593_00/color/000100.jpg',
|
576 |
+
'/home/aiops/wangzh/data/scannet/scene0593_00/color/000700.jpg',
|
577 |
+
'/home/aiops/wangzh/data/scannet/scene0591_01/color/000600.jpg',
|
578 |
+
'/home/aiops/wangzh/data/scannet/scene0591_01/color/001500.jpg']
|
579 |
+
for img_idx, image_path in enumerate(image2dept_paths):
|
580 |
+
rgb_input = Image.open(image_path)
|
581 |
+
depth_pred: MarigoldDepthOutput = self.model.image2depth(
|
582 |
+
rgb_input,
|
583 |
+
denoising_steps=self.cfg.validation.denoising_steps,
|
584 |
+
ensemble_size=self.cfg.validation.ensemble_size,
|
585 |
+
processing_res=self.cfg.validation.processing_res,
|
586 |
+
match_input_res=self.cfg.validation.match_input_res,
|
587 |
+
generator=generator,
|
588 |
+
batch_size=self.cfg.validation.ensemble_size,
|
589 |
+
# use batch size 1 to increase reproducibility
|
590 |
+
color_map="Spectral",
|
591 |
+
show_progress_bar=False,
|
592 |
+
resample_method=self.cfg.validation.resample_method,
|
593 |
+
)
|
594 |
+
img = self.model.post_process_rgbd(['None'], [rgb_input], [depth_pred['depth_colored']])
|
595 |
+
tb_logger.writer.add_image(f'image2depth_{img_idx}', img[0], self.effective_iter)
|
596 |
+
|
597 |
+
def _depth2image(self):
|
598 |
+
generator = torch.Generator(self.model.device).manual_seed(1024)
|
599 |
+
if "least_square_disparity" == self.cfg.eval.alignment:
|
600 |
+
depth2image_path = ['/home/aiops/wangzh/data/ori_depth_part0-0/sa_10000335.jpg',
|
601 |
+
'/home/aiops/wangzh/data/ori_depth_part0-0/sa_3572319.jpg',
|
602 |
+
'/home/aiops/wangzh/data/ori_depth_part0-0/sa_457934.jpg']
|
603 |
+
else:
|
604 |
+
depth2image_path = ['/home/aiops/wangzh/data/depth_part0-0/sa_10000335.jpg',
|
605 |
+
'/home/aiops/wangzh/data/depth_part0-0/sa_3572319.jpg',
|
606 |
+
'/home/aiops/wangzh/data/depth_part0-0/sa_457934.jpg']
|
607 |
+
prompts = ['Red car parked in the factory',
|
608 |
+
'White gothic church with cemetery next to it',
|
609 |
+
'House with red roof and starry sky in the background']
|
610 |
+
for img_idx, depth_path in enumerate(depth2image_path):
|
611 |
+
depth_input = Image.open(depth_path)
|
612 |
+
image_pred = self.model.single_depth2image(
|
613 |
+
depth_input,
|
614 |
+
prompts[img_idx],
|
615 |
+
num_inference_steps=50,
|
616 |
+
processing_res=1024,
|
617 |
+
generator=generator,
|
618 |
+
show_pbar=False,
|
619 |
+
resample_method=self.cfg.validation.resample_method,
|
620 |
+
)
|
621 |
+
img = self.model.post_process_rgbd([prompts[img_idx]], [image_pred], [depth_input])
|
622 |
+
tb_logger.writer.add_image(f'depth2image_{img_idx}', img[0], self.effective_iter)
|
623 |
+
|
624 |
+
def encode_depth(self, depth_in):
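# The depth map is replicated to 3 channels so the frozen RGB VAE encoder can be reused
# to obtain a 4-channel depth latent.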
|
625 |
+
# stack depth into 3-channel
|
626 |
+
stacked = self.stack_depth_images(depth_in)
|
627 |
+
# encode using VAE encoder
|
628 |
+
depth_latent = self.model.encode_rgb(stacked)
|
629 |
+
return depth_latent
|
630 |
+
|
631 |
+
@staticmethod
|
632 |
+
def stack_depth_images(depth_in):
|
633 |
+
if 4 == len(depth_in.shape):
|
634 |
+
stacked = depth_in.repeat(1, 3, 1, 1)
|
635 |
+
elif 3 == len(depth_in.shape):
|
636 |
+
stacked = depth_in.unsqueeze(1)
|
637 |
+
stacked = stacked.repeat(1, 3, 1, 1)  # repeat the unsqueezed tensor, not the original input
|
638 |
+
return stacked
|
639 |
+
|
640 |
+
def _train_step_callback(self):
|
641 |
+
"""Executed after every iteration"""
|
642 |
+
# Save backup (with a larger interval, without training states)
|
643 |
+
if self.backup_period > 0 and 0 == self.effective_iter % self.backup_period:
|
644 |
+
self.save_checkpoint(
|
645 |
+
ckpt_name=self._get_backup_ckpt_name(), save_train_state=False
|
646 |
+
)
|
647 |
+
|
648 |
+
_is_latest_saved = False
|
649 |
+
# Validation
|
650 |
+
if self.val_period > 0 and 0 == self.effective_iter % self.val_period:
|
651 |
+
self.in_evaluation = True # flag to do evaluation in resume run if validation is not finished
|
652 |
+
self.save_checkpoint(ckpt_name="latest", save_train_state=True)
|
653 |
+
_is_latest_saved = True
|
654 |
+
self.validate()
|
655 |
+
self.in_evaluation = False
|
656 |
+
self.save_checkpoint(ckpt_name="latest", save_train_state=True)
|
657 |
+
|
658 |
+
# Save training checkpoint (can be resumed)
|
659 |
+
if (
|
660 |
+
self.save_period > 0
|
661 |
+
and 0 == self.effective_iter % self.save_period
|
662 |
+
and not _is_latest_saved
|
663 |
+
):
|
664 |
+
generator = torch.Generator(self.model.device).manual_seed(1024)
|
665 |
+
img = self.model.generate_rgbd(self.prompt, num_inference_steps=50, generator=generator, show_pbar=True)
|
666 |
+
for idx in range(len(self.prompt)):
|
667 |
+
tb_logger.writer.add_image(f'image/{self.prompt[idx]}', img[idx], self.effective_iter)
|
668 |
+
|
669 |
+
self.save_checkpoint(ckpt_name="latest", save_train_state=True)
|
670 |
+
|
671 |
+
# Visualization
|
672 |
+
if self.vis_period > 0 and 0 == self.effective_iter % self.vis_period:
|
673 |
+
self.visualize()
|
674 |
+
|
675 |
+
def validate(self):
|
676 |
+
for i, val_loader in enumerate(self.val_loaders):
|
677 |
+
val_dataset_name = val_loader.dataset.disp_name
|
678 |
+
val_metric_dic = self.validate_single_dataset(
|
679 |
+
data_loader=val_loader, metric_tracker=self.val_metrics
|
680 |
+
)
|
681 |
+
|
682 |
+
if self.accelerator.is_main_process:
|
683 |
+
val_metric_dic = {k:torch.tensor(v).cuda() for k,v in val_metric_dic.items()}
|
684 |
+
|
685 |
+
tb_logger.log_dic(
|
686 |
+
{f"val/{val_dataset_name}/{k}": v for k, v in val_metric_dic.items()},
|
687 |
+
global_step=self.effective_iter,
|
688 |
+
)
|
689 |
+
# save to file
|
690 |
+
eval_text = eval_dic_to_text(
|
691 |
+
val_metrics=val_metric_dic,
|
692 |
+
dataset_name=val_dataset_name,
|
693 |
+
sample_list_path=val_loader.dataset.filename_ls_path,
|
694 |
+
)
|
695 |
+
_save_to = os.path.join(
|
696 |
+
self.out_dir_eval,
|
697 |
+
f"eval-{val_dataset_name}-iter{self.effective_iter:06d}.txt",
|
698 |
+
)
|
699 |
+
with open(_save_to, "w+") as f:
|
700 |
+
f.write(eval_text)
|
701 |
+
|
702 |
+
# Update main eval metric
|
703 |
+
if 0 == i:
|
704 |
+
main_eval_metric = val_metric_dic[self.main_val_metric]
|
705 |
+
if (
|
706 |
+
"minimize" == self.main_val_metric_goal
|
707 |
+
and main_eval_metric < self.best_metric
|
708 |
+
or "maximize" == self.main_val_metric_goal
|
709 |
+
and main_eval_metric > self.best_metric
|
710 |
+
):
|
711 |
+
self.best_metric = main_eval_metric
|
712 |
+
logging.info(
|
713 |
+
f"Best metric: {self.main_val_metric} = {self.best_metric} at iteration {self.effective_iter}"
|
714 |
+
)
|
715 |
+
# Save a checkpoint
|
716 |
+
self.save_checkpoint(
|
717 |
+
ckpt_name='best', save_train_state=False
|
718 |
+
)
|
719 |
+
|
720 |
+
self.accelerator.wait_for_everyone()
|
721 |
+
|
722 |
+
def visualize(self):
|
723 |
+
for val_loader in self.vis_loaders:
|
724 |
+
vis_dataset_name = val_loader.dataset.disp_name
|
725 |
+
vis_out_dir = os.path.join(
|
726 |
+
self.out_dir_vis, self._get_backup_ckpt_name(), vis_dataset_name
|
727 |
+
)
|
728 |
+
os.makedirs(vis_out_dir, exist_ok=True)
|
729 |
+
_ = self.validate_single_dataset(
|
730 |
+
data_loader=val_loader,
|
731 |
+
metric_tracker=self.val_metrics,
|
732 |
+
save_to_dir=vis_out_dir,
|
733 |
+
)
|
734 |
+
|
735 |
+
@torch.no_grad()
|
736 |
+
def validate_single_dataset(
|
737 |
+
self,
|
738 |
+
data_loader: DataLoader,
|
739 |
+
metric_tracker: MetricTracker,
|
740 |
+
save_to_dir: str = None,
|
741 |
+
):
|
742 |
+
self.model.to(self.device)
|
743 |
+
metric_tracker.reset()
|
744 |
+
|
745 |
+
# Generate seed sequence for consistent evaluation
|
746 |
+
val_init_seed = self.cfg.validation.init_seed
|
747 |
+
val_seed_ls = generate_seed_sequence(val_init_seed, len(data_loader))
|
748 |
+
|
749 |
+
for i, batch in enumerate(
|
750 |
+
tqdm(data_loader, desc=f"evaluating on {data_loader.dataset.disp_name}"),
|
751 |
+
start=1,
|
752 |
+
):
|
753 |
+
|
754 |
+
rgb_int = batch["rgb_int"] # [3, H, W]
|
755 |
+
# GT depth
|
756 |
+
depth_raw_ts = batch["depth_raw_linear"].squeeze()
|
757 |
+
depth_raw = depth_raw_ts.cpu().numpy()
|
758 |
+
depth_raw_ts = depth_raw_ts.to(self.device)
|
759 |
+
valid_mask_ts = batch["valid_mask_raw"].squeeze()
|
760 |
+
valid_mask = valid_mask_ts.cpu().numpy()
|
761 |
+
valid_mask_ts = valid_mask_ts.to(self.device)
|
762 |
+
|
763 |
+
# Random number generator
|
764 |
+
seed = val_seed_ls.pop()
|
765 |
+
if seed is None:
|
766 |
+
generator = None
|
767 |
+
else:
|
768 |
+
generator = torch.Generator(device=self.device)
|
769 |
+
generator.manual_seed(seed)
|
770 |
+
|
771 |
+
# Predict depth
|
772 |
+
pipe_out: MarigoldDepthOutput = self.model.image2depth(
|
773 |
+
rgb_int,
|
774 |
+
denoising_steps=self.cfg.validation.denoising_steps,
|
775 |
+
ensemble_size=self.cfg.validation.ensemble_size,
|
776 |
+
processing_res=self.cfg.validation.processing_res,
|
777 |
+
match_input_res=self.cfg.validation.match_input_res,
|
778 |
+
generator=generator,
|
779 |
+
batch_size=self.cfg.validation.ensemble_size, # use batch size 1 to increase reproducibility
|
780 |
+
color_map=None,
|
781 |
+
show_progress_bar=False,
|
782 |
+
resample_method=self.cfg.validation.resample_method,
|
783 |
+
)
|
784 |
+
|
785 |
+
depth_pred: np.ndarray = pipe_out.depth_np
|
786 |
+
|
787 |
+
if "least_square" == self.cfg.eval.alignment:
|
788 |
+
depth_pred, scale, shift = align_depth_least_square(
|
789 |
+
gt_arr=depth_raw,
|
790 |
+
pred_arr=depth_pred,
|
791 |
+
valid_mask_arr=valid_mask,
|
792 |
+
return_scale_shift=True,
|
793 |
+
max_resolution=self.cfg.eval.align_max_res,
|
794 |
+
)
|
795 |
+
else:
|
796 |
+
raise RuntimeError(f"Unknown alignment type: {self.cfg.eval.alignment}")
|
797 |
+
|
798 |
+
# Clip to dataset min max
|
799 |
+
depth_pred = np.clip(
|
800 |
+
depth_pred,
|
801 |
+
a_min=data_loader.dataset.min_depth,
|
802 |
+
a_max=data_loader.dataset.max_depth,
|
803 |
+
)
|
804 |
+
|
805 |
+
# clip to d > 0 for evaluation
|
806 |
+
depth_pred = np.clip(depth_pred, a_min=1e-6, a_max=None)
|
807 |
+
|
808 |
+
# Evaluate
|
809 |
+
sample_metric = []
|
810 |
+
depth_pred_ts = torch.from_numpy(depth_pred).to(self.device)
|
811 |
+
|
812 |
+
for met_func in self.metric_funcs:
|
813 |
+
_metric_name = met_func.__name__
|
814 |
+
_metric = met_func(depth_pred_ts, depth_raw_ts, valid_mask_ts).cuda(self.accelerator.process_index)
|
815 |
+
self.accelerator.wait_for_everyone()
|
816 |
+
_metric = self.accelerator.gather_for_metrics(_metric.unsqueeze(0)).mean().item()
|
817 |
+
sample_metric.append(str(_metric))
|
818 |
+
metric_tracker.update(_metric_name, _metric)
|
819 |
+
|
820 |
+
self.accelerator.wait_for_everyone()
|
821 |
+
# Save as 16-bit uint png
|
822 |
+
if save_to_dir is not None:
|
823 |
+
img_name = batch["rgb_relative_path"][0].replace("/", "_")
|
824 |
+
png_save_path = os.path.join(save_to_dir, f"{img_name}.png")
|
825 |
+
depth_to_save = (pipe_out.depth_np * 65535.0).astype(np.uint16)
|
826 |
+
Image.fromarray(depth_to_save).save(png_save_path, mode="I;16")
|
827 |
+
|
828 |
+
return metric_tracker.result()
|
829 |
+
|
830 |
+
def _get_next_seed(self):
|
831 |
+
if 0 == len(self.global_seed_sequence):
|
832 |
+
self.global_seed_sequence = generate_seed_sequence(
|
833 |
+
initial_seed=self.seed,
|
834 |
+
length=self.max_iter * self.gradient_accumulation_steps,
|
835 |
+
)
|
836 |
+
logging.info(
|
837 |
+
f"Global seed sequence is generated, length={len(self.global_seed_sequence)}"
|
838 |
+
)
|
839 |
+
return self.global_seed_sequence.pop()
|
840 |
+
|
841 |
+
def save_miscs(self, ckpt_name):
|
842 |
+
ckpt_dir = os.path.join(self.out_dir_ckpt, ckpt_name)
|
843 |
+
state = {
|
844 |
+
"config": self.cfg,
|
845 |
+
"effective_iter": self.effective_iter,
|
846 |
+
"epoch": self.epoch,
|
847 |
+
"n_batch_in_epoch": self.n_batch_in_epoch,
|
848 |
+
"best_metric": self.best_metric,
|
849 |
+
"in_evaluation": self.in_evaluation,
|
850 |
+
"global_seed_sequence": self.global_seed_sequence,
|
851 |
+
}
|
852 |
+
train_state_path = os.path.join(ckpt_dir, "trainer.ckpt")
|
853 |
+
torch.save(state, train_state_path)
|
854 |
+
|
855 |
+
logging.info(f"Misc state is saved to: {train_state_path}")
|
856 |
+
|
857 |
+
def load_miscs(self, ckpt_path):
|
858 |
+
checkpoint = torch.load(os.path.join(ckpt_path, "trainer.ckpt"))
|
859 |
+
self.effective_iter = checkpoint["effective_iter"]
|
860 |
+
self.epoch = checkpoint["epoch"]
|
861 |
+
self.n_batch_in_epoch = checkpoint["n_batch_in_epoch"]
|
862 |
+
self.in_evaluation = checkpoint["in_evaluation"]
|
863 |
+
self.global_seed_sequence = checkpoint["global_seed_sequence"]
|
864 |
+
|
865 |
+
self.best_metric = checkpoint["best_metric"]
|
866 |
+
|
867 |
+
logging.info(f"Misc state is loaded from {ckpt_path}")
|
868 |
+
|
869 |
+
|
870 |
+
def save_checkpoint(self, ckpt_name, save_train_state):
|
871 |
+
ckpt_dir = os.path.join(self.out_dir_ckpt, ckpt_name)
|
872 |
+
logging.info(f"Saving checkpoint to: {ckpt_dir}")
|
873 |
+
# Backup previous checkpoint
|
874 |
+
temp_ckpt_dir = None
|
875 |
+
if os.path.exists(ckpt_dir) and os.path.isdir(ckpt_dir):
|
876 |
+
temp_ckpt_dir = os.path.join(
|
877 |
+
os.path.dirname(ckpt_dir), f"_old_{os.path.basename(ckpt_dir)}"
|
878 |
+
)
|
879 |
+
if os.path.exists(temp_ckpt_dir):
|
880 |
+
shutil.rmtree(temp_ckpt_dir, ignore_errors=True)
|
881 |
+
os.rename(ckpt_dir, temp_ckpt_dir)
|
882 |
+
logging.debug(f"Old checkpoint is backed up at: {temp_ckpt_dir}")
|
883 |
+
|
884 |
+
# Save UNet
|
885 |
+
unet_path = os.path.join(ckpt_dir, "unet")
|
886 |
+
self.model.unet.save_pretrained(unet_path, safe_serialization=False)
|
887 |
+
logging.info(f"UNet is saved to: {unet_path}")
|
888 |
+
|
889 |
+
if save_train_state:
|
890 |
+
state = {
|
891 |
+
"config": self.cfg,
|
892 |
+
"effective_iter": self.effective_iter,
|
893 |
+
"epoch": self.epoch,
|
894 |
+
"n_batch_in_epoch": self.n_batch_in_epoch,
|
895 |
+
"best_metric": self.best_metric,
|
896 |
+
"in_evaluation": self.in_evaluation,
|
897 |
+
"global_seed_sequence": self.global_seed_sequence,
|
898 |
+
}
|
899 |
+
train_state_path = os.path.join(ckpt_dir, "trainer.ckpt")
|
900 |
+
torch.save(state, train_state_path)
|
901 |
+
# iteration indicator
|
902 |
+
f = open(os.path.join(ckpt_dir, self._get_backup_ckpt_name()), "w")
|
903 |
+
f.close()
|
904 |
+
|
905 |
+
logging.info(f"Trainer state is saved to: {train_state_path}")
|
906 |
+
|
907 |
+
# Remove temp ckpt
|
908 |
+
if temp_ckpt_dir is not None and os.path.exists(temp_ckpt_dir):
|
909 |
+
shutil.rmtree(temp_ckpt_dir, ignore_errors=True)
|
910 |
+
logging.debug("Old checkpoint backup is removed.")
|
911 |
+
|
912 |
+
def load_checkpoint(
|
913 |
+
self, ckpt_path, load_trainer_state=True, resume_lr_scheduler=True
|
914 |
+
):
|
915 |
+
logging.info(f"Loading checkpoint from: {ckpt_path}")
|
916 |
+
# Load UNet
|
917 |
+
_model_path = os.path.join(ckpt_path, "unet", "diffusion_pytorch_model.bin")
|
918 |
+
self.model.unet.load_state_dict(
|
919 |
+
torch.load(_model_path, map_location=self.device)
|
920 |
+
)
|
921 |
+
self.model.unet.to(self.device)
|
922 |
+
logging.info(f"UNet parameters are loaded from {_model_path}")
|
923 |
+
|
924 |
+
# Load training states
|
925 |
+
if load_trainer_state:
|
926 |
+
checkpoint = torch.load(os.path.join(ckpt_path, "trainer.ckpt"))
|
927 |
+
self.effective_iter = checkpoint["effective_iter"]
|
928 |
+
self.epoch = checkpoint["epoch"]
|
929 |
+
self.n_batch_in_epoch = checkpoint["n_batch_in_epoch"]
|
930 |
+
self.in_evaluation = checkpoint["in_evaluation"]
|
931 |
+
self.global_seed_sequence = checkpoint["global_seed_sequence"]
|
932 |
+
|
933 |
+
self.best_metric = checkpoint["best_metric"]
|
934 |
+
|
935 |
+
self.optimizer.load_state_dict(checkpoint["optimizer"])
|
936 |
+
logging.info(f"optimizer state is loaded from {ckpt_path}")
|
937 |
+
|
938 |
+
if resume_lr_scheduler:
|
939 |
+
self.lr_scheduler.load_state_dict(checkpoint["lr_scheduler"])
|
940 |
+
logging.info(f"LR scheduler state is loaded from {ckpt_path}")
|
941 |
+
|
942 |
+
logging.info(
|
943 |
+
f"Checkpoint loaded from: {ckpt_path}. Resume from iteration {self.effective_iter} (epoch {self.epoch})"
|
944 |
+
)
|
945 |
+
return
|
946 |
+
|
947 |
+
def _get_backup_ckpt_name(self):
|
948 |
+
return f"iter_{self.effective_iter:06d}"
|
src/util/__pycache__/alignment.cpython-310.pyc
ADDED
Binary file (1.63 kB). View file
|
|
src/util/__pycache__/config_util.cpython-310.pyc
ADDED
Binary file (1.21 kB). View file
|
|
src/util/__pycache__/data_loader.cpython-310.pyc
ADDED
Binary file (3.41 kB). View file
|
|
src/util/__pycache__/depth_transform.cpython-310.pyc
ADDED
Binary file (3.03 kB). View file
|
|
src/util/__pycache__/logging_util.cpython-310.pyc
ADDED
Binary file (3.25 kB). View file
|
|
src/util/__pycache__/loss.cpython-310.pyc
ADDED
Binary file (3.87 kB). View file
|
|
src/util/__pycache__/lr_scheduler.cpython-310.pyc
ADDED
Binary file (1.61 kB). View file
|
|
src/util/__pycache__/metric.cpython-310.pyc
ADDED
Binary file (4.47 kB). View file
|
|
src/util/__pycache__/multi_res_noise.cpython-310.pyc
ADDED
Binary file (1.52 kB). View file
|
|
src/util/__pycache__/seeding.cpython-310.pyc
ADDED
Binary file (937 Bytes). View file
|
|
src/util/__pycache__/slurm_util.cpython-310.pyc
ADDED
Binary file (483 Bytes). View file
|
|
src/util/alignment.py
ADDED
@@ -0,0 +1,72 @@
1 |
+
# Author: Bingxin Ke
|
2 |
+
# Last modified: 2024-01-11
|
3 |
+
|
4 |
+
import numpy as np
|
5 |
+
import torch
|
6 |
+
|
7 |
+
|
8 |
+
def align_depth_least_square(
|
9 |
+
gt_arr: np.ndarray,
|
10 |
+
pred_arr: np.ndarray,
|
11 |
+
valid_mask_arr: np.ndarray,
|
12 |
+
return_scale_shift=True,
|
13 |
+
max_resolution=None,
|
14 |
+
):
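# Least-squares alignment: solve min over (s, t) of || s * pred + t - gt ||^2 on valid pixels,
# i.e. the affine-invariant depth evaluation protocol.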
|
15 |
+
ori_shape = pred_arr.shape # input shape
|
16 |
+
|
17 |
+
gt = gt_arr.squeeze() # [H, W]
|
18 |
+
pred = pred_arr.squeeze()
|
19 |
+
valid_mask = valid_mask_arr.squeeze()
|
20 |
+
|
21 |
+
# Downsample
|
22 |
+
if max_resolution is not None:
|
23 |
+
scale_factor = np.min(max_resolution / np.array(ori_shape[-2:]))
|
24 |
+
if scale_factor < 1:
|
25 |
+
downscaler = torch.nn.Upsample(scale_factor=scale_factor, mode="nearest")
|
26 |
+
gt = downscaler(torch.as_tensor(gt).unsqueeze(0)).numpy()
|
27 |
+
pred = downscaler(torch.as_tensor(pred).unsqueeze(0)).numpy()
|
28 |
+
valid_mask = (
|
29 |
+
downscaler(torch.as_tensor(valid_mask).unsqueeze(0).float())
|
30 |
+
.bool()
|
31 |
+
.numpy()
|
32 |
+
)
|
33 |
+
|
34 |
+
assert (
|
35 |
+
gt.shape == pred.shape == valid_mask.shape
|
36 |
+
), f"{gt.shape}, {pred.shape}, {valid_mask.shape}"
|
37 |
+
|
38 |
+
gt_masked = gt[valid_mask].reshape((-1, 1))
|
39 |
+
pred_masked = pred[valid_mask].reshape((-1, 1))
|
40 |
+
|
41 |
+
# numpy solver
|
42 |
+
_ones = np.ones_like(pred_masked)
|
43 |
+
A = np.concatenate([pred_masked, _ones], axis=-1)
|
44 |
+
X = np.linalg.lstsq(A, gt_masked, rcond=None)[0]
|
45 |
+
scale, shift = X
|
46 |
+
|
47 |
+
aligned_pred = pred_arr * scale + shift
|
48 |
+
|
49 |
+
# restore dimensions
|
50 |
+
aligned_pred = aligned_pred.reshape(ori_shape)
|
51 |
+
|
52 |
+
if return_scale_shift:
|
53 |
+
return aligned_pred, scale, shift
|
54 |
+
else:
|
55 |
+
return aligned_pred
|
56 |
+
|
57 |
+
|
58 |
+
# ******************** disparity space ********************
|
59 |
+
def depth2disparity(depth, return_mask=False):
|
60 |
+
if isinstance(depth, torch.Tensor):
|
61 |
+
disparity = torch.zeros_like(depth)
|
62 |
+
elif isinstance(depth, np.ndarray):
|
63 |
+
disparity = np.zeros_like(depth)
|
64 |
+
non_negative_mask = depth > 0
|
65 |
+
disparity[non_negative_mask] = 1.0 / depth[non_negative_mask]
|
66 |
+
if return_mask:
|
67 |
+
return disparity, non_negative_mask
|
68 |
+
else:
|
69 |
+
return disparity
|
70 |
+
|
71 |
+
def disparity2depth(disparity, **kwargs):
|
72 |
+
return depth2disparity(disparity, **kwargs)
|
src/util/config_util.py
ADDED
@@ -0,0 +1,49 @@
1 |
+
# Author: Bingxin Ke
|
2 |
+
# Last modified: 2024-02-14
|
3 |
+
|
4 |
+
import omegaconf
|
5 |
+
from omegaconf import OmegaConf
|
6 |
+
|
7 |
+
|
8 |
+
def recursive_load_config(config_path: str) -> OmegaConf:
|
9 |
+
conf = OmegaConf.load(config_path)
|
10 |
+
|
11 |
+
output_conf = OmegaConf.create({})
|
12 |
+
|
13 |
+
# Load base config. Later configs on the list will overwrite previous
|
14 |
+
base_configs = conf.get("base_config", default_value=None)
|
15 |
+
if base_configs is not None:
|
16 |
+
assert isinstance(base_configs, omegaconf.listconfig.ListConfig)
|
17 |
+
for _path in base_configs:
|
18 |
+
assert (
|
19 |
+
_path != config_path
|
20 |
+
), "Circulate merging, base_config should not include itself."
|
21 |
+
_base_conf = recursive_load_config(_path)
|
22 |
+
output_conf = OmegaConf.merge(output_conf, _base_conf)
|
23 |
+
|
24 |
+
# Merge configs and overwrite values
|
25 |
+
output_conf = OmegaConf.merge(output_conf, conf)
|
26 |
+
|
27 |
+
return output_conf
|
28 |
+
|
29 |
+
|
30 |
+
def find_value_in_omegaconf(search_key, config):
|
31 |
+
result_list = []
|
32 |
+
|
33 |
+
if isinstance(config, omegaconf.DictConfig):
|
34 |
+
for key, value in config.items():
|
35 |
+
if key == search_key:
|
36 |
+
result_list.append(value)
|
37 |
+
elif isinstance(value, (omegaconf.DictConfig, omegaconf.ListConfig)):
|
38 |
+
result_list.extend(find_value_in_omegaconf(search_key, value))
|
39 |
+
elif isinstance(config, omegaconf.ListConfig):
|
40 |
+
for item in config:
|
41 |
+
if isinstance(item, (omegaconf.DictConfig, omegaconf.ListConfig)):
|
42 |
+
result_list.extend(find_value_in_omegaconf(search_key, item))
|
43 |
+
|
44 |
+
return result_list
|
45 |
+
|
46 |
+
|
47 |
+
if "__main__" == __name__:
|
48 |
+
conf = recursive_load_config("config/train_base.yaml")
|
49 |
+
print(OmegaConf.to_yaml(conf))
|