diff --git a/common/config.yaml b/common/config.yaml
index 0d0590491f883990c56af0e526880d65b45f88d8..1fce1f14162d15d4195c13380c54055dca99ca1a 100644
--- a/common/config.yaml
+++ b/common/config.yaml
@@ -27,6 +27,17 @@ matcher_zoo:
paper: https://arxiv.org/abs/2405.12979
project: https://hwjiang1510.github.io/OmniGlue
display: true
+ Mast3R:
+ enable: false
+ matcher: mast3r
+ dense: true
+ info:
+ name: Mast3R #dispaly name
+ source: "CVPR 2024"
+ github: https://github.com/naver/mast3r
+ paper: https://arxiv.org/abs/2312.14132
+ project: https://dust3r.europe.naverlabs.com
+ display: true
DUSt3R:
# TODO: duster is under development
enable: true
diff --git a/hloc/match_dense.py b/hloc/match_dense.py
index ac95d937523adb3e1bd28b8a301517bda35028a6..a4a4a30aedc75f8f16444cb9729724cfe3cd7d44 100644
--- a/hloc/match_dense.py
+++ b/hloc/match_dense.py
@@ -144,6 +144,20 @@ confs = {
"dfactor": 16,
},
},
+ "mast3r": {
+ "output": "matches-mast3r",
+ "model": {
+ "name": "mast3r",
+ "weights": "vit_large",
+ "max_keypoints": 2000,
+ "match_threshold": 0.2,
+ },
+ "preprocessing": {
+ "grayscale": False,
+ "resize_max": 512,
+ "dfactor": 16,
+ },
+ },
"xfeat_dense": {
"output": "matches-xfeat_dense",
"model": {
diff --git a/hloc/matchers/mast3r.py b/hloc/matchers/mast3r.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d673a672df330e8a8513152ec1a2982353becb6
--- /dev/null
+++ b/hloc/matchers/mast3r.py
@@ -0,0 +1,113 @@
+import os
+import sys
+import urllib.request
+from pathlib import Path
+
+import numpy as np
+import torch
+import torchvision.transforms as tfm
+
+from .. import logger
+from ..utils.base_model import BaseModel
+
+mast3r_path = Path(__file__).parent / "../../third_party/mast3r"
+sys.path.append(str(mast3r_path))
+
+dust3r_path = Path(__file__).parent / "../../third_party/dust3r"
+sys.path.append(str(dust3r_path))
+
+from mast3r.model import AsymmetricMASt3R
+from mast3r.fast_nn import fast_reciprocal_NNs
+
+from dust3r.image_pairs import make_pairs
+from dust3r.inference import inference
+from dust3r.utils.image import load_images
+from hloc.matchers.duster import Duster
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+class Mast3r(Duster):
+ default_conf = {
+ "name": "Mast3r",
+ "model_path": mast3r_path
+ / "model_weights/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric.pth",
+ "max_keypoints": 2000,
+ "vit_patch_size": 16,
+ }
+
+ def _init(self, conf):
+ self.normalize = tfm.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
+ self.model_path = self.conf["model_path"]
+ self.download_weights()
+ self.net = AsymmetricMASt3R.from_pretrained(self.model_path).to(device)
+ logger.info("Loaded Mast3r model")
+
+ def download_weights(self):
+ url = "https://download.europe.naverlabs.com/ComputerVision/MASt3R/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric.pth"
+
+ self.model_path.parent.mkdir(parents=True, exist_ok=True)
+ if not os.path.isfile(self.model_path):
+ logger.info("Downloading Mast3r(ViT large)... (takes a while)")
+ urllib.request.urlretrieve(url, self.model_path)
+ logger.info("Downloading Mast3r(ViT large)... done!")
+
+ def _forward(self, data):
+ img0, img1 = data["image0"], data["image1"]
+ mean = torch.tensor([0.5, 0.5, 0.5]).to(device)
+ std = torch.tensor([0.5, 0.5, 0.5]).to(device)
+
+ img0 = (img0 - mean.view(1, 3, 1, 1)) / std.view(1, 3, 1, 1)
+ img1 = (img1 - mean.view(1, 3, 1, 1)) / std.view(1, 3, 1, 1)
+
+ images = [
+ {"img": img0, "idx": 0, "instance": 0},
+ {"img": img1, "idx": 1, "instance": 1},
+ ]
+ pairs = make_pairs(
+ images, scene_graph="complete", prefilter=None, symmetrize=True
+ )
+ output = inference(pairs, self.net, device, batch_size=1)
+
+ # at this stage, you have the raw dust3r predictions
+ view1, pred1 = output["view1"], output["pred1"]
+ view2, pred2 = output["view2"], output["pred2"]
+
+ desc1, desc2 = (
+ pred1["desc"][1].squeeze(0).detach(),
+ pred2["desc"][1].squeeze(0).detach(),
+ )
+
+ # find 2D-2D matches between the two images
+ matches_im0, matches_im1 = fast_reciprocal_NNs(
+ desc1,
+ desc2,
+ subsample_or_initxy1=2,
+ device=device,
+ dist="dot",
+ block_size=2**13,
+ )
+
+ mkpts0 = matches_im0.copy()
+ mkpts1 = matches_im1.copy()
+
+ if len(mkpts0) == 0:
+ pred = {
+ "keypoints0": torch.zeros([0, 2]),
+ "keypoints1": torch.zeros([0, 2]),
+ }
+ logger.warning(f"Matched {0} points")
+ else:
+
+ top_k = self.conf["max_keypoints"]
+ if top_k is not None and len(mkpts0) > top_k:
+ keep = np.round(np.linspace(0, len(mkpts0) - 1, top_k)).astype(
+ int
+ )
+ mkpts0 = mkpts0[keep]
+ mkpts1 = mkpts1[keep]
+ pred = {
+ "keypoints0": torch.from_numpy(mkpts0),
+ "keypoints1": torch.from_numpy(mkpts1),
+ }
+ return pred
diff --git a/third_party/mast3r/.gitignore b/third_party/mast3r/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..b6e47617de110dea7ca47e087ff1347cc2646eda
--- /dev/null
+++ b/third_party/mast3r/.gitignore
@@ -0,0 +1,129 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
diff --git a/third_party/mast3r/.gitmodules b/third_party/mast3r/.gitmodules
new file mode 100644
index 0000000000000000000000000000000000000000..d544dca780e738ef30f618ce3cab4810bad806e3
--- /dev/null
+++ b/third_party/mast3r/.gitmodules
@@ -0,0 +1,4 @@
+[submodule "dust3r"]
+ path = dust3r
+ url = https://github.com/naver/dust3r
+ branch = cvpr
diff --git a/third_party/mast3r/CHECKPOINTS_NOTICE b/third_party/mast3r/CHECKPOINTS_NOTICE
new file mode 100644
index 0000000000000000000000000000000000000000..040aed77ec78156cb6c1af8da1652e13ade10bcd
--- /dev/null
+++ b/third_party/mast3r/CHECKPOINTS_NOTICE
@@ -0,0 +1,1376 @@
+MASt3R
+Copyright 2024-present NAVER Corp.
+
+This project's checkpoints were trained on datasets with separate license terms.
+Your use of theses checkpoints is subject to the terms and conditions of the following licenses.
+
+===
+pretrained model:
+DUSt3R: DUSt3R_ViTLarge_BaseDecoder_512_dpt
+https://github.com/naver/dust3r
+
+In particular, from the croco training set:
+
+3D_Street_View
+https://github.com/amir32002/3D_Street_View/blob/master/LICENSE
+This dataset is made freely available to academic and non-academic entities for non-commercial purposes such as academic research, teaching, scientific publications, or personal experimentation. Permission is granted to use the data given that you agree:
+
+1. That the dataset comes "AS IS", without express or implied warranty. Although every effort has been made to ensure accuracy, we do not accept any responsibility for errors or omissions.
+
+2. That you include a reference to the Dataset in any work that makes use of the dataset. For research papers, cite our publication as listed on our website.
+
+3. That you do not distribute this dataset or modified versions. It is permissible to distribute derivative works in as far as they are abstract representations of this dataset (such as models trained on it or additional annotations that do not directly include any of our data) and do not allow to recover the dataset or something similar in character.
+
+4. That you may not use the dataset or any derivative work for commercial purposes as, for example, licensing or selling the data, or using the data with a purpose to procure a commercial gain.
+That all rights not expressly granted to you are reserved by us.
+
+In addition, using the dataset is subject to the following standard terms:
+
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+Indoor Visual Localization datasets (IndoorVL)
+https://challenge.naverlabs.com/kapture/GangnamStation_LICENSE.txt
+https://challenge.naverlabs.com/kapture/HyundaiDepartmentStore_LICENSE.txt
+
+LICENSE.txt
+Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 (modified ver.)
+International Public License
+
+By exercising the Licensed Rights (defined below), You accept and agree
+to be bound by the terms and conditions of this Creative Commons
+Attribution-NonCommercial-NoDerivatives 4.0 International Public
+License ("Public License"). To the extent this Public License may be
+interpreted as a contract, You are granted the Licensed Rights in
+consideration of Your acceptance of these terms and conditions, and the
+Licensor grants You such rights in consideration of benefits the
+Licensor receives from making the Licensed Material available under
+these terms and conditions.
+
+
+Section 1 -- Definitions.
+
+ a. Adapted Material means material subject to Copyright and Similar
+ Rights that is derived from or based upon the Licensed Material
+ and in which the Licensed Material is translated, altered,
+ arranged, transformed, or otherwise modified in a manner requiring
+ permission under the Copyright and Similar Rights held by the
+ Licensor. For purposes of this Public License, where the Licensed
+ Material is a musical work, performance, or sound recording,
+ Adapted Material is always produced where the Licensed Material is
+ synched in timed relation with a moving image.
+
+ b. Copyright and Similar Rights means copyright and/or similar rights
+ closely related to copyright including, without limitation,
+ performance, broadcast, sound recording, and Sui Generis Database
+ Rights, without regard to how the rights are labeled or
+ categorized. For purposes of this Public License, the rights
+ specified in Section 2(b)(1)-(2) are not Copyright and Similar
+ Rights.
+
+ c. Effective Technological Measures means those measures that, in the
+ absence of proper authority, may not be circumvented under laws
+ fulfilling obligations under Article 11 of the WIPO Copyright
+ Treaty adopted on December 20, 1996, and/or similar international
+ agreements.
+
+ d. Exceptions and Limitations means fair use, fair dealing, and/or
+ any other exception or limitation to Copyright and Similar Rights
+ that applies to Your use of the Licensed Material.
+
+ e. Licensed Material means the artistic or literary work, database,
+ or other material to which the Licensor applied this Public
+ License.
+
+ f. Licensed Rights means the rights granted to You subject to the
+ terms and conditions of this Public License, which are limited to
+ all Copyright and Similar Rights that apply to Your use of the
+ Licensed Material and that the Licensor has authority to license.
+
+ g. Licensor means the individual(s) or entity(ies) granting rights
+ under this Public License.
+
+ h. NonCommercial means not primarily intended for or directed towards
+ commercial advantage or monetary compensation. For purposes of
+ this Public License, the exchange of the Licensed Material for
+ other material subject to Copyright and Similar Rights by digital
+ file-sharing or similar means is NonCommercial provided there is
+ no payment of monetary compensation in connection with the
+ exchange.
+
+ i. Share means to provide material to the public by any means or
+ process that requires permission under the Licensed Rights, such
+ as reproduction, public display, public performance, distribution,
+ dissemination, communication, or importation, and to make material
+ available to the public including in ways that members of the
+ public may access the material from a place and at a time
+ individually chosen by them.
+
+ j. Sui Generis Database Rights means rights other than copyright
+ resulting from Directive 96/9/EC of the European Parliament and of
+ the Council of 11 March 1996 on the legal protection of databases,
+ as amended and/or succeeded, as well as other essentially
+ equivalent rights anywhere in the world.
+
+ k. You means the individual or entity exercising the Licensed Rights
+ under this Public License. Your has a corresponding meaning.
+
+ l. Research purpose means to publish research achievements in a research paper
+
+
+Section 2 -- Scope.
+
+ a. License grant.
+
+ 1. Subject to the terms and conditions of this Public License,
+ the Licensor hereby grants You a worldwide, royalty-free,
+ non-sublicensable, non-exclusive, irrevocable license to
+ exercise the Licensed Rights in the Licensed Material to:
+
+ a. reproduce and Share the Licensed Material, in whole or
+ in part, for NonCommercial purposes only; and
+
+ b. produce and reproduce, but not Share, Adapted Material
+ for NonCommercial purposes only.
+
+ c. reproduce and share the Adapted Matrerial, in part,
+ for Research purposes only.
+
+ 2. Exceptions and Limitations. For the avoidance of doubt, where
+ Exceptions and Limitations apply to Your use, this Public
+ License does not apply, and You do not need to comply with
+ its terms and conditions.
+
+ 3. Term. The term of this Public License is specified in Section
+ 6(a).
+
+ 4. Media and formats; technical modifications allowed. The
+ Licensor authorizes You to exercise the Licensed Rights in
+ all media and formats whether now known or hereafter created,
+ and to make technical modifications necessary to do so. The
+ Licensor waives and/or agrees not to assert any right or
+ authority to forbid You from making technical modifications
+ necessary to exercise the Licensed Rights, including
+ technical modifications necessary to circumvent Effective
+ Technological Measures. For purposes of this Public License,
+ simply making modifications authorized by this Section 2(a)
+ (4) never produces Adapted Material.
+
+ 5. Downstream recipients.
+
+ a. Offer from the Licensor -- Licensed Material. Every
+ recipient of the Licensed Material automatically
+ receives an offer from the Licensor to exercise the
+ Licensed Rights under the terms and conditions of this
+ Public License.
+
+ b. No downstream restrictions. You may not offer or impose
+ any additional or different terms or conditions on, or
+ apply any Effective Technological Measures to, the
+ Licensed Material if doing so restricts exercise of the
+ Licensed Rights by any recipient of the Licensed
+ Material.
+
+ 6. No endorsement. Nothing in this Public License constitutes or
+ may be construed as permission to assert or imply that You
+ are, or that Your use of the Licensed Material is, connected
+ with, or sponsored, endorsed, or granted official status by,
+ the Licensor or others designated to receive attribution as
+ provided in Section 3(a)(1)(A)(i).
+
+ b. Other rights.
+
+ 1. Moral rights, such as the right of integrity, are not
+ licensed under this Public License, nor are publicity,
+ privacy, and/or other similar personality rights; however, to
+ the extent possible, the Licensor waives and/or agrees not to
+ assert any such rights held by the Licensor to the limited
+ extent necessary to allow You to exercise the Licensed
+ Rights, but not otherwise.
+
+ 2. Patent and trademark rights are not licensed under this
+ Public License.
+
+ 3. To the extent possible, the Licensor waives any right to
+ collect royalties from You for the exercise of the Licensed
+ Rights, whether directly or through a collecting society
+ under any voluntary or waivable statutory or compulsory
+ licensing scheme. In all other cases the Licensor expressly
+ reserves any right to collect such royalties, including when
+ the Licensed Material is used other than for NonCommercial
+ purposes.
+
+
+Section 3 -- License Conditions.
+
+Your exercise of the Licensed Rights is expressly made subject to the
+following conditions.
+
+ a. Attribution.
+
+ 1. If You Share the Licensed Material(including in a research paper),
+ You must:
+
+ a. retain the following if it is supplied by the Licensor
+ with the Licensed Material:
+
+ i. identification of the creator(s) of the Licensed
+ Material and any others designated to receive
+ attribution, in any reasonable manner requested by
+ the Licensor (including by pseudonym if
+ designated);
+
+ ii. a copyright notice;
+
+ iii. a notice that refers to this Public License;
+
+ iv. a notice that refers to the disclaimer of
+ warranties;
+
+ v. a URI or hyperlink to the Licensed Material to the
+ extent reasonably practicable;
+
+ b. indicate if You modified the Licensed Material and
+ retain an indication of any previous modifications; and
+
+ c. indicate the Licensed Material is licensed under this
+ Public License, and include the text of, or the URI or
+ hyperlink to, this Public License.
+
+ For the avoidance of doubt, You do not have permission under
+ this Public License to Share Adapted Material.
+
+ 2. You may satisfy the conditions in Section 3(a)(1) in any
+ reasonable manner based on the medium, means, and context in
+ which You Share the Licensed Material. For example, it may be
+ reasonable to satisfy the conditions by providing a URI or
+ hyperlink to a resource that includes the required
+ information.
+
+ 3. If requested by the Licensor, You must remove any of the
+ information required by Section 3(a)(1)(A) to the extent
+ reasonably practicable.
+
+
+Section 4 -- Sui Generis Database Rights.
+
+Where the Licensed Rights include Sui Generis Database Rights that
+apply to Your use of the Licensed Material:
+
+ a. for the avoidance of doubt, Section 2(a)(1) grants You the right
+ to extract, reuse, reproduce, and Share all or a substantial
+ portion of the contents of the database for NonCommercial purposes
+ only and provided You do not Share Adapted Material;
+
+ b. if You include all or a substantial portion of the database
+ contents in a database in which You have Sui Generis Database
+ Rights, then the database in which You have Sui Generis Database
+ Rights (but not its individual contents) is Adapted Material; and
+
+ c. You must comply with the conditions in Section 3(a) if You Share
+ all or a substantial portion of the contents of the database.
+
+For the avoidance of doubt, this Section 4 supplements and does not
+replace Your obligations under this Public License where the Licensed
+Rights include other Copyright and Similar Rights.
+
+
+Section 5 -- Disclaimer of Warranties and Limitation of Liability.
+
+ a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
+ EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
+ AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
+ ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
+ IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
+ WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
+ PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
+ ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
+ KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
+ ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
+
+ b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
+ TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
+ NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
+ INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
+ COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
+ USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
+ ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
+ DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
+ IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
+
+ c. The disclaimer of warranties and limitation of liability provided
+ above shall be interpreted in a manner that, to the extent
+ possible, most closely approximates an absolute disclaimer and
+ waiver of all liability.
+
+
+Section 6 -- Term and Termination.
+
+ a. This Public License applies for the term of the Copyright and
+ Similar Rights licensed here. However, if You fail to comply with
+ this Public License, then Your rights under this Public License
+ terminate automatically.
+
+ b. Where Your right to use the Licensed Material has terminated under
+ Section 6(a), it reinstates:
+
+ 1. automatically as of the date the violation is cured, provided
+ it is cured within 30 days of Your discovery of the
+ violation; or
+
+ 2. upon express reinstatement by the Licensor.
+
+ For the avoidance of doubt, this Section 6(b) does not affect any
+ right the Licensor may have to seek remedies for Your violations
+ of this Public License.
+
+ c. For the avoidance of doubt, the Licensor may also offer the
+ Licensed Material under separate terms or conditions or stop
+ distributing the Licensed Material at any time; however, doing so
+ will not terminate this Public License.
+
+ d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
+ License.
+
+
+Section 7 -- Other Terms and Conditions.
+
+ a. The Licensor shall not be bound by any additional or different
+ terms or conditions communicated by You unless expressly agreed.
+
+ b. Any arrangements, understandings, or agreements regarding the
+ Licensed Material not stated herein are separate from and
+ independent of the terms and conditions of this Public License.
+
+
+Section 8 -- Interpretation.
+
+ a. For the avoidance of doubt, this Public License does not, and
+ shall not be interpreted to, reduce, limit, restrict, or impose
+ conditions on any use of the Licensed Material that could lawfully
+ be made without permission under this Public License.
+
+ b. To the extent possible, if any provision of this Public License is
+ deemed unenforceable, it shall be automatically reformed to the
+ minimum extent necessary to make it enforceable. If the provision
+ cannot be reformed, it shall be severed from this Public License
+ without affecting the enforceability of the remaining terms and
+ conditions.
+
+ c. No term or condition of this Public License will be waived and no
+ failure to comply consented to unless expressly agreed to by the
+ Licensor.
+
+ d. Nothing in this Public License constitutes or may be interpreted
+ as a limitation upon, or waiver of, any privileges and immunities
+ that apply to the Licensor or You, including from the legal
+ processes of any jurisdiction or authority.
+
+===
+CO3Dv2
+
+Creative Commons Attribution-NonCommercial 4.0 International Public
+License
+
+By exercising the Licensed Rights (defined below), You accept and agree
+to be bound by the terms and conditions of this Creative Commons
+Attribution-NonCommercial 4.0 International Public License ("Public
+License"). To the extent this Public License may be interpreted as a
+contract, You are granted the Licensed Rights in consideration of Your
+acceptance of these terms and conditions, and the Licensor grants You
+such rights in consideration of benefits the Licensor receives from
+making the Licensed Material available under these terms and
+conditions.
+
+Section 1 -- Definitions.
+
+ a. Adapted Material means material subject to Copyright and Similar
+ Rights that is derived from or based upon the Licensed Material
+ and in which the Licensed Material is translated, altered,
+ arranged, transformed, or otherwise modified in a manner requiring
+ permission under the Copyright and Similar Rights held by the
+ Licensor. For purposes of this Public License, where the Licensed
+ Material is a musical work, performance, or sound recording,
+ Adapted Material is always produced where the Licensed Material is
+ synched in timed relation with a moving image.
+
+ b. Adapter's License means the license You apply to Your Copyright
+ and Similar Rights in Your contributions to Adapted Material in
+ accordance with the terms and conditions of this Public License.
+
+ c. Copyright and Similar Rights means copyright and/or similar rights
+ closely related to copyright including, without limitation,
+ performance, broadcast, sound recording, and Sui Generis Database
+ Rights, without regard to how the rights are labeled or
+ categorized. For purposes of this Public License, the rights
+ specified in Section 2(b)(1)-(2) are not Copyright and Similar
+ Rights.
+ d. Effective Technological Measures means those measures that, in the
+ absence of proper authority, may not be circumvented under laws
+ fulfilling obligations under Article 11 of the WIPO Copyright
+ Treaty adopted on December 20, 1996, and/or similar international
+ agreements.
+
+ e. Exceptions and Limitations means fair use, fair dealing, and/or
+ any other exception or limitation to Copyright and Similar Rights
+ that applies to Your use of the Licensed Material.
+
+ f. Licensed Material means the artistic or literary work, database,
+ or other material to which the Licensor applied this Public
+ License.
+
+ g. Licensed Rights means the rights granted to You subject to the
+ terms and conditions of this Public License, which are limited to
+ all Copyright and Similar Rights that apply to Your use of the
+ Licensed Material and that the Licensor has authority to license.
+
+ h. Licensor means the individual(s) or entity(ies) granting rights
+ under this Public License.
+
+ i. NonCommercial means not primarily intended for or directed towards
+ commercial advantage or monetary compensation. For purposes of
+ this Public License, the exchange of the Licensed Material for
+ other material subject to Copyright and Similar Rights by digital
+ file-sharing or similar means is NonCommercial provided there is
+ no payment of monetary compensation in connection with the
+ exchange.
+
+ j. Share means to provide material to the public by any means or
+ process that requires permission under the Licensed Rights, such
+ as reproduction, public display, public performance, distribution,
+ dissemination, communication, or importation, and to make material
+ available to the public including in ways that members of the
+ public may access the material from a place and at a time
+ individually chosen by them.
+
+ k. Sui Generis Database Rights means rights other than copyright
+ resulting from Directive 96/9/EC of the European Parliament and of
+ the Council of 11 March 1996 on the legal protection of databases,
+ as amended and/or succeeded, as well as other essentially
+ equivalent rights anywhere in the world.
+
+ l. You means the individual or entity exercising the Licensed Rights
+ under this Public License. Your has a corresponding meaning.
+
+Section 2 -- Scope.
+
+ a. License grant.
+
+ 1. Subject to the terms and conditions of this Public License,
+ the Licensor hereby grants You a worldwide, royalty-free,
+ non-sublicensable, non-exclusive, irrevocable license to
+ exercise the Licensed Rights in the Licensed Material to:
+
+ a. reproduce and Share the Licensed Material, in whole or
+ in part, for NonCommercial purposes only; and
+
+ b. produce, reproduce, and Share Adapted Material for
+ NonCommercial purposes only.
+
+ 2. Exceptions and Limitations. For the avoidance of doubt, where
+ Exceptions and Limitations apply to Your use, this Public
+ License does not apply, and You do not need to comply with
+ its terms and conditions.
+
+ 3. Term. The term of this Public License is specified in Section
+ 6(a).
+
+ 4. Media and formats; technical modifications allowed. The
+ Licensor authorizes You to exercise the Licensed Rights in
+ all media and formats whether now known or hereafter created,
+ and to make technical modifications necessary to do so. The
+ Licensor waives and/or agrees not to assert any right or
+ authority to forbid You from making technical modifications
+ necessary to exercise the Licensed Rights, including
+ technical modifications necessary to circumvent Effective
+ Technological Measures. For purposes of this Public License,
+ simply making modifications authorized by this Section 2(a)
+ (4) never produces Adapted Material.
+
+ 5. Downstream recipients.
+
+ a. Offer from the Licensor -- Licensed Material. Every
+ recipient of the Licensed Material automatically
+ receives an offer from the Licensor to exercise the
+ Licensed Rights under the terms and conditions of this
+ Public License.
+
+ b. No downstream restrictions. You may not offer or impose
+ any additional or different terms or conditions on, or
+ apply any Effective Technological Measures to, the
+ Licensed Material if doing so restricts exercise of the
+ Licensed Rights by any recipient of the Licensed
+ Material.
+
+ 6. No endorsement. Nothing in this Public License constitutes or
+ may be construed as permission to assert or imply that You
+ are, or that Your use of the Licensed Material is, connected
+ with, or sponsored, endorsed, or granted official status by,
+ the Licensor or others designated to receive attribution as
+ provided in Section 3(a)(1)(A)(i).
+
+ b. Other rights.
+
+ 1. Moral rights, such as the right of integrity, are not
+ licensed under this Public License, nor are publicity,
+ privacy, and/or other similar personality rights; however, to
+ the extent possible, the Licensor waives and/or agrees not to
+ assert any such rights held by the Licensor to the limited
+ extent necessary to allow You to exercise the Licensed
+ Rights, but not otherwise.
+
+ 2. Patent and trademark rights are not licensed under this
+ Public License.
+
+ 3. To the extent possible, the Licensor waives any right to
+ collect royalties from You for the exercise of the Licensed
+ Rights, whether directly or through a collecting society
+ under any voluntary or waivable statutory or compulsory
+ licensing scheme. In all other cases the Licensor expressly
+ reserves any right to collect such royalties, including when
+ the Licensed Material is used other than for NonCommercial
+ purposes.
+
+Section 3 -- License Conditions.
+
+Your exercise of the Licensed Rights is expressly made subject to the
+following conditions.
+
+ a. Attribution.
+
+ 1. If You Share the Licensed Material (including in modified
+ form), You must:
+
+ a. retain the following if it is supplied by the Licensor
+ with the Licensed Material:
+
+ i. identification of the creator(s) of the Licensed
+ Material and any others designated to receive
+ attribution, in any reasonable manner requested by
+ the Licensor (including by pseudonym if
+ designated);
+
+ ii. a copyright notice;
+
+ iii. a notice that refers to this Public License;
+
+ iv. a notice that refers to the disclaimer of
+ warranties;
+
+ v. a URI or hyperlink to the Licensed Material to the
+ extent reasonably practicable;
+
+ b. indicate if You modified the Licensed Material and
+ retain an indication of any previous modifications; and
+
+ c. indicate the Licensed Material is licensed under this
+ Public License, and include the text of, or the URI or
+ hyperlink to, this Public License.
+
+ 2. You may satisfy the conditions in Section 3(a)(1) in any
+ reasonable manner based on the medium, means, and context in
+ which You Share the Licensed Material. For example, it may be
+ reasonable to satisfy the conditions by providing a URI or
+ hyperlink to a resource that includes the required
+ information.
+
+ 3. If requested by the Licensor, You must remove any of the
+ information required by Section 3(a)(1)(A) to the extent
+ reasonably practicable.
+
+ 4. If You Share Adapted Material You produce, the Adapter's
+ License You apply must not prevent recipients of the Adapted
+ Material from complying with this Public License.
+
+Section 4 -- Sui Generis Database Rights.
+
+Where the Licensed Rights include Sui Generis Database Rights that
+apply to Your use of the Licensed Material:
+
+ a. for the avoidance of doubt, Section 2(a)(1) grants You the right
+ to extract, reuse, reproduce, and Share all or a substantial
+ portion of the contents of the database for NonCommercial purposes
+ only;
+
+ b. if You include all or a substantial portion of the database
+ contents in a database in which You have Sui Generis Database
+ Rights, then the database in which You have Sui Generis Database
+ Rights (but not its individual contents) is Adapted Material; and
+
+ c. You must comply with the conditions in Section 3(a) if You Share
+ all or a substantial portion of the contents of the database.
+
+For the avoidance of doubt, this Section 4 supplements and does not
+replace Your obligations under this Public License where the Licensed
+Rights include other Copyright and Similar Rights.
+
+Section 5 -- Disclaimer of Warranties and Limitation of Liability.
+
+ a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
+ EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
+ AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
+ ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
+ IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
+ WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
+ PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
+ ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
+ KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
+ ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
+
+ b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
+ TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
+ NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
+ INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
+ COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
+ USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
+ ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
+ DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
+ IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
+
+ c. The disclaimer of warranties and limitation of liability provided
+ above shall be interpreted in a manner that, to the extent
+ possible, most closely approximates an absolute disclaimer and
+ waiver of all liability.
+
+Section 6 -- Term and Termination.
+
+ a. This Public License applies for the term of the Copyright and
+ Similar Rights licensed here. However, if You fail to comply with
+ this Public License, then Your rights under this Public License
+ terminate automatically.
+
+ b. Where Your right to use the Licensed Material has terminated under
+ Section 6(a), it reinstates:
+
+ 1. automatically as of the date the violation is cured, provided
+ it is cured within 30 days of Your discovery of the
+ violation; or
+
+ 2. upon express reinstatement by the Licensor.
+
+ For the avoidance of doubt, this Section 6(b) does not affect any
+ right the Licensor may have to seek remedies for Your violations
+ of this Public License.
+
+ c. For the avoidance of doubt, the Licensor may also offer the
+ Licensed Material under separate terms or conditions or stop
+ distributing the Licensed Material at any time; however, doing so
+ will not terminate this Public License.
+
+ d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
+ License.
+
+Section 7 -- Other Terms and Conditions.
+
+ a. The Licensor shall not be bound by any additional or different
+ terms or conditions communicated by You unless expressly agreed.
+
+ b. Any arrangements, understandings, or agreements regarding the
+ Licensed Material not stated herein are separate from and
+ independent of the terms and conditions of this Public License.
+
+Section 8 -- Interpretation.
+
+ a. For the avoidance of doubt, this Public License does not, and
+ shall not be interpreted to, reduce, limit, restrict, or impose
+ conditions on any use of the Licensed Material that could lawfully
+ be made without permission under this Public License.
+
+ b. To the extent possible, if any provision of this Public License is
+ deemed unenforceable, it shall be automatically reformed to the
+ minimum extent necessary to make it enforceable. If the provision
+ cannot be reformed, it shall be severed from this Public License
+ without affecting the enforceability of the remaining terms and
+ conditions.
+
+ c. No term or condition of this Public License will be waived and no
+ failure to comply consented to unless expressly agreed to by the
+ Licensor.
+
+ d. Nothing in this Public License constitutes or may be interpreted
+ as a limitation upon, or waiver of, any privileges and immunities
+ that apply to the Licensor or You, including from the legal
+ processes of any jurisdiction or authority.
+
+===
+ARKitScenes
+Creative Commons Attribution-NonCommercial-ShareAlike 4.0: https://creativecommons.org/licenses/by-nc-sa/4.0/
+
+===
+ScanNet++
+https://kaldir.vc.in.tum.de/scannetpp/static/scannetpp-terms-of-use.pdf
+
+===
+BlendedMVS
+Creative Commons Attribution 4.0 International: http://creativecommons.org/licenses/by/4.0/
+
+===
+Habitat-Sim
+HM3D
+https://matterport.com/fr/legal/matterport-end-user-license-agreement-academic-use-model-data
+
+ScanNet
+https://kaldir.vc.in.tum.de/scannet/ScanNet_TOS.pdf
+
+Replica
+Before Facebook Technologies, LLC (“FB”) is able to offer you (“Researcher” or
+“You”) access to the Replica Dataset (the “Dataset”), please read the following
+agreement (“Agreement”).
+
+By accessing, and in exchange for receiving permission to access, the Dataset,
+Researcher hereby agrees to the following terms and conditions:
+1. Researcher may use, modify, improve and/or publish the Dataset only in
+connection with a research or educational purpose that is non-commercial or
+not-for-profit in nature, and not for any other purpose.
+1. Researcher may provide research associates and colleagues with access to the
+Dataset provided that they first agree to be bound by these terms and
+conditions.
+1. Researcher may use the Dataset in the scope of their employment at a
+for-profit or commercial entity provided that Researcher complies with Section 1
+of this Agreement. If Researcher is employed by a for-profit or commercial
+entity, Researcher's employer shall also be bound by these terms and conditions,
+and Researcher hereby represents that they are fully authorized to enter into
+this agreement on behalf of such employer.
+1. THE DATASET IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL FB OR ANY
+CONTRIBUTOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE DATASET OR THE USE OR OTHER DEALINGS IN THE DATASET.
+1. The law of the State of California shall apply to all disputes related to
+this Dataset.
+
+ReplicaCAD
+Creative Commons Attribution 4.0 International (CC BY 4.0): https://creativecommons.org/licenses/by/4.0/
+
+habitat-sim
+MIT License
+
+Copyright (c) Meta Platforms, Inc. and its affiliates.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+===
+MegaDepth
+MIT License
+
+Copyright (c) 2018 Zhengqi Li
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+===
+StaticThings3D
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+===
+WildRGB-D
+https://github.com/wildrgbd/wildrgbd/
+MIT License
+
+Copyright (c) 2024 rowdataset
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+===
+TartanAir
+Creative Commons Attribution 4.0 International License: http://creativecommons.org/licenses/by/4.0/
+
+===
+UnrealStereo4K
+https://github.com/fabiotosi92/SMD-Nets
+MIT License
+
+Copyright (c) 2021 Fabio Tosi
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+===
+Virtual KITTI 2
+Creative Commons Attribution-NonCommercial-ShareAlike 3.0: http://creativecommons.org/licenses/by-nc-sa/3.0/legalcode
+
+===
+DL3DV
+DL3DV-10K Term of use and Creative Commons Attribution-NonCommercial 4.0 International License.
+
+Terms of Use
+
+Researcher shall use the Dataset only for non-commercial research and educational purposes.
+DL3DV-10K organization makes no representations or warranties regarding the dataset, including but not limited to warranties of non-infringement or fitness for a particular purpose.
+Researcher accepts full responsibility for his/her/their use of the Dataset and shall defend and indemnify DL3DV-10K organization, including its members, employees, Trustees, officers and agents, against any and all claims arising from Researcher's use of the Dataset, including but not limited to Researcher's use of any copies of copyrighted 3D models that he/she/they may create from the dataset.
+Researcher may provide research associates and colleagues with access to the Dataset, after receiving entity has also agreed to and signed these terms and conditions. Sharing the data otherwise is strictly prohibited.
+Following General Data Protection Regulation, Researcher must ensure that they can delete all person-specific data upon request.
+DL3DV-10K organization reserves the right to terminate Researcher's access to the Dataset at any time.
+If Researcher is employed by a for-profit, commercial entity, Researcher's employer shall also be bound by these terms and conditions, and Researcher hereby represents that he/she/they is/are fully authorized to enter into this agreement on behalf of such employer.
+The law of the Indiana State shall apply to all disputes under this agreement.
+
+Creative Commons Attribution-NonCommercial 4.0 International Public License
+
+By exercising the Licensed Rights (defined below), You accept and agree to be bound by the terms and conditions of this Creative Commons Attribution-NonCommercial 4.0 International Public License ("Public License"). To the extent this Public License may be interpreted as a contract, You are granted the Licensed Rights in consideration of Your acceptance of these terms and conditions, and the Licensor grants You such rights in consideration of benefits the Licensor receives from making the Licensed Material available under these terms and conditions.
+
+Section 1 -- Definitions.
+
+a. Adapted Material means material subject to Copyright and Similar Rights that is derived from or based upon the Licensed Material and in which the Licensed Material is translated, altered, arranged, transformed, or otherwise modified in a manner requiring permission under the Copyright and Similar Rights held by the Licensor. For purposes of this Public License, where the Licensed Material is a musical work, performance, or sound recording, Adapted Material is always produced where the Licensed Material is synched in timed relation with a moving image.
+
+b. Adapter's License means the license You apply to Your Copyright and Similar Rights in Your contributions to Adapted Material in accordance with the terms and conditions of this Public License.
+
+c. Copyright and Similar Rights means copyright and/or similar rights closely related to copyright including, without limitation, performance, broadcast, sound recording, and Sui Generis Database Rights, without regard to how the rights are labeled or categorized. For purposes of this Public License, the rights specified in Section 2(b)(1)-(2) are not Copyright and Similar Rights. d. Effective Technological Measures means those measures that, in the absence of proper authority, may not be circumvented under laws fulfilling obligations under Article 11 of the WIPO Copyright Treaty adopted on December 20, 1996, and/or similar international agreements.
+
+e. Exceptions and Limitations means fair use, fair dealing, and/or any other exception or limitation to Copyright and Similar Rights that applies to Your use of the Licensed Material.
+
+f. Licensed Material means the artistic or literary work, database, or other material to which the Licensor applied this Public License.
+
+g. Licensed Rights means the rights granted to You subject to the terms and conditions of this Public License, which are limited to all Copyright and Similar Rights that apply to Your use of the Licensed Material and that the Licensor has authority to license.
+
+h. Licensor means the individual(s) or entity(ies) granting rights under this Public License.
+
+i. NonCommercial means not primarily intended for or directed towards commercial advantage or monetary compensation. For purposes of this Public License, the exchange of the Licensed Material for other material subject to Copyright and Similar Rights by digital file-sharing or similar means is NonCommercial provided there is no payment of monetary compensation in connection with the exchange.
+
+j. Share means to provide material to the public by any means or process that requires permission under the Licensed Rights, such as reproduction, public display, public performance, distribution, dissemination, communication, or importation, and to make material available to the public including in ways that members of the public may access the material from a place and at a time individually chosen by them.
+
+k. Sui Generis Database Rights means rights other than copyright resulting from Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases, as amended and/or succeeded, as well as other essentially equivalent rights anywhere in the world.
+
+l. You means the individual or entity exercising the Licensed Rights under this Public License. Your has a corresponding meaning.
+
+Section 2 -- Scope.
+
+a. License grant.
+
+ 1. Subject to the terms and conditions of this Public License,
+ the Licensor hereby grants You a worldwide, royalty-free,
+ non-sublicensable, non-exclusive, irrevocable license to
+ exercise the Licensed Rights in the Licensed Material to:
+
+ a. reproduce and Share the Licensed Material, in whole or
+ in part, for NonCommercial purposes only; and
+
+ b. produce, reproduce, and Share Adapted Material for
+ NonCommercial purposes only.
+
+ 2. Exceptions and Limitations. For the avoidance of doubt, where
+ Exceptions and Limitations apply to Your use, this Public
+ License does not apply, and You do not need to comply with
+ its terms and conditions.
+
+ 3. Term. The term of this Public License is specified in Section
+ 6(a).
+
+ 4. Media and formats; technical modifications allowed. The
+ Licensor authorizes You to exercise the Licensed Rights in
+ all media and formats whether now known or hereafter created,
+ and to make technical modifications necessary to do so. The
+ Licensor waives and/or agrees not to assert any right or
+ authority to forbid You from making technical modifications
+ necessary to exercise the Licensed Rights, including
+ technical modifications necessary to circumvent Effective
+ Technological Measures. For purposes of this Public License,
+ simply making modifications authorized by this Section 2(a)
+ (4) never produces Adapted Material.
+
+ 5. Downstream recipients.
+
+ a. Offer from the Licensor -- Licensed Material. Every
+ recipient of the Licensed Material automatically
+ receives an offer from the Licensor to exercise the
+ Licensed Rights under the terms and conditions of this
+ Public License.
+
+ b. No downstream restrictions. You may not offer or impose
+ any additional or different terms or conditions on, or
+ apply any Effective Technological Measures to, the
+ Licensed Material if doing so restricts exercise of the
+ Licensed Rights by any recipient of the Licensed
+ Material.
+
+ 6. No endorsement. Nothing in this Public License constitutes or
+ may be construed as permission to assert or imply that You
+ are, or that Your use of the Licensed Material is, connected
+ with, or sponsored, endorsed, or granted official status by,
+ the Licensor or others designated to receive attribution as
+ provided in Section 3(a)(1)(A)(i).
+
+b. Other rights.
+
+ 1. Moral rights, such as the right of integrity, are not
+ licensed under this Public License, nor are publicity,
+ privacy, and/or other similar personality rights; however, to
+ the extent possible, the Licensor waives and/or agrees not to
+ assert any such rights held by the Licensor to the limited
+ extent necessary to allow You to exercise the Licensed
+ Rights, but not otherwise.
+
+ 2. Patent and trademark rights are not licensed under this
+ Public License.
+
+ 3. To the extent possible, the Licensor waives any right to
+ collect royalties from You for the exercise of the Licensed
+ Rights, whether directly or through a collecting society
+ under any voluntary or waivable statutory or compulsory
+ licensing scheme. In all other cases the Licensor expressly
+ reserves any right to collect such royalties, including when
+ the Licensed Material is used other than for NonCommercial
+ purposes.
+
+Section 3 -- License Conditions.
+
+Your exercise of the Licensed Rights is expressly made subject to the following conditions.
+
+a. Attribution.
+
+ 1. If You Share the Licensed Material (including in modified
+ form), You must:
+
+ a. retain the following if it is supplied by the Licensor
+ with the Licensed Material:
+
+ i. identification of the creator(s) of the Licensed
+ Material and any others designated to receive
+ attribution, in any reasonable manner requested by
+ the Licensor (including by pseudonym if
+ designated);
+
+ ii. a copyright notice;
+
+ iii. a notice that refers to this Public License;
+
+ iv. a notice that refers to the disclaimer of
+ warranties;
+
+ v. a URI or hyperlink to the Licensed Material to the
+ extent reasonably practicable;
+
+ b. indicate if You modified the Licensed Material and
+ retain an indication of any previous modifications; and
+
+ c. indicate the Licensed Material is licensed under this
+ Public License, and include the text of, or the URI or
+ hyperlink to, this Public License.
+
+ 2. You may satisfy the conditions in Section 3(a)(1) in any
+ reasonable manner based on the medium, means, and context in
+ which You Share the Licensed Material. For example, it may be
+ reasonable to satisfy the conditions by providing a URI or
+ hyperlink to a resource that includes the required
+ information.
+
+ 3. If requested by the Licensor, You must remove any of the
+ information required by Section 3(a)(1)(A) to the extent
+ reasonably practicable.
+
+ 4. If You Share Adapted Material You produce, the Adapter's
+ License You apply must not prevent recipients of the Adapted
+ Material from complying with this Public License.
+
+Section 4 -- Sui Generis Database Rights.
+
+Where the Licensed Rights include Sui Generis Database Rights that apply to Your use of the Licensed Material:
+
+a. for the avoidance of doubt, Section 2(a)(1) grants You the right to extract, reuse, reproduce, and Share all or a substantial portion of the contents of the database for NonCommercial purposes only;
+
+b. if You include all or a substantial portion of the database contents in a database in which You have Sui Generis Database Rights, then the database in which You have Sui Generis Database Rights (but not its individual contents) is Adapted Material; and
+
+c. You must comply with the conditions in Section 3(a) if You Share all or a substantial portion of the contents of the database.
+
+For the avoidance of doubt, this Section 4 supplements and does not replace Your obligations under this Public License where the Licensed Rights include other Copyright and Similar Rights.
+
+Section 5 -- Disclaimer of Warranties and Limitation of Liability.
+
+a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
+
+b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
+
+c. The disclaimer of warranties and limitation of liability provided above shall be interpreted in a manner that, to the extent possible, most closely approximates an absolute disclaimer and waiver of all liability.
+
+Section 6 -- Term and Termination.
+
+a. This Public License applies for the term of the Copyright and Similar Rights licensed here. However, if You fail to comply with this Public License, then Your rights under this Public License terminate automatically.
+
+b. Where Your right to use the Licensed Material has terminated under Section 6(a), it reinstates:
+
+ 1. automatically as of the date the violation is cured, provided
+ it is cured within 30 days of Your discovery of the
+ violation; or
+
+ 2. upon express reinstatement by the Licensor.
+
+ For the avoidance of doubt, this Section 6(b) does not affect any
+ right the Licensor may have to seek remedies for Your violations
+ of this Public License.
+
+c. For the avoidance of doubt, the Licensor may also offer the Licensed Material under separate terms or conditions or stop distributing the Licensed Material at any time; however, doing so will not terminate this Public License.
+
+d. Sections 1, 5, 6, 7, and 8 survive termination of this Public License.
+
+Section 7 -- Other Terms and Conditions.
+
+a. The Licensor shall not be bound by any additional or different terms or conditions communicated by You unless expressly agreed.
+
+b. Any arrangements, understandings, or agreements regarding the Licensed Material not stated herein are separate from and independent of the terms and conditions of this Public License.
+
+Section 8 -- Interpretation.
+
+a. For the avoidance of doubt, this Public License does not, and shall not be interpreted to, reduce, limit, restrict, or impose conditions on any use of the Licensed Material that could lawfully be made without permission under this Public License.
+
+b. To the extent possible, if any provision of this Public License is deemed unenforceable, it shall be automatically reformed to the minimum extent necessary to make it enforceable. If the provision cannot be reformed, it shall be severed from this Public License without affecting the enforceability of the remaining terms and conditions.
+
+c. No term or condition of this Public License will be waived and no failure to comply consented to unless expressly agreed to by the Licensor.
+
+d. Nothing in this Public License constitutes or may be interpreted as a limitation upon, or waiver of, any privileges and immunities that apply to the Licensor or You, including from the legal processes of any jurisdiction or authority.
+
+===
+Niantic Map Free Relocalization Dataset License Agreement
+This Niantic Map Free Relocalization Dataset License Agreement ("Agreement") is an agreement between you and Niantic, Inc. (“Niantic” or “we”). By downloading or otherwise using Niantic’s Map-Free Relocalization dataset or dataset-derived materials (collectively, the "Dataset") you agree to:
+
+1. Purpose and Restrictions. You may only use the Dataset only for non-commercial purposes, such as academic research at educational and not-for-profit research institutions, teaching, public demonstrations, and personal experimentation. Non-commercial use expressly excludes any profit-making or commercial activities, including without limitation sale, license, manufacture or development of commercial products, use in commercially-sponsored research, use at a laboratory or other facility owned or controlled (whether in whole or in part) by a commercial entity, provision of consulting service, use for or on behalf of any commercial entity, and use in consulting service, use for or on behalf of any commercial entity, use in research where a commercial party obtains rights to research results or any other benefit. Notwithstanding the foregoing restrictions, you can use this Dataset for publishing comparison results for academic papers, including retraining your models on this Dataset.
+
+2. License. Subject to this Agreement, Niantic grants you a non-exclusive, non-transferable, non-sublicensable right to download and use the Dataset for the purpose stated in Section 1 of this Agreement. All rights not expressly granted to you in this Agreement are reserved.
+
+3. Condition of Use. You must not use the Dataset in a way that could diminish, tarnish, or in any way harm Niantic’s reputation or image.
+
+4. No Warranties. The Dataset comes “as is”, and you will use it at your own risk. Niantic makes no representations or warranties regarding the Dataset, including but not limited to warranties of non-infringement or fitness for a particular purpose. Neither Niantic nor any contributor to the Dataset will be liable for any damages related to the Dataset or this Agreement, including direct, indirect, special, consequential or incidental damages, to the maximum extent the law permits, no matter what legal theory they are based on. We are not obligated to (and will not) provide technical support for the Dataset.
+
+5. Indemnity. You accept full responsibility for your use of the Dataset and shall defend and indemnify Niantic, including its employees, officers and agents, against any and all claims arising from your use of the Dataset.
+
+6. Removal. Niantic reserves the right to remove access to the Dataset at any time without cause. If you have downloaded a copy of the Dataset prior to such removal, you may use such a copy subject to this Agreement, but you may not distribute your copy.
+
+7. Termination. This Agreement will terminate immediately upon your commercial use of the Dataset.
+
+8. Authorized Representative. If you are employed by a for-profit, commercial entity, your employer shall also be bound by the terms and conditions of this Agreement, and you hereby represent that you are fully authorized to enter into this Agreement on behalf of such employer.
+
+9. Survivability. Sections 2, 4, 5, 6, 7, 8, 9, and 10 of this Agreement survive the termination of this Agreement.
+
+10. Misc. This Agreement is governed and construed in all respects in accordance with the laws of the State of California, USA without regard to conflicts of law. If any provision of this Agreement is deemed unenforceable or contrary to law, the rest of this Agreement shall remain in full effect and enforceable. If you do not agree to this Agreement, do not download or use the Dataset. The Dataset is protected by copyright and other intellectual property laws and is licensed, not sold.
+
+===
+NVIDIA Source Code License for SegFormer
+
+1. Definitions
+
+“Licensor” means any person or entity that distributes its Work.
+
+“Software” means the original work of authorship made available under this License.
+
+“Work” means the Software and any additions to or derivative works of the Software that are made available under
+this License.
+
+The terms “reproduce,” “reproduction,” “derivative works,” and “distribution” have the meaning as provided under
+U.S. copyright law; provided, however, that for the purposes of this License, derivative works shall not include
+works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work.
+
+Works, including the Software, are “made available” under this License by including in or with the Work either
+(a) a copyright notice referencing the applicability of this License to the Work, or (b) a copy of this License.
+
+2. License Grant
+
+2.1 Copyright Grant. Subject to the terms and conditions of this License, each Licensor grants to you a perpetual,
+worldwide, non-exclusive, royalty-free, copyright license to reproduce, prepare derivative works of, publicly
+display, publicly perform, sublicense and distribute its Work and any resulting derivative works in any form.
+
+3. Limitations
+
+3.1 Redistribution. You may reproduce or distribute the Work only if (a) you do so under this License, (b) you
+include a complete copy of this License with your distribution, and (c) you retain without modification any
+copyright, patent, trademark, or attribution notices that are present in the Work.
+
+3.2 Derivative Works. You may specify that additional or different terms apply to the use, reproduction, and
+distribution of your derivative works of the Work (“Your Terms”) only if (a) Your Terms provide that the use
+limitation in Section 3.3 applies to your derivative works, and (b) you identify the specific derivative works
+that are subject to Your Terms. Notwithstanding Your Terms, this License (including the redistribution
+requirements in Section 3.1) will continue to apply to the Work itself.
+
+3.3 Use Limitation. The Work and any derivative works thereof only may be used or intended for use
+non-commercially. Notwithstanding the foregoing, NVIDIA and its affiliates may use the Work and any derivative
+works commercially. As used herein, “non-commercially” means for research or evaluation purposes only.
+
+3.4 Patent Claims. If you bring or threaten to bring a patent claim against any Licensor (including any claim,
+cross-claim or counterclaim in a lawsuit) to enforce any patents that you allege are infringed by any Work, then
+your rights under this License from such Licensor (including the grant in Section 2.1) will terminate immediately.
+
+3.5 Trademarks. This License does not grant any rights to use any Licensor’s or its affiliates’ names, logos,
+or trademarks, except as necessary to reproduce the notices described in this License.
+
+3.6 Termination. If you violate any term of this License, then your rights under this License (including the
+grant in Section 2.1) will terminate immediately.
+
+4. Disclaimer of Warranty.
+
+THE WORK IS PROVIDED “AS IS” WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING
+WARRANTIES OR CONDITIONS OF M ERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR NON-INFRINGEMENT. YOU
+BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER THIS LICENSE.
+
+5. Limitation of Liability.
+
+EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER IN TORT (INCLUDING
+NEGLIGENCE), CONTRACT, OR OTHERWISE SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT,
+INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF OR RELATED TO THIS LICENSE, THE USE OR
+INABILITY TO USE THE WORK (INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION, LOST PROFITS OR
+DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER COMM ERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
+
+===
+CosXL License Agreement
+
+
+STABILITY AI NON-COMMERCIAL RESEARCH COMMUNITY LICENSE AGREEMENT Dated: April 7th, 2024
+By clicking “I Accept” below or by using or distributing any portion or element of the Models, Software, Software Products or Derivative Works, you agree to the terms of this License. If you do not agree to this License, then you do not have any rights to use the Software Products or Derivative Works through this License, and you must immediately cease using the Software Products or Derivative Works. If you are agreeing to be bound by the terms of this License on behalf of your employer or other entity, you represent and warrant to Stability AI that you have full legal authority to bind your employer or such entity to this License. If you do not have the requisite authority, you may not accept the License or access the Software Products or Derivative Works on behalf of your employer or other entity.
+"Agreement" means this Stable Non-Commercial Research Community License Agreement.
+“AUP” means the Stability AI Acceptable Use Policy available at https://stability.ai/use-policy, as may be updated from time to time.
+"Derivative Work(s)” means (a) any derivative work of the Software Products as recognized by U.S. copyright laws and (b) any modifications to a Model, and any other model created which is based on or derived from the Model or the Model’s output. For clarity, Derivative Works do not include the output of any Model.
+“Documentation” means any specifications, manuals, documentation, and other written information provided by Stability AI related to the Software.
+"Licensee" or "you" means you, or your employer or any other person or entity (if you are entering into this Agreement on such person or entity's behalf), of the age required under applicable laws, rules or regulations to provide legal consent and that has legal authority to bind your employer or such other person or entity if you are entering in this Agreement on their behalf.
+“Model(s)" means, collectively, Stability AI’s proprietary models and algorithms, including machine-learning models, trained model weights and other elements of the foregoing, made available under this Agreement.
+“Non-Commercial Uses” means exercising any of the rights granted herein for the purpose of research or non-commercial purposes. Non-Commercial Uses does not include any production use of the Software Products or any Derivative Works.
+"Stability AI" or "we" means Stability AI Ltd. and its affiliates.
+
+"Software" means Stability AI’s proprietary software made available under this Agreement.
+“Software Products” means the Models, Software and Documentation, individually or in any combination.
+
+ License Rights and Redistribution.
+ a. Subject to your compliance with this Agreement, the AUP (which is hereby incorporated herein by reference), and the Documentation, Stability AI grants you a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable, royalty free and limited license under Stability AI’s intellectual property or other rights owned or controlled by Stability AI embodied in the Software Products to use, reproduce, distribute, and create Derivative Works of, the Software Products, in each case for Non-Commercial Uses only.
+ b. You may not use the Software Products or Derivative Works to enable third parties to use the Software Products or Derivative Works as part of your hosted service or via your APIs, whether you are adding substantial additional functionality thereto or not. Merely distributing the Software Products or Derivative Works for download online without offering any related service (ex. by distributing the Models on HuggingFace) is not a violation of this subsection. If you wish to use the Software Products or any Derivative Works for commercial or production use or you wish to make the Software Products or any Derivative Works available to third parties via your hosted service or your APIs, contact Stability AI at https://stability.ai/contact.
+ c. If you distribute or make the Software Products, or any Derivative Works thereof, available to a third party, the Software Products, Derivative Works, or any portion thereof, respectively, will remain subject to this Agreement and you must (i) provide a copy of this Agreement to such third party, and (ii) retain the following attribution notice within a "Notice" text file distributed as a part of such copies: "This Stability AI Model is licensed under the Stability AI Non-Commercial Research Community License, Copyright (c) Stability AI Ltd. All Rights Reserved.” If you create a Derivative Work of a Software Product, you may add your own attribution notices to the Notice file included with the Software Product, provided that you clearly indicate which attributions apply to the Software Product and you must state in the NOTICE file that you changed the Software Product and how it was modified.
+ Disclaimer of Warranty. UNLESS REQUIRED BY APPLICABLE LAW, THE SOFTWARE PRODUCTS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING OR REDISTRIBUTING THE SOFTWARE PRODUCTS, DERIVATIVE WORKS OR ANY OUTPUT OR RESULTS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR USE OF THE SOFTWARE PRODUCTS, DERIVATIVE WORKS AND ANY OUTPUT AND RESULTS. 3. Limitation of Liability. IN NO EVENT WILL STABILITY AI OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, ARISING OUT OF THIS AGREEMENT, FOR ANY LOST PROFITS OR ANY DIRECT, INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL, EXEMPLARY OR PUNITIVE DAMAGES, EVEN IF STABILITY AI OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING. 4. Intellectual Property.
+ a. No trademark licenses are granted under this Agreement, and in connection with the Software Products or Derivative Works, neither Stability AI nor Licensee may use any name or mark owned by or associated with the other or any of its affiliates, except as required for reasonable and customary use in describing and redistributing the Software Products or Derivative Works.
+ b. Subject to Stability AI’s ownership of the Software Products and Derivative Works made by or for Stability AI, with respect to any Derivative Works that are made by you, as between you and Stability AI, you are and will be the owner of such Derivative Works
+ c. If you institute litigation or other proceedings against Stability AI (including a cross-claim or counterclaim in a lawsuit) alleging that the Software Products, Derivative Works or associated outputs or results, or any portion of any of the foregoing, constitutes infringement of intellectual property or other rights owned or licensable by you, then any licenses granted to you under this Agreement shall terminate as of the date such litigation or claim is filed or instituted. You will indemnify and hold harmless Stability AI from and against any claim by any third party arising out of or related to your use or distribution of the Software Products or Derivative Works in violation of this Agreement.
+ Term and Termination. The term of this Agreement will commence upon your acceptance of this Agreement or access to the Software Products and will continue in full force and effect until terminated in accordance with the terms and conditions herein. Stability AI may terminate this Agreement if you are in breach of any term or condition of this Agreement. Upon termination of this Agreement, you shall delete and cease use of any Software Products or Derivative Works. Sections 2-4 shall survive the termination of this Agreement.
+ Governing Law. This Agreement will be governed by and construed in accordance with the laws of the United States and the State of California without regard to choice of law
+ principles.
+
diff --git a/third_party/mast3r/LICENSE b/third_party/mast3r/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..a97986e3a8ddd49973959f6c748dfa8b881b64d3
--- /dev/null
+++ b/third_party/mast3r/LICENSE
@@ -0,0 +1,7 @@
+DUSt3R, Copyright (c) 2024-present Naver Corporation, is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 license.
+
+A summary of the CC BY-NC-SA 4.0 license is located here:
+ https://creativecommons.org/licenses/by-nc-sa/4.0/
+
+The CC BY-NC-SA 4.0 license is located here:
+ https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode
diff --git a/third_party/mast3r/NOTICE b/third_party/mast3r/NOTICE
new file mode 100644
index 0000000000000000000000000000000000000000..86583416b75cc1749cac38d437b376842975ca06
--- /dev/null
+++ b/third_party/mast3r/NOTICE
@@ -0,0 +1,103 @@
+MASt3R
+Copyright 2024-present NAVER Corp.
+
+This project contains subcomponents with separate copyright notices and license terms.
+Your use of the source code for these subcomponents is subject to the terms and conditions of the following licenses.
+
+====
+
+naver/dust3r
+https://github.com/naver/dust3r/
+
+Creative Commons Attribution-NonCommercial-ShareAlike 4.0
+
+====
+
+naver/croco
+https://github.com/naver/croco/
+
+Creative Commons Attribution-NonCommercial-ShareAlike 4.0
+
+====
+
+pytorch/pytorch
+https://github.com/pytorch/pytorch
+
+From PyTorch:
+
+Copyright (c) 2016- Facebook, Inc (Adam Paszke)
+Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
+Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
+Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
+Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
+Copyright (c) 2011-2013 NYU (Clement Farabet)
+Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
+Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
+Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
+
+From Caffe2:
+
+Copyright (c) 2016-present, Facebook Inc. All rights reserved.
+
+All contributions by Facebook:
+Copyright (c) 2016 Facebook Inc.
+
+All contributions by Google:
+Copyright (c) 2015 Google Inc.
+All rights reserved.
+
+All contributions by Yangqing Jia:
+Copyright (c) 2015 Yangqing Jia
+All rights reserved.
+
+All contributions by Kakao Brain:
+Copyright 2019-2020 Kakao Brain
+
+All contributions by Cruise LLC:
+Copyright (c) 2022 Cruise LLC.
+All rights reserved.
+
+All contributions from Caffe:
+Copyright(c) 2013, 2014, 2015, the respective contributors
+All rights reserved.
+
+All other contributions:
+Copyright(c) 2015, 2016 the respective contributors
+All rights reserved.
+
+Caffe2 uses a copyright model similar to Caffe: each contributor holds
+copyright over their contributions to Caffe2. The project versioning records
+all such contribution and copyright details. If a contributor wants to further
+mark their specific copyright on a particular contribution, they should
+indicate their copyright solely in the commit message of the change when it is
+committed.
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
+ and IDIAP Research Institute nor the names of its contributors may be
+ used to endorse or promote products derived from this software without
+ specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
diff --git a/third_party/mast3r/README.md b/third_party/mast3r/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..7eef329100d32f76c707bf0c489db115895c274e
--- /dev/null
+++ b/third_party/mast3r/README.md
@@ -0,0 +1,316 @@
+![banner](assets/mast3r.jpg)
+
+Official implementation of `Grounding Image Matching in 3D with MASt3R`
+[[Project page](https://dust3r.europe.naverlabs.com/)], [[MASt3R arxiv](https://arxiv.org/abs/2406.09756)], [[DUSt3R arxiv](https://arxiv.org/abs/2312.14132)]
+
+![Example of matching results obtained from MASt3R](assets/examples.jpg)
+
+![High level overview of MASt3R's architecture](assets/mast3r_archi.jpg)
+
+```bibtex
+@misc{mast3r_arxiv24,
+ title={Grounding Image Matching in 3D with MASt3R},
+ author={Vincent Leroy and Yohann Cabon and Jerome Revaud},
+ year={2024},
+ eprint={2406.09756},
+ archivePrefix={arXiv},
+ primaryClass={cs.CV}
+}
+
+@inproceedings{dust3r_cvpr24,
+ title={DUSt3R: Geometric 3D Vision Made Easy},
+ author={Shuzhe Wang and Vincent Leroy and Yohann Cabon and Boris Chidlovskii and Jerome Revaud},
+ booktitle = {CVPR},
+ year = {2024}
+}
+```
+
+## Table of Contents
+
+- [Table of Contents](#table-of-contents)
+- [License](#license)
+- [Get Started](#get-started)
+ - [Installation](#installation)
+ - [Checkpoints](#checkpoints)
+ - [Interactive demo](#interactive-demo)
+ - [Interactive demo with docker](#interactive-demo-with-docker)
+- [Usage](#usage)
+- [Training](#training)
+ - [Datasets](#datasets)
+ - [Demo](#demo)
+ - [Our Hyperparameters](#our-hyperparameters)
+- [Visual Localization](#visual-localization)
+ - [Dataset Preparation](#dataset-preparation)
+ - [Example Commands](#example-commands)
+
+## License
+
+The code is distributed under the CC BY-NC-SA 4.0 License.
+See [LICENSE](LICENSE) for more information.
+
+```python
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+```
+
+## Get Started
+
+### Installation
+
+1. Clone MASt3R.
+```bash
+git clone --recursive https://github.com/naver/mast3r
+cd mast3r
+# if you have already cloned mast3r:
+# git submodule update --init --recursive
+```
+
+2. Create the environment, here we show an example using conda.
+```bash
+conda create -n mast3r python=3.11 cmake=3.14.0
+conda activate mast3r
+conda install pytorch torchvision pytorch-cuda=12.1 -c pytorch -c nvidia # use the correct version of cuda for your system
+pip install -r requirements.txt
+pip install -r dust3r/requirements.txt
+# Optional: you can also install additional packages to:
+# - add support for HEIC images
+# - add required packages for visloc.py
+pip install -r dust3r/requirements_optional.txt
+```
+
+3. Optional, compile the cuda kernels for RoPE (as in CroCo v2).
+```bash
+# DUST3R relies on RoPE positional embeddings for which you can compile some cuda kernels for faster runtime.
+cd dust3r/croco/models/curope/
+python setup.py build_ext --inplace
+cd ../../../../
+```
+
+
+### Checkpoints
+
+You can obtain the checkpoints by two ways:
+
+1) You can use our huggingface_hub integration: the models will be downloaded automatically.
+
+2) Otherwise, We provide several pre-trained models:
+
+| Modelname | Training resolutions | Head | Encoder | Decoder |
+|-------------|----------------------|------|---------|---------|
+| [`MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric`](https://download.europe.naverlabs.com/ComputerVision/MASt3R/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric.pth) | 512x384, 512x336, 512x288, 512x256, 512x160 | CatMLP+DPT | ViT-L | ViT-B |
+
+You can check the hyperparameters we used to train these models in the [section: Our Hyperparameters](#our-hyperparameters)
+Make sure to check license of the datasets we used.
+
+To download a specific model, for example `MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric.pth`:
+```bash
+mkdir -p checkpoints/
+wget https://download.europe.naverlabs.com/ComputerVision/MASt3R/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric.pth -P checkpoints/
+```
+
+For these checkpoints, make sure to agree to the license of all the training datasets we used, in addition to CC-BY-NC-SA 4.0.
+The mapfree dataset license in particular is very restrictive. For more information, check [CHECKPOINTS_NOTICE](CHECKPOINTS_NOTICE).
+
+
+### Interactive demo
+
+There are two demos available:
+
+```
+demo.py is the updated demo for MASt3R. It uses our new sparse global alignment method that allows you to reconstruct larger scenes
+
+python3 demo.py --model_name MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric
+
+# Use --weights to load a checkpoint from a local file, eg --weights checkpoints/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric.pth
+# Use --local_network to make it accessible on the local network, or --server_name to specify the url manually
+# Use --server_port to change the port, by default it will search for an available port starting at 7860
+# Use --device to use a different device, by default it's "cuda"
+
+demo_dust3r_ga.py is the same demo as in dust3r (+ compatibility for MASt3R models)
+see https://github.com/naver/dust3r?tab=readme-ov-file#interactive-demo for details
+```
+### Interactive demo with docker
+
+TODO
+
+![demo](assets/demo.jpg)
+
+## Usage
+
+```python
+from mast3r.model import AsymmetricMASt3R
+from mast3r.fast_nn import fast_reciprocal_NNs
+
+import mast3r.utils.path_to_dust3r
+from dust3r.inference import inference
+from dust3r.utils.image import load_images
+
+if __name__ == '__main__':
+ device = 'cuda'
+ schedule = 'cosine'
+ lr = 0.01
+ niter = 300
+
+ model_name = "naver/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric"
+ # you can put the path to a local checkpoint in model_name if needed
+ model = AsymmetricMASt3R.from_pretrained(model_name).to(device)
+ images = load_images(['dust3r/croco/assets/Chateau1.png', 'dust3r/croco/assets/Chateau2.png'], size=512)
+ output = inference([tuple(images)], model, device, batch_size=1, verbose=False)
+
+ # at this stage, you have the raw dust3r predictions
+ view1, pred1 = output['view1'], output['pred1']
+ view2, pred2 = output['view2'], output['pred2']
+
+ desc1, desc2 = pred1['desc'].squeeze(0).detach(), pred2['desc'].squeeze(0).detach()
+
+ # find 2D-2D matches between the two images
+ matches_im0, matches_im1 = fast_reciprocal_NNs(desc1, desc2, subsample_or_initxy1=8,
+ device=device, dist='dot', block_size=2**13)
+
+ # ignore small border around the edge
+ H0, W0 = view1['true_shape'][0]
+ valid_matches_im0 = (matches_im0[:, 0] >= 3) & (matches_im0[:, 0] < int(W0) - 3) & (
+ matches_im0[:, 1] >= 3) & (matches_im0[:, 1] < int(H0) - 3)
+
+ H1, W1 = view2['true_shape'][0]
+ valid_matches_im1 = (matches_im1[:, 0] >= 3) & (matches_im1[:, 0] < int(W1) - 3) & (
+ matches_im1[:, 1] >= 3) & (matches_im1[:, 1] < int(H1) - 3)
+
+ valid_matches = valid_matches_im0 & valid_matches_im1
+ matches_im0, matches_im1 = matches_im0[valid_matches], matches_im1[valid_matches]
+
+ # visualize a few matches
+ import numpy as np
+ import torch
+ import torchvision.transforms.functional
+ from matplotlib import pyplot as pl
+
+ n_viz = 20
+ num_matches = matches_im0.shape[0]
+ match_idx_to_viz = np.round(np.linspace(0, num_matches - 1, n_viz)).astype(int)
+ viz_matches_im0, viz_matches_im1 = matches_im0[match_idx_to_viz], matches_im1[match_idx_to_viz]
+
+ image_mean = torch.as_tensor([0.5, 0.5, 0.5], device='cpu').reshape(1, 3, 1, 1)
+ image_std = torch.as_tensor([0.5, 0.5, 0.5], device='cpu').reshape(1, 3, 1, 1)
+
+ viz_imgs = []
+ for i, view in enumerate([view1, view2]):
+ rgb_tensor = view['img'] * image_std + image_mean
+ viz_imgs.append(rgb_tensor.squeeze(0).permute(1, 2, 0).cpu().numpy())
+
+ H0, W0, H1, W1 = *viz_imgs[0].shape[:2], *viz_imgs[1].shape[:2]
+ img0 = np.pad(viz_imgs[0], ((0, max(H1 - H0, 0)), (0, 0), (0, 0)), 'constant', constant_values=0)
+ img1 = np.pad(viz_imgs[1], ((0, max(H0 - H1, 0)), (0, 0), (0, 0)), 'constant', constant_values=0)
+ img = np.concatenate((img0, img1), axis=1)
+ pl.figure()
+ pl.imshow(img)
+ cmap = pl.get_cmap('jet')
+ for i in range(n_viz):
+ (x0, y0), (x1, y1) = viz_matches_im0[i].T, viz_matches_im1[i].T
+ pl.plot([x0, x1 + W0], [y0, y1], '-+', color=cmap(i / (n_viz - 1)), scalex=False, scaley=False)
+ pl.show(block=True)
+```
+![matching example on croco pair](assets/matching.jpg)
+
+## Training
+
+In this section, we present a short demonstration to get started with training MASt3R.
+
+### Datasets
+
+See [Datasets section in DUSt3R](https://github.com/naver/dust3r/tree/datasets?tab=readme-ov-file#datasets)
+
+### Demo
+
+Like for the DUSt3R training demo, we're going to download and prepare the same subset of [CO3Dv2](https://github.com/facebookresearch/co3d) - [Creative Commons Attribution-NonCommercial 4.0 International](https://github.com/facebookresearch/co3d/blob/main/LICENSE) and launch the training code on it.
+It is the exact same process as DUSt3R.
+The demo model will be trained for a few epochs on a very small dataset.
+It will not be very good.
+
+```bash
+# download and prepare the co3d subset
+mkdir -p data/co3d_subset
+cd data/co3d_subset
+git clone https://github.com/facebookresearch/co3d
+cd co3d
+python3 ./co3d/download_dataset.py --download_folder ../ --single_sequence_subset
+rm ../*.zip
+cd ../../..
+
+python3 datasets_preprocess/preprocess_co3d.py --co3d_dir data/co3d_subset --output_dir data/co3d_subset_processed --single_sequence_subset
+
+# download the pretrained dust3r checkpoint
+mkdir -p checkpoints/
+wget https://download.europe.naverlabs.com/ComputerVision/DUSt3R/DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth -P checkpoints/
+
+# for this example we'll do fewer epochs, for the actual hyperparameters we used in the paper, see the next section: "Our Hyperparameters"
+torchrun --nproc_per_node=4 train.py \
+ --train_dataset "1000 @ Co3d(split='train', ROOT='data/co3d_subset_processed', aug_crop='auto', aug_monocular=0.005, aug_rot90='diff', mask_bg='rand', resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], n_corres=8192, nneg=0.5, transform=ColorJitter)" \
+ --test_dataset "100 @ Co3d(split='test', ROOT='data/co3d_subset_processed', resolution=(512,384), n_corres=1024, seed=777)" \
+ --model "AsymmetricMASt3R(pos_embed='RoPE100', patch_embed_cls='ManyAR_PatchEmbed', img_size=(512, 512), head_type='catmlp+dpt', output_mode='pts3d+desc24', depth_mode=('exp', -inf, inf), conf_mode=('exp', 1, inf), enc_embed_dim=1024, enc_depth=24, enc_num_heads=16, dec_embed_dim=768, dec_depth=12, dec_num_heads=12, two_confs=True)" \
+ --train_criterion "ConfLoss(Regr3D(L21, norm_mode='?avg_dis'), alpha=0.2) + 0.075*ConfMatchingLoss(MatchingLoss(InfoNCE(mode='proper', temperature=0.05), negatives_padding=0, blocksize=8192), alpha=10.0, confmode='mean')" \
+ --test_criterion "Regr3D_ScaleShiftInv(L21, norm_mode='?avg_dis', gt_scale=True, sky_loss_value=0) + -1.*MatchingLoss(APLoss(nq='torch', fp=torch.float16), negatives_padding=12288)" \
+ --pretrained "checkpoints/DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth" \
+ --lr 0.0001 --min_lr 1e-06 --warmup_epochs 1 --epochs 10 --batch_size 4 --accum_iter 4 \
+ --save_freq 1 --keep_freq 5 --eval_freq 1 \
+ --output_dir "checkpoints/mast3r_demo"
+
+```
+
+### Our Hyperparameters
+We didn't release all the training datasets, but here are the commands we used for training our models:
+
+```bash
+# MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric - train mast3r with metric regression and matching loss
+# we used cosxl to generate variations of DL3DV: "foggy", "night", "rainy", "snow", "sunny" but we were not convinced by it.
+
+torchrun --nproc_per_node=8 train.py \
+ --train_dataset "57_000 @ Habitat512(1_000_000, split='train', resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], aug_crop='auto', aug_monocular=0.005, transform=ColorJitter, n_corres=8192, nneg=0.5) + 68_400 @ BlendedMVS(split='train', mask_sky=True, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], aug_crop='auto', aug_monocular=0.005, transform=ColorJitter, n_corres=8192, nneg=0.5) + 68_400 @ MegaDepth(split='train', mask_sky=True, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], aug_crop='auto', aug_monocular=0.005, transform=ColorJitter, n_corres=8192, nneg=0.5) + 45_600 @ ARKitScenes(split='train', resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], aug_crop='auto', aug_monocular=0.005, transform=ColorJitter, n_corres=8192, nneg=0.5) + 22_800 @ Co3d(split='train', mask_bg='rand', resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], aug_crop='auto', aug_monocular=0.005, transform=ColorJitter, n_corres=8192, nneg=0.5) + 22_800 @ StaticThings3D(mask_bg='rand', resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], aug_crop='auto', aug_monocular=0.005, transform=ColorJitter, n_corres=8192, nneg=0.5) + 45_600 @ ScanNetpp(split='train', resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], aug_crop='auto', aug_monocular=0.005, transform=ColorJitter, n_corres=8192, nneg=0.5) + 45_600 @ TartanAir(pairs_subset='', resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], aug_crop='auto', aug_monocular=0.005, transform=ColorJitter, n_corres=8192, nneg=0.5) + 4_560 @ UnrealStereo4K(resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], aug_crop='auto', aug_monocular=0.005, transform=ColorJitter, n_corres=8192, nneg=0.5) + 1_140 @ VirtualKitti(optical_center_is_centered=True, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], aug_crop='auto', aug_monocular=0.005, transform=ColorJitter, n_corres=8192, nneg=0.5) + 22_800 @ WildRgbd(split='train', mask_bg='rand', resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], aug_crop='auto', aug_monocular=0.005, transform=ColorJitter, n_corres=8192, nneg=0.5) + 145_920 @ NianticMapFree(split='train', resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], aug_crop='auto', aug_monocular=0.005, transform=ColorJitter, n_corres=8192, nneg=0.5) + 57_000 @ DL3DV(split='nlight', resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], aug_crop='auto', aug_monocular=0.005, transform=ColorJitter, n_corres=8192, nneg=0.5) + 57_000 @ DL3DV(split='not-nlight', cosxl_augmentations=None, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], aug_crop='auto', aug_monocular=0.005, transform=ColorJitter, n_corres=8192, nneg=0.5) + 34_200 @ InternalUnreleasedDataset(resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], aug_crop='auto', aug_monocular=0.005, transform=ColorJitter, n_corres=8192, nneg=0.5)" \
+ --test_dataset "Habitat512(1_000, split='val', resolution=(512,384), seed=777, n_corres=1024) + 1_000 @ BlendedMVS(split='val', resolution=(512,384), mask_sky=True, seed=777, n_corres=1024) + 1_000 @ ARKitScenes(split='test', resolution=(512,384), seed=777, n_corres=1024) + 1_000 @ MegaDepth(split='val', mask_sky=True, resolution=(512,336), seed=777, n_corres=1024) + 1_000 @ Co3d(split='test', resolution=(512,384), mask_bg='rand', seed=777, n_corres=1024)" \
+ --model "AsymmetricMASt3R(pos_embed='RoPE100', patch_embed_cls='ManyAR_PatchEmbed', img_size=(512, 512), head_type='catmlp+dpt', output_mode='pts3d+desc24', depth_mode=('exp', -inf, inf), conf_mode=('exp', 1, inf), enc_embed_dim=1024, enc_depth=24, enc_num_heads=16, dec_embed_dim=768, dec_depth=12, dec_num_heads=12, two_confs=True, desc_conf_mode=('exp', 0, inf))" \
+ --train_criterion "ConfLoss(Regr3D(L21, norm_mode='?avg_dis'), alpha=0.2, loss_in_log=False) + 0.075*ConfMatchingLoss(MatchingLoss(InfoNCE(mode='proper', temperature=0.05), negatives_padding=0, blocksize=8192), alpha=10.0, confmode='mean')" \
+ --test_criterion "Regr3D(L21, norm_mode='?avg_dis', gt_scale=True, sky_loss_value=0) + -1.*MatchingLoss(APLoss(nq='torch', fp=torch.float16), negatives_padding=12288)" \
+ --pretrained "checkpoints/DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth" \
+ --lr 0.0001 --min_lr 1e-06 --warmup_epochs 8 --epochs 50 --batch_size 4 --accum_iter 2 \
+ --save_freq 1 --keep_freq 5 --eval_freq 1 --print_freq=10 \
+ --output_dir "checkpoints/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric"
+
+```
+
+## Visual Localization
+
+### Dataset preparation
+
+See [Visloc section in DUSt3R](https://github.com/naver/dust3r/tree/dust3r_visloc#dataset-preparation)
+
+### Example Commands
+
+With `visloc.py` you can run our visual localization experiments on Aachen-Day-Night, InLoc, Cambridge Landmarks and 7 Scenes.
+
+
+```bash
+# Aachen-Day-Night-v1.1:
+# scene in 'day' 'night'
+# scene can also be 'all'
+python3 visloc.py --model_name MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric --dataset "VislocAachenDayNight('/path/to/prepared/Aachen-Day-Night-v1.1/', subscene='${scene}', pairsfile='fire_top50', topk=20)" --pixel_tol 5 --pnp_mode poselib --reprojection_error_diag_ratio 0.008 --output_dir /path/to/output/Aachen-Day-Night-v1.1/${scene}/loc
+
+# or with coarse to fine:
+
+python3 visloc.py --model_name MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric --dataset "VislocAachenDayNight('/path/to/prepared/Aachen-Day-Night-v1.1/', subscene='${scene}', pairsfile='fire_top50', topk=20)" --pixel_tol 5 --pnp_mode poselib --reprojection_error_diag_ratio 0.008 --output_dir /path/to/output/Aachen-Day-Night-v1.1/${scene}/loc --coarse_to_fine --max_batch_size 48 --c2f_crop_with_homography
+
+# InLoc
+python3 visloc.py --model_name MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric --dataset "VislocInLoc('/path/to/prepared/InLoc/', pairsfile='pairs-query-netvlad40-temporal', topk=20)" --pixel_tol 5 --pnp_mode poselib --reprojection_error_diag_ratio 0.008 --output_dir /path/to/output/InLoc/loc
+
+# or with coarse to fine:
+
+python3 visloc.py --model_name MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric --dataset "VislocInLoc('/path/to/prepared/InLoc/', pairsfile='pairs-query-netvlad40-temporal', topk=20)" --pixel_tol 5 --pnp_mode poselib --reprojection_error_diag_ratio 0.008 --output_dir /path/to/output/InLoc/loc --coarse_to_fine --max_image_size 1200 --max_batch_size 48 --c2f_crop_with_homography
+
+# 7-scenes:
+# scene in 'chess' 'fire' 'heads' 'office' 'pumpkin' 'redkitchen' 'stairs'
+python3 visloc.py --model_name MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric --dataset "VislocSevenScenes('/path/to/prepared/7-scenes/', subscene='${scene}', pairsfile='APGeM-LM18_top20', topk=1)" --pixel_tol 5 --pnp_mode poselib --reprojection_error_diag_ratio 0.008 --output_dir /path/to/output/7-scenes/${scene}/loc
+
+# Cambridge Landmarks:
+# scene in 'ShopFacade' 'GreatCourt' 'KingsCollege' 'OldHospital' 'StMarysChurch'
+python3 visloc.py --model_name MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric --dataset "VislocCambridgeLandmarks('/path/to/prepared/Cambridge_Landmarks/', subscene='${scene}', pairsfile='APGeM-LM18_top20', topk=1)" --pixel_tol 5 --pnp_mode poselib --reprojection_error_diag_ratio 0.008 --output_dir /path/to/output/Cambridge_Landmarks/${scene}/loc
+
+```
diff --git a/third_party/mast3r/assets/NLE_tower/01D90321-69C8-439F-B0B0-E87E7634741C-83120-000041DAE419D7AE.jpg b/third_party/mast3r/assets/NLE_tower/01D90321-69C8-439F-B0B0-E87E7634741C-83120-000041DAE419D7AE.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..6df8360463bcf59f07bc88c6518fdfbfefac516e
--- /dev/null
+++ b/third_party/mast3r/assets/NLE_tower/01D90321-69C8-439F-B0B0-E87E7634741C-83120-000041DAE419D7AE.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:09facfdf35aa2f047104108d5c72a2a280df5621c22630774629d8ad1000ab22
+size 240860
diff --git a/third_party/mast3r/assets/NLE_tower/1AD85EF5-B651-4291-A5C0-7BDB7D966384-83120-000041DADF639E09.jpg b/third_party/mast3r/assets/NLE_tower/1AD85EF5-B651-4291-A5C0-7BDB7D966384-83120-000041DADF639E09.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..bcbf3e0ceadc0a9cf14e0812bdf86db11113ac36
--- /dev/null
+++ b/third_party/mast3r/assets/NLE_tower/1AD85EF5-B651-4291-A5C0-7BDB7D966384-83120-000041DADF639E09.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:757ce3334fe5384fe5ea894ae57517c5c5c0fff70dca035b5e32986945d37658
+size 340823
diff --git a/third_party/mast3r/assets/NLE_tower/2679C386-1DC0-4443-81B5-93D7EDE4AB37-83120-000041DADB2EA917.jpg b/third_party/mast3r/assets/NLE_tower/2679C386-1DC0-4443-81B5-93D7EDE4AB37-83120-000041DADB2EA917.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..64682728eeff8a242e9c685de31a6e19516a157d
--- /dev/null
+++ b/third_party/mast3r/assets/NLE_tower/2679C386-1DC0-4443-81B5-93D7EDE4AB37-83120-000041DADB2EA917.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cbb0484bdd923843030acbf9c4e7f0b54b22ede657ce9f11afeb413bb680c417
+size 374313
diff --git a/third_party/mast3r/assets/NLE_tower/28EDBB63-B9F9-42FB-AC86-4852A33ED71B-83120-000041DAF22407A1.jpg b/third_party/mast3r/assets/NLE_tower/28EDBB63-B9F9-42FB-AC86-4852A33ED71B-83120-000041DAF22407A1.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..33f42d3af3f46184a6d9315361bcc54d3c23fd2c
--- /dev/null
+++ b/third_party/mast3r/assets/NLE_tower/28EDBB63-B9F9-42FB-AC86-4852A33ED71B-83120-000041DAF22407A1.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e0615b731161063e5fdcbbc8ed5fba30ec27bb0866bba2d364285ecc2c331037
+size 329199
diff --git a/third_party/mast3r/assets/NLE_tower/91E9B685-7A7D-42D7-B933-23A800EE4129-83120-000041DAE12C8176.jpg b/third_party/mast3r/assets/NLE_tower/91E9B685-7A7D-42D7-B933-23A800EE4129-83120-000041DAE12C8176.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..700aa9f7e2981206d2852e031169a4d80c2c3c6d
--- /dev/null
+++ b/third_party/mast3r/assets/NLE_tower/91E9B685-7A7D-42D7-B933-23A800EE4129-83120-000041DAE12C8176.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3df9a79b7256f7cb51f713af1115a2ea04a32963ad697eb741cb0b916b6ae706
+size 474174
diff --git a/third_party/mast3r/assets/NLE_tower/CDBBD885-54C3-4EB4-9181-226059A60EE0-83120-000041DAE0C3D612.jpg b/third_party/mast3r/assets/NLE_tower/CDBBD885-54C3-4EB4-9181-226059A60EE0-83120-000041DAE0C3D612.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..04fff2f1d571bc00c9321a7ea5c731f31513958c
--- /dev/null
+++ b/third_party/mast3r/assets/NLE_tower/CDBBD885-54C3-4EB4-9181-226059A60EE0-83120-000041DAE0C3D612.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3d41a16810a3b59389dd7d8291341a3bdf037a8cc8a9d9a7ac75694517a0bd5c
+size 503837
diff --git a/third_party/mast3r/assets/NLE_tower/FF5599FD-768B-431A-AB83-BDA5FB44CB9D-83120-000041DADDE35483.jpg b/third_party/mast3r/assets/NLE_tower/FF5599FD-768B-431A-AB83-BDA5FB44CB9D-83120-000041DADDE35483.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..a3ae0ad2e43a29fa2637d4e662ffc688f7238002
--- /dev/null
+++ b/third_party/mast3r/assets/NLE_tower/FF5599FD-768B-431A-AB83-BDA5FB44CB9D-83120-000041DADDE35483.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c060178a3e35c3bdfe92003e45c2e450e7a24932e4c46ddf12cc221b64c90159
+size 288082
diff --git a/third_party/mast3r/assets/demo.jpg b/third_party/mast3r/assets/demo.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..7072dc4cc1983091d3044b3e239b15d45eca3f47
--- /dev/null
+++ b/third_party/mast3r/assets/demo.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b1b3a31c5f8933657bfd89d842da36f9cf554b33e6aae9fe43b69a48619d3d2
+size 154962
diff --git a/third_party/mast3r/assets/examples.jpg b/third_party/mast3r/assets/examples.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..853508c3b08b339dc20e72ef91d6eea693065352
--- /dev/null
+++ b/third_party/mast3r/assets/examples.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1daa4e793c955873d61099f2c242ff32a4c8385c06c7f914f87d21deeed6db9d
+size 665910
diff --git a/third_party/mast3r/assets/mast3r.jpg b/third_party/mast3r/assets/mast3r.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..0cbf418cbd292f2e8cf9a723218a7f83d785d25b
--- /dev/null
+++ b/third_party/mast3r/assets/mast3r.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aa7955b03beb2e0cf613f24a3aef06e9d77cecd7f56fd89466187ce1e25f21f4
+size 29211
diff --git a/third_party/mast3r/assets/mast3r_archi.jpg b/third_party/mast3r/assets/mast3r_archi.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..be2a5ee8f4bced213ef46292b1b76264fc2ed99c
--- /dev/null
+++ b/third_party/mast3r/assets/mast3r_archi.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c833fc9666b8944d4dcc2f386b796a924f11213803d89cd29d69d9a6d78ffbe6
+size 90691
diff --git a/third_party/mast3r/assets/matching.jpg b/third_party/mast3r/assets/matching.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..dc6cd0fa805044f6c5a2737485f89f2e1a0ce28f
--- /dev/null
+++ b/third_party/mast3r/assets/matching.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b6d160908cc53674d1d3ecb30e2670e30b42fed91fb553c6058bc58009ee095f
+size 170508
diff --git a/third_party/mast3r/demo.py b/third_party/mast3r/demo.py
new file mode 100644
index 0000000000000000000000000000000000000000..94459191de6404e0c036ac7b6529755ede16faad
--- /dev/null
+++ b/third_party/mast3r/demo.py
@@ -0,0 +1,290 @@
+#!/usr/bin/env python3
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# gradio demo
+# --------------------------------------------------------
+import math
+import gradio
+import os
+import torch
+import numpy as np
+import tempfile
+import functools
+import trimesh
+import copy
+from scipy.spatial.transform import Rotation
+
+from mast3r.cloud_opt.sparse_ga import sparse_global_alignment
+from mast3r.cloud_opt.tsdf_optimizer import TSDFPostProcess
+
+from mast3r.model import AsymmetricMASt3R
+from mast3r.utils.misc import hash_md5
+import mast3r.utils.path_to_dust3r # noqa
+from dust3r.image_pairs import make_pairs
+from dust3r.utils.image import load_images
+from dust3r.utils.device import to_numpy
+from dust3r.viz import add_scene_cam, CAM_COLORS, OPENGL, pts3d_to_trimesh, cat_meshes
+from dust3r.demo import get_args_parser as dust3r_get_args_parser
+
+import matplotlib.pyplot as pl
+pl.ion()
+
+torch.backends.cuda.matmul.allow_tf32 = True # for gpu >= Ampere and pytorch >= 1.12
+batch_size = 1
+
+
+def get_args_parser():
+ parser = dust3r_get_args_parser()
+ parser.add_argument('--share', action='store_true')
+
+ actions = parser._actions
+ for action in actions:
+ if action.dest == 'model_name':
+ action.choices = ["MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric"]
+ # change defaults
+ parser.prog = 'mast3r demo'
+ return parser
+
+
+def _convert_scene_output_to_glb(outdir, imgs, pts3d, mask, focals, cams2world, cam_size=0.05,
+ cam_color=None, as_pointcloud=False,
+ transparent_cams=False, silent=False):
+ assert len(pts3d) == len(mask) <= len(imgs) <= len(cams2world) == len(focals)
+ pts3d = to_numpy(pts3d)
+ imgs = to_numpy(imgs)
+ focals = to_numpy(focals)
+ cams2world = to_numpy(cams2world)
+
+ scene = trimesh.Scene()
+
+ # full pointcloud
+ if as_pointcloud:
+ pts = np.concatenate([p[m.ravel()] for p, m in zip(pts3d, mask)])
+ col = np.concatenate([p[m] for p, m in zip(imgs, mask)])
+ pct = trimesh.PointCloud(pts.reshape(-1, 3), colors=col.reshape(-1, 3))
+ scene.add_geometry(pct)
+ else:
+ meshes = []
+ for i in range(len(imgs)):
+ meshes.append(pts3d_to_trimesh(imgs[i], pts3d[i].reshape(imgs[i].shape), mask[i]))
+ mesh = trimesh.Trimesh(**cat_meshes(meshes))
+ scene.add_geometry(mesh)
+
+ # add each camera
+ for i, pose_c2w in enumerate(cams2world):
+ if isinstance(cam_color, list):
+ camera_edge_color = cam_color[i]
+ else:
+ camera_edge_color = cam_color or CAM_COLORS[i % len(CAM_COLORS)]
+ add_scene_cam(scene, pose_c2w, camera_edge_color,
+ None if transparent_cams else imgs[i], focals[i],
+ imsize=imgs[i].shape[1::-1], screen_width=cam_size)
+
+ rot = np.eye(4)
+ rot[:3, :3] = Rotation.from_euler('y', np.deg2rad(180)).as_matrix()
+ scene.apply_transform(np.linalg.inv(cams2world[0] @ OPENGL @ rot))
+ outfile = os.path.join(outdir, 'scene.glb')
+ if not silent:
+ print('(exporting 3D scene to', outfile, ')')
+ scene.export(file_obj=outfile)
+ return outfile
+
+
+def get_3D_model_from_scene(outdir, silent, scene, min_conf_thr=2, as_pointcloud=False, mask_sky=False,
+ clean_depth=False, transparent_cams=False, cam_size=0.05, TSDF_thresh=0):
+ """
+ extract 3D_model (glb file) from a reconstructed scene
+ """
+ if scene is None:
+ return None
+
+ # get optimized values from scene
+ rgbimg = scene.imgs
+ focals = scene.get_focals().cpu()
+ cams2world = scene.get_im_poses().cpu()
+
+ # 3D pointcloud from depthmap, poses and intrinsics
+ if TSDF_thresh > 0:
+ tsdf = TSDFPostProcess(scene, TSDF_thresh=TSDF_thresh)
+ pts3d, _, confs = to_numpy(tsdf.get_dense_pts3d(clean_depth=clean_depth))
+ else:
+ pts3d, _, confs = to_numpy(scene.get_dense_pts3d(clean_depth=clean_depth))
+ msk = to_numpy([c > min_conf_thr for c in confs])
+ return _convert_scene_output_to_glb(outdir, rgbimg, pts3d, msk, focals, cams2world, as_pointcloud=as_pointcloud,
+ transparent_cams=transparent_cams, cam_size=cam_size, silent=silent)
+
+
+def get_reconstructed_scene(outdir, model, device, silent, image_size, filelist, optim_level, lr1, niter1, lr2, niter2, min_conf_thr,
+ as_pointcloud, mask_sky, clean_depth, transparent_cams, cam_size,
+ scenegraph_type, winsize, refid, TSDF_thresh, **kw):
+ """
+ from a list of images, run mast3r inference, sparse global aligner.
+ then run get_3D_model_from_scene
+ """
+ imgs = load_images(filelist, size=image_size, verbose=not silent)
+ if len(imgs) == 1:
+ imgs = [imgs[0], copy.deepcopy(imgs[0])]
+ imgs[1]['idx'] = 1
+ filelist = [filelist[0], filelist[0] + '_2']
+ if scenegraph_type == "swin":
+ scenegraph_type = scenegraph_type + "-" + str(winsize)
+ elif scenegraph_type == "oneref":
+ scenegraph_type = scenegraph_type + "-" + str(refid)
+
+ pairs = make_pairs(imgs, scene_graph=scenegraph_type, prefilter=None, symmetrize=True)
+ if optim_level == 'coarse':
+ niter2 = 0
+ # Sparse GA (forward mast3r -> matching -> 3D optim -> 2D refinement -> triangulation)
+ scene = sparse_global_alignment(filelist, pairs, os.path.join(outdir, 'cache'),
+ model, lr1=lr1, niter1=niter1, lr2=lr2, niter2=niter2, device=device,
+ opt_depth='depth' in optim_level, **kw)
+ outfile = get_3D_model_from_scene(outdir, silent, scene, min_conf_thr, as_pointcloud, mask_sky,
+ clean_depth, transparent_cams, cam_size, TSDF_thresh)
+ return scene, outfile
+
+
+def set_scenegraph_options(inputfiles, winsize, refid, scenegraph_type):
+ num_files = len(inputfiles) if inputfiles is not None else 1
+ max_winsize = max(1, math.ceil((num_files - 1) / 2))
+ if scenegraph_type == "swin":
+ winsize = gradio.Slider(label="Scene Graph: Window Size", value=max_winsize,
+ minimum=1, maximum=max_winsize, step=1, visible=True)
+ refid = gradio.Slider(label="Scene Graph: Id", value=0, minimum=0,
+ maximum=num_files - 1, step=1, visible=False)
+ elif scenegraph_type == "oneref":
+ winsize = gradio.Slider(label="Scene Graph: Window Size", value=max_winsize,
+ minimum=1, maximum=max_winsize, step=1, visible=False)
+ refid = gradio.Slider(label="Scene Graph: Id", value=0, minimum=0,
+ maximum=num_files - 1, step=1, visible=True)
+ else:
+ winsize = gradio.Slider(label="Scene Graph: Window Size", value=max_winsize,
+ minimum=1, maximum=max_winsize, step=1, visible=False)
+ refid = gradio.Slider(label="Scene Graph: Id", value=0, minimum=0,
+ maximum=num_files - 1, step=1, visible=False)
+ return winsize, refid
+
+
+def main_demo(tmpdirname, model, device, image_size, server_name, server_port, silent=False, share=False):
+ if not silent:
+ print('Outputing stuff in', tmpdirname)
+
+ recon_fun = functools.partial(get_reconstructed_scene, tmpdirname, model, device, silent, image_size)
+ model_from_scene_fun = functools.partial(get_3D_model_from_scene, tmpdirname, silent)
+ with gradio.Blocks(css=""".gradio-container {margin: 0 !important; min-width: 100%};""", title="MASt3R Demo") as demo:
+ # scene state is save so that you can change conf_thr, cam_size... without rerunning the inference
+ scene = gradio.State(None)
+ gradio.HTML('
MASt3R Demo
')
+ with gradio.Column():
+ inputfiles = gradio.File(file_count="multiple")
+ with gradio.Row():
+ lr1 = gradio.Slider(label="Coarse LR", value=0.07, minimum=0.01, maximum=0.2, step=0.01)
+ niter1 = gradio.Number(value=200, precision=0, minimum=0, maximum=10_000,
+ label="num_iterations", info="For coarse alignment!")
+ lr2 = gradio.Slider(label="Fine LR", value=0.014, minimum=0.005, maximum=0.05, step=0.001)
+ niter2 = gradio.Number(value=500, precision=0, minimum=0, maximum=100_000,
+ label="num_iterations", info="For refinement!")
+ optim_level = gradio.Dropdown(["coarse", "refine", "refine+depth"],
+ value='refine', label="OptLevel",
+ info="Optimization level")
+
+ scenegraph_type = gradio.Dropdown(["complete", "swin", "oneref"],
+ value='complete', label="Scenegraph",
+ info="Define how to make pairs",
+ interactive=True)
+ winsize = gradio.Slider(label="Scene Graph: Window Size", value=1,
+ minimum=1, maximum=1, step=1, visible=False)
+ refid = gradio.Slider(label="Scene Graph: Id", value=0, minimum=0, maximum=0, step=1, visible=False)
+
+ run_btn = gradio.Button("Run")
+
+ with gradio.Row():
+ # adjust the confidence threshold
+ min_conf_thr = gradio.Slider(label="min_conf_thr", value=1.5, minimum=0.0, maximum=10, step=0.1)
+ # adjust the camera size in the output pointcloud
+ cam_size = gradio.Slider(label="cam_size", value=0.2, minimum=0.001, maximum=1.0, step=0.001)
+ TSDF_thresh = gradio.Slider(label="TSDF Threshold", value=0., minimum=0., maximum=1., step=0.01)
+ with gradio.Row():
+ as_pointcloud = gradio.Checkbox(value=True, label="As pointcloud")
+ # two post process implemented
+ mask_sky = gradio.Checkbox(value=False, label="Mask sky")
+ clean_depth = gradio.Checkbox(value=True, label="Clean-up depthmaps")
+ transparent_cams = gradio.Checkbox(value=False, label="Transparent cameras")
+
+ outmodel = gradio.Model3D()
+
+ # events
+ scenegraph_type.change(set_scenegraph_options,
+ inputs=[inputfiles, winsize, refid, scenegraph_type],
+ outputs=[winsize, refid])
+ inputfiles.change(set_scenegraph_options,
+ inputs=[inputfiles, winsize, refid, scenegraph_type],
+ outputs=[winsize, refid])
+ run_btn.click(fn=recon_fun,
+ inputs=[inputfiles, optim_level, lr1, niter1, lr2, niter2, min_conf_thr, as_pointcloud,
+ mask_sky, clean_depth, transparent_cams, cam_size,
+ scenegraph_type, winsize, refid, TSDF_thresh],
+ outputs=[scene, outmodel])
+ min_conf_thr.release(fn=model_from_scene_fun,
+ inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
+ clean_depth, transparent_cams, cam_size, TSDF_thresh],
+ outputs=outmodel)
+ cam_size.change(fn=model_from_scene_fun,
+ inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
+ clean_depth, transparent_cams, cam_size, TSDF_thresh],
+ outputs=outmodel)
+ TSDF_thresh.change(fn=model_from_scene_fun,
+ inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
+ clean_depth, transparent_cams, cam_size, TSDF_thresh],
+ outputs=outmodel)
+ as_pointcloud.change(fn=model_from_scene_fun,
+ inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
+ clean_depth, transparent_cams, cam_size, TSDF_thresh],
+ outputs=outmodel)
+ mask_sky.change(fn=model_from_scene_fun,
+ inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
+ clean_depth, transparent_cams, cam_size, TSDF_thresh],
+ outputs=outmodel)
+ clean_depth.change(fn=model_from_scene_fun,
+ inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
+ clean_depth, transparent_cams, cam_size, TSDF_thresh],
+ outputs=outmodel)
+ transparent_cams.change(model_from_scene_fun,
+ inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
+ clean_depth, transparent_cams, cam_size, TSDF_thresh],
+ outputs=outmodel)
+ demo.launch(share=False, server_name=server_name, server_port=server_port)
+
+
+if __name__ == '__main__':
+ parser = get_args_parser()
+ args = parser.parse_args()
+
+ if args.server_name is not None:
+ server_name = args.server_name
+ else:
+ server_name = '0.0.0.0' if args.local_network else '127.0.0.1'
+
+ if args.weights is not None:
+ weights_path = args.weights
+ else:
+ weights_path = "naver/" + args.model_name
+
+ model = AsymmetricMASt3R.from_pretrained(weights_path).to(args.device)
+ chkpt_tag = hash_md5(weights_path)
+
+ # mast3r will write the 3D model inside tmpdirname/chkpt_tag
+ if args.tmp_dir is not None:
+ tmpdirname = args.tmp_dir
+ cache_path = os.path.join(tmpdirname, chkpt_tag)
+ os.makedirs(cache_path, exist_ok=True)
+ main_demo(cache_path, model, args.device, args.image_size, server_name, args.server_port, silent=args.silent,
+ share=args.share)
+ else:
+ with tempfile.TemporaryDirectory(suffix='_mast3r_gradio_demo') as tmpdirname:
+ cache_path = os.path.join(tmpdirname, chkpt_tag)
+ os.makedirs(cache_path, exist_ok=True)
+ main_demo(tmpdirname, model, args.device, args.image_size,
+ server_name, args.server_port, silent=args.silent,
+ share=args.share)
diff --git a/third_party/mast3r/demo_dust3r_ga.py b/third_party/mast3r/demo_dust3r_ga.py
new file mode 100644
index 0000000000000000000000000000000000000000..31d5be0501949e2393ae389d2fe7ac16cf3651dc
--- /dev/null
+++ b/third_party/mast3r/demo_dust3r_ga.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python3
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# mast3r gradio demo executable
+# --------------------------------------------------------
+import os
+import torch
+import tempfile
+
+import mast3r.utils.path_to_dust3r # noqa
+from dust3r.model import AsymmetricCroCo3DStereo
+from mast3r.model import AsymmetricMASt3R
+from dust3r.demo import get_args_parser as dust3r_get_args_parser
+from dust3r.demo import main_demo
+
+import matplotlib.pyplot as pl
+pl.ion()
+
+torch.backends.cuda.matmul.allow_tf32 = True # for gpu >= Ampere and pytorch >= 1.12
+
+
+def get_args_parser():
+ parser = dust3r_get_args_parser()
+
+ actions = parser._actions
+ for action in actions:
+ if action.dest == 'model_name':
+ action.choices.append('MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric')
+ # change defaults
+ parser.prog = 'mast3r demo'
+ return parser
+
+
+if __name__ == '__main__':
+ parser = get_args_parser()
+ args = parser.parse_args()
+
+ if args.tmp_dir is not None:
+ tmp_path = args.tmp_dir
+ os.makedirs(tmp_path, exist_ok=True)
+ tempfile.tempdir = tmp_path
+
+ if args.server_name is not None:
+ server_name = args.server_name
+ else:
+ server_name = '0.0.0.0' if args.local_network else '127.0.0.1'
+
+ if args.weights is not None:
+ weights_path = args.weights
+ else:
+ weights_path = "naver/" + args.model_name
+
+ try:
+ model = AsymmetricMASt3R.from_pretrained(weights_path).to(args.device)
+ except Exception as e:
+ model = AsymmetricCroCo3DStereo.from_pretrained(weights_path).to(args.device)
+
+ # dust3r will write the 3D model inside tmpdirname
+ with tempfile.TemporaryDirectory(suffix='dust3r_gradio_demo') as tmpdirname:
+ if not args.silent:
+ print('Outputing stuff in', tmpdirname)
+ main_demo(tmpdirname, model, args.device, args.image_size, server_name, args.server_port, silent=args.silent)
diff --git a/third_party/mast3r/dust3r/.gitignore b/third_party/mast3r/dust3r/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..194e236cbd708160926c3513b4232285eb47b029
--- /dev/null
+++ b/third_party/mast3r/dust3r/.gitignore
@@ -0,0 +1,132 @@
+data/
+checkpoints/
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
diff --git a/third_party/mast3r/dust3r/.gitmodules b/third_party/mast3r/dust3r/.gitmodules
new file mode 100644
index 0000000000000000000000000000000000000000..c950ef981a8d2e47599dd7acbbe1bf8de9a42aca
--- /dev/null
+++ b/third_party/mast3r/dust3r/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "croco"]
+ path = croco
+ url = https://github.com/naver/croco
diff --git a/third_party/mast3r/dust3r/LICENSE b/third_party/mast3r/dust3r/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..a97986e3a8ddd49973959f6c748dfa8b881b64d3
--- /dev/null
+++ b/third_party/mast3r/dust3r/LICENSE
@@ -0,0 +1,7 @@
+DUSt3R, Copyright (c) 2024-present Naver Corporation, is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 license.
+
+A summary of the CC BY-NC-SA 4.0 license is located here:
+ https://creativecommons.org/licenses/by-nc-sa/4.0/
+
+The CC BY-NC-SA 4.0 license is located here:
+ https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode
diff --git a/third_party/mast3r/dust3r/NOTICE b/third_party/mast3r/dust3r/NOTICE
new file mode 100644
index 0000000000000000000000000000000000000000..81da544dd534c5465361f35cf6a5a0cfff7c1d3f
--- /dev/null
+++ b/third_party/mast3r/dust3r/NOTICE
@@ -0,0 +1,12 @@
+DUSt3R
+Copyright 2024-present NAVER Corp.
+
+This project contains subcomponents with separate copyright notices and license terms.
+Your use of the source code for these subcomponents is subject to the terms and conditions of the following licenses.
+
+====
+
+naver/croco
+https://github.com/naver/croco/
+
+Creative Commons Attribution-NonCommercial-ShareAlike 4.0
diff --git a/third_party/mast3r/dust3r/README.md b/third_party/mast3r/dust3r/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..6df7772a18830d1249d1f9992cf6c28e3e794993
--- /dev/null
+++ b/third_party/mast3r/dust3r/README.md
@@ -0,0 +1,388 @@
+![demo](assets/dust3r.jpg)
+
+Official implementation of `DUSt3R: Geometric 3D Vision Made Easy`
+[[Project page](https://dust3r.europe.naverlabs.com/)], [[DUSt3R arxiv](https://arxiv.org/abs/2312.14132)]
+
+![Example of reconstruction from two images](assets/pipeline1.jpg)
+
+![High level overview of DUSt3R capabilities](assets/dust3r_archi.jpg)
+
+```bibtex
+@inproceedings{dust3r_cvpr24,
+ title={DUSt3R: Geometric 3D Vision Made Easy},
+ author={Shuzhe Wang and Vincent Leroy and Yohann Cabon and Boris Chidlovskii and Jerome Revaud},
+ booktitle = {CVPR},
+ year = {2024}
+}
+
+@misc{dust3r_arxiv23,
+ title={DUSt3R: Geometric 3D Vision Made Easy},
+ author={Shuzhe Wang and Vincent Leroy and Yohann Cabon and Boris Chidlovskii and Jerome Revaud},
+ year={2023},
+ eprint={2312.14132},
+ archivePrefix={arXiv},
+ primaryClass={cs.CV}
+}
+```
+
+## Table of Contents
+
+- [Table of Contents](#table-of-contents)
+- [License](#license)
+- [Get Started](#get-started)
+ - [Installation](#installation)
+ - [Checkpoints](#checkpoints)
+ - [Interactive demo](#interactive-demo)
+ - [Interactive demo with docker](#interactive-demo-with-docker)
+- [Usage](#usage)
+- [Training](#training)
+ - [Datasets](#datasets)
+ - [Demo](#demo)
+ - [Our Hyperparameters](#our-hyperparameters)
+
+## License
+
+The code is distributed under the CC BY-NC-SA 4.0 License.
+See [LICENSE](LICENSE) for more information.
+
+```python
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+```
+
+## Get Started
+
+### Installation
+
+1. Clone DUSt3R.
+```bash
+git clone --recursive https://github.com/naver/dust3r
+cd dust3r
+# if you have already cloned dust3r:
+# git submodule update --init --recursive
+```
+
+2. Create the environment, here we show an example using conda.
+```bash
+conda create -n dust3r python=3.11 cmake=3.14.0
+conda activate dust3r
+conda install pytorch torchvision pytorch-cuda=12.1 -c pytorch -c nvidia # use the correct version of cuda for your system
+pip install -r requirements.txt
+# Optional: you can also install additional packages to:
+# - add support for HEIC images
+# - add pyrender, used to render depthmap in some datasets preprocessing
+# - add required packages for visloc.py
+pip install -r requirements_optional.txt
+```
+
+3. Optional, compile the cuda kernels for RoPE (as in CroCo v2).
+```bash
+# DUST3R relies on RoPE positional embeddings for which you can compile some cuda kernels for faster runtime.
+cd croco/models/curope/
+python setup.py build_ext --inplace
+cd ../../../
+```
+
+### Checkpoints
+
+You can obtain the checkpoints by two ways:
+
+1) You can use our huggingface_hub integration: the models will be downloaded automatically.
+
+2) Otherwise, We provide several pre-trained models:
+
+| Modelname | Training resolutions | Head | Encoder | Decoder |
+|-------------|----------------------|------|---------|---------|
+| [`DUSt3R_ViTLarge_BaseDecoder_224_linear.pth`](https://download.europe.naverlabs.com/ComputerVision/DUSt3R/DUSt3R_ViTLarge_BaseDecoder_224_linear.pth) | 224x224 | Linear | ViT-L | ViT-B |
+| [`DUSt3R_ViTLarge_BaseDecoder_512_linear.pth`](https://download.europe.naverlabs.com/ComputerVision/DUSt3R/DUSt3R_ViTLarge_BaseDecoder_512_linear.pth) | 512x384, 512x336, 512x288, 512x256, 512x160 | Linear | ViT-L | ViT-B |
+| [`DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth`](https://download.europe.naverlabs.com/ComputerVision/DUSt3R/DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth) | 512x384, 512x336, 512x288, 512x256, 512x160 | DPT | ViT-L | ViT-B |
+
+You can check the hyperparameters we used to train these models in the [section: Our Hyperparameters](#our-hyperparameters)
+
+To download a specific model, for example `DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth`:
+```bash
+mkdir -p checkpoints/
+wget https://download.europe.naverlabs.com/ComputerVision/DUSt3R/DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth -P checkpoints/
+```
+
+For the checkpoints, make sure to agree to the license of all the public training datasets and base checkpoints we used, in addition to CC-BY-NC-SA 4.0. Again, see [section: Our Hyperparameters](#our-hyperparameters) for details.
+
+### Interactive demo
+
+In this demo, you should be able run DUSt3R on your machine to reconstruct a scene.
+First select images that depicts the same scene.
+
+You can adjust the global alignment schedule and its number of iterations.
+
+> [!NOTE]
+> If you selected one or two images, the global alignment procedure will be skipped (mode=GlobalAlignerMode.PairViewer)
+
+Hit "Run" and wait.
+When the global alignment ends, the reconstruction appears.
+Use the slider "min_conf_thr" to show or remove low confidence areas.
+
+```bash
+python3 demo.py --model_name DUSt3R_ViTLarge_BaseDecoder_512_dpt
+
+# Use --weights to load a checkpoint from a local file, eg --weights checkpoints/DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth
+# Use --image_size to select the correct resolution for the selected checkpoint. 512 (default) or 224
+# Use --local_network to make it accessible on the local network, or --server_name to specify the url manually
+# Use --server_port to change the port, by default it will search for an available port starting at 7860
+# Use --device to use a different device, by default it's "cuda"
+```
+
+### Interactive demo with docker
+
+To run DUSt3R using Docker, including with NVIDIA CUDA support, follow these instructions:
+
+1. **Install Docker**: If not already installed, download and install `docker` and `docker compose` from the [Docker website](https://www.docker.com/get-started).
+
+2. **Install NVIDIA Docker Toolkit**: For GPU support, install the NVIDIA Docker toolkit from the [Nvidia website](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html).
+
+3. **Build the Docker image and run it**: `cd` into the `./docker` directory and run the following commands:
+
+```bash
+cd docker
+bash run.sh --with-cuda --model_name="DUSt3R_ViTLarge_BaseDecoder_512_dpt"
+```
+
+Or if you want to run the demo without CUDA support, run the following command:
+
+```bash
+cd docker
+bash run.sh --model_name="DUSt3R_ViTLarge_BaseDecoder_512_dpt"
+```
+
+By default, `demo.py` is lanched with the option `--local_network`.
+Visit `http://localhost:7860/` to access the web UI (or replace `localhost` with the machine's name to access it from the network).
+
+`run.sh` will launch docker-compose using either the [docker-compose-cuda.yml](docker/docker-compose-cuda.yml) or [docker-compose-cpu.ym](docker/docker-compose-cpu.yml) config file, then it starts the demo using [entrypoint.sh](docker/files/entrypoint.sh).
+
+
+![demo](assets/demo.jpg)
+
+## Usage
+
+```python
+from dust3r.inference import inference
+from dust3r.model import AsymmetricCroCo3DStereo
+from dust3r.utils.image import load_images
+from dust3r.image_pairs import make_pairs
+from dust3r.cloud_opt import global_aligner, GlobalAlignerMode
+
+if __name__ == '__main__':
+ device = 'cuda'
+ batch_size = 1
+ schedule = 'cosine'
+ lr = 0.01
+ niter = 300
+
+ model_name = "naver/DUSt3R_ViTLarge_BaseDecoder_512_dpt"
+ # you can put the path to a local checkpoint in model_name if needed
+ model = AsymmetricCroCo3DStereo.from_pretrained(model_name).to(device)
+ # load_images can take a list of images or a directory
+ images = load_images(['croco/assets/Chateau1.png', 'croco/assets/Chateau2.png'], size=512)
+ pairs = make_pairs(images, scene_graph='complete', prefilter=None, symmetrize=True)
+ output = inference(pairs, model, device, batch_size=batch_size)
+
+ # at this stage, you have the raw dust3r predictions
+ view1, pred1 = output['view1'], output['pred1']
+ view2, pred2 = output['view2'], output['pred2']
+ # here, view1, pred1, view2, pred2 are dicts of lists of len(2)
+ # -> because we symmetrize we have (im1, im2) and (im2, im1) pairs
+ # in each view you have:
+ # an integer image identifier: view1['idx'] and view2['idx']
+ # the img: view1['img'] and view2['img']
+ # the image shape: view1['true_shape'] and view2['true_shape']
+ # an instance string output by the dataloader: view1['instance'] and view2['instance']
+ # pred1 and pred2 contains the confidence values: pred1['conf'] and pred2['conf']
+ # pred1 contains 3D points for view1['img'] in view1['img'] space: pred1['pts3d']
+ # pred2 contains 3D points for view2['img'] in view1['img'] space: pred2['pts3d_in_other_view']
+
+ # next we'll use the global_aligner to align the predictions
+ # depending on your task, you may be fine with the raw output and not need it
+ # with only two input images, you could use GlobalAlignerMode.PairViewer: it would just convert the output
+ # if using GlobalAlignerMode.PairViewer, no need to run compute_global_alignment
+ scene = global_aligner(output, device=device, mode=GlobalAlignerMode.PointCloudOptimizer)
+ loss = scene.compute_global_alignment(init="mst", niter=niter, schedule=schedule, lr=lr)
+
+ # retrieve useful values from scene:
+ imgs = scene.imgs
+ focals = scene.get_focals()
+ poses = scene.get_im_poses()
+ pts3d = scene.get_pts3d()
+ confidence_masks = scene.get_masks()
+
+ # visualize reconstruction
+ scene.show()
+
+ # find 2D-2D matches between the two images
+ from dust3r.utils.geometry import find_reciprocal_matches, xy_grid
+ pts2d_list, pts3d_list = [], []
+ for i in range(2):
+ conf_i = confidence_masks[i].cpu().numpy()
+ pts2d_list.append(xy_grid(*imgs[i].shape[:2][::-1])[conf_i]) # imgs[i].shape[:2] = (H, W)
+ pts3d_list.append(pts3d[i].detach().cpu().numpy()[conf_i])
+ reciprocal_in_P2, nn2_in_P1, num_matches = find_reciprocal_matches(*pts3d_list)
+ print(f'found {num_matches} matches')
+ matches_im1 = pts2d_list[1][reciprocal_in_P2]
+ matches_im0 = pts2d_list[0][nn2_in_P1][reciprocal_in_P2]
+
+ # visualize a few matches
+ import numpy as np
+ from matplotlib import pyplot as pl
+ n_viz = 10
+ match_idx_to_viz = np.round(np.linspace(0, num_matches-1, n_viz)).astype(int)
+ viz_matches_im0, viz_matches_im1 = matches_im0[match_idx_to_viz], matches_im1[match_idx_to_viz]
+
+ H0, W0, H1, W1 = *imgs[0].shape[:2], *imgs[1].shape[:2]
+ img0 = np.pad(imgs[0], ((0, max(H1 - H0, 0)), (0, 0), (0, 0)), 'constant', constant_values=0)
+ img1 = np.pad(imgs[1], ((0, max(H0 - H1, 0)), (0, 0), (0, 0)), 'constant', constant_values=0)
+ img = np.concatenate((img0, img1), axis=1)
+ pl.figure()
+ pl.imshow(img)
+ cmap = pl.get_cmap('jet')
+ for i in range(n_viz):
+ (x0, y0), (x1, y1) = viz_matches_im0[i].T, viz_matches_im1[i].T
+ pl.plot([x0, x1 + W0], [y0, y1], '-+', color=cmap(i / (n_viz - 1)), scalex=False, scaley=False)
+ pl.show(block=True)
+
+```
+![matching example on croco pair](assets/matching.jpg)
+
+## Training
+
+In this section, we present a short demonstration to get started with training DUSt3R.
+
+### Datasets
+At this moment, we have added the following training datasets:
+ - [CO3Dv2](https://github.com/facebookresearch/co3d) - [Creative Commons Attribution-NonCommercial 4.0 International](https://github.com/facebookresearch/co3d/blob/main/LICENSE)
+ - [ARKitScenes](https://github.com/apple/ARKitScenes) - [Creative Commons Attribution-NonCommercial-ShareAlike 4.0](https://github.com/apple/ARKitScenes/tree/main?tab=readme-ov-file#license)
+ - [ScanNet++](https://kaldir.vc.in.tum.de/scannetpp/) - [non-commercial research and educational purposes](https://kaldir.vc.in.tum.de/scannetpp/static/scannetpp-terms-of-use.pdf)
+ - [BlendedMVS](https://github.com/YoYo000/BlendedMVS) - [Creative Commons Attribution 4.0 International License](https://creativecommons.org/licenses/by/4.0/)
+ - [WayMo Open dataset](https://github.com/waymo-research/waymo-open-dataset) - [Non-Commercial Use](https://waymo.com/open/terms/)
+ - [Habitat-Sim](https://github.com/facebookresearch/habitat-sim/blob/main/DATASETS.md)
+ - [MegaDepth](https://www.cs.cornell.edu/projects/megadepth/)
+ - [StaticThings3D](https://github.com/lmb-freiburg/robustmvd/blob/master/rmvd/data/README.md#staticthings3d)
+ - [WildRGB-D](https://github.com/wildrgbd/wildrgbd/)
+
+For each dataset, we provide a preprocessing script in the `datasets_preprocess` directory and an archive containing the list of pairs when needed.
+You have to download the datasets yourself from their official sources, agree to their license, download our list of pairs, and run the preprocessing script.
+
+Links:
+
+[ARKitScenes pairs](https://download.europe.naverlabs.com/ComputerVision/DUSt3R/arkitscenes_pairs.zip)
+[ScanNet++ pairs](https://download.europe.naverlabs.com/ComputerVision/DUSt3R/scannetpp_pairs.zip)
+[BlendedMVS pairs](https://download.europe.naverlabs.com/ComputerVision/DUSt3R/blendedmvs_pairs.npy)
+[WayMo Open dataset pairs](https://download.europe.naverlabs.com/ComputerVision/DUSt3R/waymo_pairs.npz)
+[Habitat metadata](https://download.europe.naverlabs.com/ComputerVision/DUSt3R/habitat_5views_v1_512x512_metadata.tar.gz)
+[MegaDepth pairs](https://download.europe.naverlabs.com/ComputerVision/DUSt3R/megadepth_pairs.npz)
+[StaticThings3D pairs](https://download.europe.naverlabs.com/ComputerVision/DUSt3R/staticthings_pairs.npy)
+
+> [!NOTE]
+> They are not strictly equivalent to what was used to train DUSt3R, but they should be close enough.
+
+### Demo
+For this training demo, we're going to download and prepare a subset of [CO3Dv2](https://github.com/facebookresearch/co3d) - [Creative Commons Attribution-NonCommercial 4.0 International](https://github.com/facebookresearch/co3d/blob/main/LICENSE) and launch the training code on it.
+The demo model will be trained for a few epochs on a very small dataset.
+It will not be very good.
+
+```bash
+# download and prepare the co3d subset
+mkdir -p data/co3d_subset
+cd data/co3d_subset
+git clone https://github.com/facebookresearch/co3d
+cd co3d
+python3 ./co3d/download_dataset.py --download_folder ../ --single_sequence_subset
+rm ../*.zip
+cd ../../..
+
+python3 datasets_preprocess/preprocess_co3d.py --co3d_dir data/co3d_subset --output_dir data/co3d_subset_processed --single_sequence_subset
+
+# download the pretrained croco v2 checkpoint
+mkdir -p checkpoints/
+wget https://download.europe.naverlabs.com/ComputerVision/CroCo/CroCo_V2_ViTLarge_BaseDecoder.pth -P checkpoints/
+
+# the training of dust3r is done in 3 steps.
+# for this example we'll do fewer epochs, for the actual hyperparameters we used in the paper, see the next section: "Our Hyperparameters"
+# step 1 - train dust3r for 224 resolution
+torchrun --nproc_per_node=4 train.py \
+ --train_dataset "1000 @ Co3d(split='train', ROOT='data/co3d_subset_processed', aug_crop=16, mask_bg='rand', resolution=224, transform=ColorJitter)" \
+ --test_dataset "100 @ Co3d(split='test', ROOT='data/co3d_subset_processed', resolution=224, seed=777)" \
+ --model "AsymmetricCroCo3DStereo(pos_embed='RoPE100', img_size=(224, 224), head_type='linear', output_mode='pts3d', depth_mode=('exp', -inf, inf), conf_mode=('exp', 1, inf), enc_embed_dim=1024, enc_depth=24, enc_num_heads=16, dec_embed_dim=768, dec_depth=12, dec_num_heads=12)" \
+ --train_criterion "ConfLoss(Regr3D(L21, norm_mode='avg_dis'), alpha=0.2)" \
+ --test_criterion "Regr3D_ScaleShiftInv(L21, gt_scale=True)" \
+ --pretrained "checkpoints/CroCo_V2_ViTLarge_BaseDecoder.pth" \
+ --lr 0.0001 --min_lr 1e-06 --warmup_epochs 1 --epochs 10 --batch_size 16 --accum_iter 1 \
+ --save_freq 1 --keep_freq 5 --eval_freq 1 \
+ --output_dir "checkpoints/dust3r_demo_224"
+
+# step 2 - train dust3r for 512 resolution
+torchrun --nproc_per_node=4 train.py \
+ --train_dataset "1000 @ Co3d(split='train', ROOT='data/co3d_subset_processed', aug_crop=16, mask_bg='rand', resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter)" \
+ --test_dataset "100 @ Co3d(split='test', ROOT='data/co3d_subset_processed', resolution=(512,384), seed=777)" \
+ --model "AsymmetricCroCo3DStereo(pos_embed='RoPE100', patch_embed_cls='ManyAR_PatchEmbed', img_size=(512, 512), head_type='linear', output_mode='pts3d', depth_mode=('exp', -inf, inf), conf_mode=('exp', 1, inf), enc_embed_dim=1024, enc_depth=24, enc_num_heads=16, dec_embed_dim=768, dec_depth=12, dec_num_heads=12)" \
+ --train_criterion "ConfLoss(Regr3D(L21, norm_mode='avg_dis'), alpha=0.2)" \
+ --test_criterion "Regr3D_ScaleShiftInv(L21, gt_scale=True)" \
+ --pretrained "checkpoints/dust3r_demo_224/checkpoint-best.pth" \
+ --lr 0.0001 --min_lr 1e-06 --warmup_epochs 1 --epochs 10 --batch_size 4 --accum_iter 4 \
+ --save_freq 1 --keep_freq 5 --eval_freq 1 \
+ --output_dir "checkpoints/dust3r_demo_512"
+
+# step 3 - train dust3r for 512 resolution with dpt
+torchrun --nproc_per_node=4 train.py \
+ --train_dataset "1000 @ Co3d(split='train', ROOT='data/co3d_subset_processed', aug_crop=16, mask_bg='rand', resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter)" \
+ --test_dataset "100 @ Co3d(split='test', ROOT='data/co3d_subset_processed', resolution=(512,384), seed=777)" \
+ --model "AsymmetricCroCo3DStereo(pos_embed='RoPE100', patch_embed_cls='ManyAR_PatchEmbed', img_size=(512, 512), head_type='dpt', output_mode='pts3d', depth_mode=('exp', -inf, inf), conf_mode=('exp', 1, inf), enc_embed_dim=1024, enc_depth=24, enc_num_heads=16, dec_embed_dim=768, dec_depth=12, dec_num_heads=12)" \
+ --train_criterion "ConfLoss(Regr3D(L21, norm_mode='avg_dis'), alpha=0.2)" \
+ --test_criterion "Regr3D_ScaleShiftInv(L21, gt_scale=True)" \
+ --pretrained "checkpoints/dust3r_demo_512/checkpoint-best.pth" \
+ --lr 0.0001 --min_lr 1e-06 --warmup_epochs 1 --epochs 10 --batch_size 2 --accum_iter 8 \
+ --save_freq 1 --keep_freq 5 --eval_freq 1 \
+ --output_dir "checkpoints/dust3r_demo_512dpt"
+
+```
+
+### Our Hyperparameters
+
+Here are the commands we used for training the models:
+
+```bash
+# NOTE: ROOT path omitted for datasets
+# 224 linear
+torchrun --nproc_per_node 8 train.py \
+ --train_dataset=" + 100_000 @ Habitat(1_000_000, split='train', aug_crop=16, resolution=224, transform=ColorJitter) + 100_000 @ BlendedMVS(split='train', aug_crop=16, resolution=224, transform=ColorJitter) + 100_000 @ MegaDepth(split='train', aug_crop=16, resolution=224, transform=ColorJitter) + 100_000 @ ARKitScenes(aug_crop=256, resolution=224, transform=ColorJitter) + 100_000 @ Co3d(split='train', aug_crop=16, mask_bg='rand', resolution=224, transform=ColorJitter) + 100_000 @ StaticThings3D(aug_crop=256, mask_bg='rand', resolution=224, transform=ColorJitter) + 100_000 @ ScanNetpp(split='train', aug_crop=256, resolution=224, transform=ColorJitter) + 100_000 @ InternalUnreleasedDataset(aug_crop=128, resolution=224, transform=ColorJitter) " \
+ --test_dataset=" Habitat(1_000, split='val', resolution=224, seed=777) + 1_000 @ BlendedMVS(split='val', resolution=224, seed=777) + 1_000 @ MegaDepth(split='val', resolution=224, seed=777) + 1_000 @ Co3d(split='test', mask_bg='rand', resolution=224, seed=777) " \
+ --train_criterion="ConfLoss(Regr3D(L21, norm_mode='avg_dis'), alpha=0.2)" \
+ --test_criterion="Regr3D_ScaleShiftInv(L21, gt_scale=True)" \
+ --model="AsymmetricCroCo3DStereo(pos_embed='RoPE100', img_size=(224, 224), head_type='linear', output_mode='pts3d', depth_mode=('exp', -inf, inf), conf_mode=('exp', 1, inf), enc_embed_dim=1024, enc_depth=24, enc_num_heads=16, dec_embed_dim=768, dec_depth=12, dec_num_heads=12)" \
+ --pretrained="checkpoints/CroCo_V2_ViTLarge_BaseDecoder.pth" \
+ --lr=0.0001 --min_lr=1e-06 --warmup_epochs=10 --epochs=100 --batch_size=16 --accum_iter=1 \
+ --save_freq=5 --keep_freq=10 --eval_freq=1 \
+ --output_dir="checkpoints/dust3r_224"
+
+# 512 linear
+torchrun --nproc_per_node 8 train.py \
+ --train_dataset=" + 10_000 @ Habitat(1_000_000, split='train', aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ BlendedMVS(split='train', aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ MegaDepth(split='train', aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ ARKitScenes(aug_crop=256, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ Co3d(split='train', aug_crop=16, mask_bg='rand', resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ StaticThings3D(aug_crop=256, mask_bg='rand', resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ ScanNetpp(split='train', aug_crop=256, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ InternalUnreleasedDataset(aug_crop=128, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) " \
+ --test_dataset=" Habitat(1_000, split='val', resolution=(512,384), seed=777) + 1_000 @ BlendedMVS(split='val', resolution=(512,384), seed=777) + 1_000 @ MegaDepth(split='val', resolution=(512,336), seed=777) + 1_000 @ Co3d(split='test', resolution=(512,384), seed=777) " \
+ --train_criterion="ConfLoss(Regr3D(L21, norm_mode='avg_dis'), alpha=0.2)" \
+ --test_criterion="Regr3D_ScaleShiftInv(L21, gt_scale=True)" \
+ --model="AsymmetricCroCo3DStereo(pos_embed='RoPE100', patch_embed_cls='ManyAR_PatchEmbed', img_size=(512, 512), head_type='linear', output_mode='pts3d', depth_mode=('exp', -inf, inf), conf_mode=('exp', 1, inf), enc_embed_dim=1024, enc_depth=24, enc_num_heads=16, dec_embed_dim=768, dec_depth=12, dec_num_heads=12)" \
+ --pretrained="checkpoints/dust3r_224/checkpoint-best.pth" \
+ --lr=0.0001 --min_lr=1e-06 --warmup_epochs=20 --epochs=100 --batch_size=4 --accum_iter=2 \
+ --save_freq=10 --keep_freq=10 --eval_freq=1 --print_freq=10 \
+ --output_dir="checkpoints/dust3r_512"
+
+# 512 dpt
+torchrun --nproc_per_node 8 train.py \
+ --train_dataset=" + 10_000 @ Habitat(1_000_000, split='train', aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ BlendedMVS(split='train', aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ MegaDepth(split='train', aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ ARKitScenes(aug_crop=256, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ Co3d(split='train', aug_crop=16, mask_bg='rand', resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ StaticThings3D(aug_crop=256, mask_bg='rand', resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ ScanNetpp(split='train', aug_crop=256, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ InternalUnreleasedDataset(aug_crop=128, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) " \
+ --test_dataset=" Habitat(1_000, split='val', resolution=(512,384), seed=777) + 1_000 @ BlendedMVS(split='val', resolution=(512,384), seed=777) + 1_000 @ MegaDepth(split='val', resolution=(512,336), seed=777) + 1_000 @ Co3d(split='test', resolution=(512,384), seed=777) " \
+ --train_criterion="ConfLoss(Regr3D(L21, norm_mode='avg_dis'), alpha=0.2)" \
+ --test_criterion="Regr3D_ScaleShiftInv(L21, gt_scale=True)" \
+ --model="AsymmetricCroCo3DStereo(pos_embed='RoPE100', patch_embed_cls='ManyAR_PatchEmbed', img_size=(512, 512), head_type='dpt', output_mode='pts3d', depth_mode=('exp', -inf, inf), conf_mode=('exp', 1, inf), enc_embed_dim=1024, enc_depth=24, enc_num_heads=16, dec_embed_dim=768, dec_depth=12, dec_num_heads=12)" \
+ --pretrained="checkpoints/dust3r_512/checkpoint-best.pth" \
+ --lr=0.0001 --min_lr=1e-06 --warmup_epochs=15 --epochs=90 --batch_size=4 --accum_iter=2 \
+ --save_freq=5 --keep_freq=10 --eval_freq=1 --print_freq=10 \
+ --output_dir="checkpoints/dust3r_512dpt"
+
+```
diff --git a/third_party/mast3r/dust3r/assets/demo.jpg b/third_party/mast3r/dust3r/assets/demo.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..c815d468d83a7e91a0ccc24a2f491b10178e955f
--- /dev/null
+++ b/third_party/mast3r/dust3r/assets/demo.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:957a892f9033fb3e733546a202e3c07e362618c708eacf050979d4c4edd5435f
+size 339600
diff --git a/third_party/mast3r/dust3r/assets/dust3r.jpg b/third_party/mast3r/dust3r/assets/dust3r.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..8402ae4d08eba0fb9c9e3d7441d3bc451e9f460f
--- /dev/null
+++ b/third_party/mast3r/dust3r/assets/dust3r.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0bdf6ee8fd7ccb52ccd09937df60c72bd750a47c6d982efc2ba9808eb305bcba
+size 25927
diff --git a/third_party/mast3r/dust3r/assets/dust3r_archi.jpg b/third_party/mast3r/dust3r/assets/dust3r_archi.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..fc2c5d1a154eb29d6c8e4507e408d7478eace3f3
--- /dev/null
+++ b/third_party/mast3r/dust3r/assets/dust3r_archi.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7262d42f63ac61acec20830602452a877264c5575fd7923834c1f2b035a2d9d1
+size 39454
diff --git a/third_party/mast3r/dust3r/assets/matching.jpg b/third_party/mast3r/dust3r/assets/matching.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..636e69c70921c7dac3872fedaee4d508af7ba4db
--- /dev/null
+++ b/third_party/mast3r/dust3r/assets/matching.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ecfe07fd00505045a155902c5686cc23060782a8b020f7596829fb60584a79ee
+size 159312
diff --git a/third_party/mast3r/dust3r/assets/pipeline1.jpg b/third_party/mast3r/dust3r/assets/pipeline1.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..90b0b58701bf7a660d07cb0c54c617ca0aab8bda
--- /dev/null
+++ b/third_party/mast3r/dust3r/assets/pipeline1.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1fd599e928b3ab6560ecc8491c2000ca2809372f656f87bbdd7e6daaf0e2ce92
+size 72026
diff --git a/third_party/mast3r/dust3r/croco/LICENSE b/third_party/mast3r/dust3r/croco/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..d9b84b1a65f9db6d8920a9048d162f52ba3ea56d
--- /dev/null
+++ b/third_party/mast3r/dust3r/croco/LICENSE
@@ -0,0 +1,52 @@
+CroCo, Copyright (c) 2022-present Naver Corporation, is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 license.
+
+A summary of the CC BY-NC-SA 4.0 license is located here:
+ https://creativecommons.org/licenses/by-nc-sa/4.0/
+
+The CC BY-NC-SA 4.0 license is located here:
+ https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode
+
+
+SEE NOTICE BELOW WITH RESPECT TO THE FILE: models/pos_embed.py, models/blocks.py
+
+***************************
+
+NOTICE WITH RESPECT TO THE FILE: models/pos_embed.py
+
+This software is being redistributed in a modifiled form. The original form is available here:
+
+https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py
+
+This software in this file incorporates parts of the following software available here:
+
+Transformer: https://github.com/tensorflow/models/blob/master/official/legacy/transformer/model_utils.py
+available under the following license: https://github.com/tensorflow/models/blob/master/LICENSE
+
+MoCo v3: https://github.com/facebookresearch/moco-v3
+available under the following license: https://github.com/facebookresearch/moco-v3/blob/main/LICENSE
+
+DeiT: https://github.com/facebookresearch/deit
+available under the following license: https://github.com/facebookresearch/deit/blob/main/LICENSE
+
+
+ORIGINAL COPYRIGHT NOTICE AND PERMISSION NOTICE AVAILABLE HERE IS REPRODUCE BELOW:
+
+https://github.com/facebookresearch/mae/blob/main/LICENSE
+
+Attribution-NonCommercial 4.0 International
+
+***************************
+
+NOTICE WITH RESPECT TO THE FILE: models/blocks.py
+
+This software is being redistributed in a modifiled form. The original form is available here:
+
+https://github.com/rwightman/pytorch-image-models
+
+ORIGINAL COPYRIGHT NOTICE AND PERMISSION NOTICE AVAILABLE HERE IS REPRODUCE BELOW:
+
+https://github.com/rwightman/pytorch-image-models/blob/master/LICENSE
+
+Apache License
+Version 2.0, January 2004
+http://www.apache.org/licenses/
\ No newline at end of file
diff --git a/third_party/mast3r/dust3r/croco/NOTICE b/third_party/mast3r/dust3r/croco/NOTICE
new file mode 100644
index 0000000000000000000000000000000000000000..d51bb365036c12d428d6e3a4fd00885756d5261c
--- /dev/null
+++ b/third_party/mast3r/dust3r/croco/NOTICE
@@ -0,0 +1,21 @@
+CroCo
+Copyright 2022-present NAVER Corp.
+
+This project contains subcomponents with separate copyright notices and license terms.
+Your use of the source code for these subcomponents is subject to the terms and conditions of the following licenses.
+
+====
+
+facebookresearch/mae
+https://github.com/facebookresearch/mae
+
+Attribution-NonCommercial 4.0 International
+
+====
+
+rwightman/pytorch-image-models
+https://github.com/rwightman/pytorch-image-models
+
+Apache License
+Version 2.0, January 2004
+http://www.apache.org/licenses/
\ No newline at end of file
diff --git a/third_party/mast3r/dust3r/croco/README.MD b/third_party/mast3r/dust3r/croco/README.MD
new file mode 100644
index 0000000000000000000000000000000000000000..38e33b001a60bd16749317fb297acd60f28a6f1b
--- /dev/null
+++ b/third_party/mast3r/dust3r/croco/README.MD
@@ -0,0 +1,124 @@
+# CroCo + CroCo v2 / CroCo-Stereo / CroCo-Flow
+
+[[`CroCo arXiv`](https://arxiv.org/abs/2210.10716)] [[`CroCo v2 arXiv`](https://arxiv.org/abs/2211.10408)] [[`project page and demo`](https://croco.europe.naverlabs.com/)]
+
+This repository contains the code for our CroCo model presented in our NeurIPS'22 paper [CroCo: Self-Supervised Pre-training for 3D Vision Tasks by Cross-View Completion](https://openreview.net/pdf?id=wZEfHUM5ri) and its follow-up extension published at ICCV'23 [Improved Cross-view Completion Pre-training for Stereo Matching and Optical Flow](https://openaccess.thecvf.com/content/ICCV2023/html/Weinzaepfel_CroCo_v2_Improved_Cross-view_Completion_Pre-training_for_Stereo_Matching_and_ICCV_2023_paper.html), refered to as CroCo v2:
+
+![image](assets/arch.jpg)
+
+```bibtex
+@inproceedings{croco,
+ title={{CroCo: Self-Supervised Pre-training for 3D Vision Tasks by Cross-View Completion}},
+ author={{Weinzaepfel, Philippe and Leroy, Vincent and Lucas, Thomas and Br\'egier, Romain and Cabon, Yohann and Arora, Vaibhav and Antsfeld, Leonid and Chidlovskii, Boris and Csurka, Gabriela and Revaud J\'er\^ome}},
+ booktitle={{NeurIPS}},
+ year={2022}
+}
+
+@inproceedings{croco_v2,
+ title={{CroCo v2: Improved Cross-view Completion Pre-training for Stereo Matching and Optical Flow}},
+ author={Weinzaepfel, Philippe and Lucas, Thomas and Leroy, Vincent and Cabon, Yohann and Arora, Vaibhav and Br{\'e}gier, Romain and Csurka, Gabriela and Antsfeld, Leonid and Chidlovskii, Boris and Revaud, J{\'e}r{\^o}me},
+ booktitle={ICCV},
+ year={2023}
+}
+```
+
+## License
+
+The code is distributed under the CC BY-NC-SA 4.0 License. See [LICENSE](LICENSE) for more information.
+Some components are based on code from [MAE](https://github.com/facebookresearch/mae) released under the CC BY-NC-SA 4.0 License and [timm](https://github.com/rwightman/pytorch-image-models) released under the Apache 2.0 License.
+Some components for stereo matching and optical flow are based on code from [unimatch](https://github.com/autonomousvision/unimatch) released under the MIT license.
+
+## Preparation
+
+1. Install dependencies on a machine with a NVidia GPU using e.g. conda. Note that `habitat-sim` is required only for the interactive demo and the synthetic pre-training data generation. If you don't plan to use it, you can ignore the line installing it and use a more recent python version.
+
+```bash
+conda create -n croco python=3.7 cmake=3.14.0
+conda activate croco
+conda install habitat-sim headless -c conda-forge -c aihabitat
+conda install pytorch torchvision -c pytorch
+conda install notebook ipykernel matplotlib
+conda install ipywidgets widgetsnbextension
+conda install scikit-learn tqdm quaternion opencv # only for pretraining / habitat data generation
+
+```
+
+2. Compile cuda kernels for RoPE
+
+CroCo v2 relies on RoPE positional embeddings for which you need to compile some cuda kernels.
+```bash
+cd models/curope/
+python setup.py build_ext --inplace
+cd ../../
+```
+
+This can be a bit long as we compile for all cuda architectures, feel free to update L9 of `models/curope/setup.py` to compile for specific architectures only.
+You might also need to set the environment `CUDA_HOME` in case you use a custom cuda installation.
+
+In case you cannot provide, we also provide a slow pytorch version, which will be automatically loaded.
+
+3. Download pre-trained model
+
+We provide several pre-trained models:
+
+| modelname | pre-training data | pos. embed. | Encoder | Decoder |
+|------------------------------------------------------------------------------------------------------------------------------------|-------------------|-------------|---------|---------|
+| [`CroCo.pth`](https://download.europe.naverlabs.com/ComputerVision/CroCo/CroCo.pth) | Habitat | cosine | ViT-B | Small |
+| [`CroCo_V2_ViTBase_SmallDecoder.pth`](https://download.europe.naverlabs.com/ComputerVision/CroCo/CroCo_V2_ViTBase_SmallDecoder.pth) | Habitat + real | RoPE | ViT-B | Small |
+| [`CroCo_V2_ViTBase_BaseDecoder.pth`](https://download.europe.naverlabs.com/ComputerVision/CroCo/CroCo_V2_ViTBase_BaseDecoder.pth) | Habitat + real | RoPE | ViT-B | Base |
+| [`CroCo_V2_ViTLarge_BaseDecoder.pth`](https://download.europe.naverlabs.com/ComputerVision/CroCo/CroCo_V2_ViTLarge_BaseDecoder.pth) | Habitat + real | RoPE | ViT-L | Base |
+
+To download a specific model, i.e., the first one (`CroCo.pth`)
+```bash
+mkdir -p pretrained_models/
+wget https://download.europe.naverlabs.com/ComputerVision/CroCo/CroCo.pth -P pretrained_models/
+```
+
+## Reconstruction example
+
+Simply run after downloading the `CroCo_V2_ViTLarge_BaseDecoder` pretrained model (or update the corresponding line in `demo.py`)
+```bash
+python demo.py
+```
+
+## Interactive demonstration of cross-view completion reconstruction on the Habitat simulator
+
+First download the test scene from Habitat:
+```bash
+python -m habitat_sim.utils.datasets_download --uids habitat_test_scenes --data-path habitat-sim-data/
+```
+
+Then, run the Notebook demo `interactive_demo.ipynb`.
+
+In this demo, you should be able to sample a random reference viewpoint from an [Habitat](https://github.com/facebookresearch/habitat-sim) test scene. Use the sliders to change viewpoint and select a masked target view to reconstruct using CroCo.
+![croco_interactive_demo](https://user-images.githubusercontent.com/1822210/200516576-7937bc6a-55f8-49ed-8618-3ddf89433ea4.jpg)
+
+## Pre-training
+
+### CroCo
+
+To pre-train CroCo, please first generate the pre-training data from the Habitat simulator, following the instructions in [datasets/habitat_sim/README.MD](datasets/habitat_sim/README.MD) and then run the following command:
+```
+torchrun --nproc_per_node=4 pretrain.py --output_dir ./output/pretraining/
+```
+
+Our CroCo pre-training was launched on a single server with 4 GPUs.
+It should take around 10 days with A100 or 15 days with V100 to do the 400 pre-training epochs, but decent performances are obtained earlier in training.
+Note that, while the code contains the same scaling rule of the learning rate as MAE when changing the effective batch size, we did not experimented if it is valid in our case.
+The first run can take a few minutes to start, to parse all available pre-training pairs.
+
+### CroCo v2
+
+For CroCo v2 pre-training, in addition to the generation of the pre-training data from the Habitat simulator above, please pre-extract the crops from the real datasets following the instructions in [datasets/crops/README.MD](datasets/crops/README.MD).
+Then, run the following command for the largest model (ViT-L encoder, Base decoder):
+```
+torchrun --nproc_per_node=8 pretrain.py --model "CroCoNet(enc_embed_dim=1024, enc_depth=24, enc_num_heads=16, dec_embed_dim=768, dec_num_heads=12, dec_depth=12, pos_embed='RoPE100')" --dataset "habitat_release+ARKitScenes+MegaDepth+3DStreetView+IndoorVL" --warmup_epochs 12 --max_epoch 125 --epochs 250 --amp 0 --keep_freq 5 --output_dir ./output/pretraining_crocov2/
+```
+
+Our CroCo v2 pre-training was launched on a single server with 8 GPUs for the largest model, and on a single server with 4 GPUs for the smaller ones, keeping a batch size of 64 per gpu in all cases.
+The largest model should take around 12 days on A100.
+Note that, while the code contains the same scaling rule of the learning rate as MAE when changing the effective batch size, we did not experimented if it is valid in our case.
+
+## Stereo matching and Optical flow downstream tasks
+
+For CroCo-Stereo and CroCo-Flow, please refer to [stereoflow/README.MD](stereoflow/README.MD).
diff --git a/third_party/mast3r/dust3r/croco/assets/Chateau1.png b/third_party/mast3r/dust3r/croco/assets/Chateau1.png
new file mode 100644
index 0000000000000000000000000000000000000000..295b00e46972ffcacaca60c2c7c7ec7a04c762fa
--- /dev/null
+++ b/third_party/mast3r/dust3r/croco/assets/Chateau1.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:71ffb8c7d77e5ced0bb3dcd2cb0db84d0e98e6ff5ffd2d02696a7156e5284857
+size 112106
diff --git a/third_party/mast3r/dust3r/croco/assets/Chateau2.png b/third_party/mast3r/dust3r/croco/assets/Chateau2.png
new file mode 100644
index 0000000000000000000000000000000000000000..97b3c058ff180a6d0c0853ab533b0823a06f8425
--- /dev/null
+++ b/third_party/mast3r/dust3r/croco/assets/Chateau2.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c3a0be9e19f6b89491d692c71e3f2317c2288a898a990561d48b7667218b47c8
+size 109905
diff --git a/third_party/mast3r/dust3r/croco/assets/arch.jpg b/third_party/mast3r/dust3r/croco/assets/arch.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..894c58e25c2d9ee0b579c6f5a6ce78d12217d106
--- /dev/null
+++ b/third_party/mast3r/dust3r/croco/assets/arch.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:05fbf12896a79819a3864a800b174896bd3b6fa29b4f4f580d06725ff7c30dc7
+size 74842
diff --git a/third_party/mast3r/dust3r/croco/croco-stereo-flow-demo.ipynb b/third_party/mast3r/dust3r/croco/croco-stereo-flow-demo.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..2b00a7607ab5f82d1857041969bfec977e56b3e0
--- /dev/null
+++ b/third_party/mast3r/dust3r/croco/croco-stereo-flow-demo.ipynb
@@ -0,0 +1,191 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "9bca0f41",
+ "metadata": {},
+ "source": [
+ "# Simple inference example with CroCo-Stereo or CroCo-Flow"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "80653ef7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Copyright (C) 2022-present Naver Corporation. All rights reserved.\n",
+ "# Licensed under CC BY-NC-SA 4.0 (non-commercial use only)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4f033862",
+ "metadata": {},
+ "source": [
+ "First download the model(s) of your choice by running\n",
+ "```\n",
+ "bash stereoflow/download_model.sh crocostereo.pth\n",
+ "bash stereoflow/download_model.sh crocoflow.pth\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1fb2e392",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import torch\n",
+ "use_gpu = torch.cuda.is_available() and torch.cuda.device_count()>0\n",
+ "device = torch.device('cuda:0' if use_gpu else 'cpu')\n",
+ "import matplotlib.pylab as plt"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e0e25d77",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from stereoflow.test import _load_model_and_criterion\n",
+ "from stereoflow.engine import tiled_pred\n",
+ "from stereoflow.datasets_stereo import img_to_tensor, vis_disparity\n",
+ "from stereoflow.datasets_flow import flowToColor\n",
+ "tile_overlap=0.7 # recommended value, higher value can be slightly better but slower"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "86a921f5",
+ "metadata": {},
+ "source": [
+ "### CroCo-Stereo example"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "64e483cb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "image1 = np.asarray(Image.open(''))\n",
+ "image2 = np.asarray(Image.open(''))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f0d04303",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model, _, cropsize, with_conf, task, tile_conf_mode = _load_model_and_criterion('stereoflow_models/crocostereo.pth', None, device)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "47dc14b5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "im1 = img_to_tensor(image1).to(device).unsqueeze(0)\n",
+ "im2 = img_to_tensor(image2).to(device).unsqueeze(0)\n",
+ "with torch.inference_mode():\n",
+ " pred, _, _ = tiled_pred(model, None, im1, im2, None, conf_mode=tile_conf_mode, overlap=tile_overlap, crop=cropsize, with_conf=with_conf, return_time=False)\n",
+ "pred = pred.squeeze(0).squeeze(0).cpu().numpy()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "583b9f16",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "plt.imshow(vis_disparity(pred))\n",
+ "plt.axis('off')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d2df5d70",
+ "metadata": {},
+ "source": [
+ "### CroCo-Flow example"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9ee257a7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "image1 = np.asarray(Image.open(''))\n",
+ "image2 = np.asarray(Image.open(''))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d5edccf0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model, _, cropsize, with_conf, task, tile_conf_mode = _load_model_and_criterion('stereoflow_models/crocoflow.pth', None, device)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b19692c3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "im1 = img_to_tensor(image1).to(device).unsqueeze(0)\n",
+ "im2 = img_to_tensor(image2).to(device).unsqueeze(0)\n",
+ "with torch.inference_mode():\n",
+ " pred, _, _ = tiled_pred(model, None, im1, im2, None, conf_mode=tile_conf_mode, overlap=tile_overlap, crop=cropsize, with_conf=with_conf, return_time=False)\n",
+ "pred = pred.squeeze(0).permute(1,2,0).cpu().numpy()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "26f79db3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "plt.imshow(flowToColor(pred))\n",
+ "plt.axis('off')"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/third_party/mast3r/dust3r/croco/datasets/__init__.py b/third_party/mast3r/dust3r/croco/datasets/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/third_party/mast3r/dust3r/croco/datasets/crops/README.MD b/third_party/mast3r/dust3r/croco/datasets/crops/README.MD
new file mode 100644
index 0000000000000000000000000000000000000000..47ddabebb177644694ee247ae878173a3a16644f
--- /dev/null
+++ b/third_party/mast3r/dust3r/croco/datasets/crops/README.MD
@@ -0,0 +1,104 @@
+## Generation of crops from the real datasets
+
+The instructions below allow to generate the crops used for pre-training CroCo v2 from the following real-world datasets: ARKitScenes, MegaDepth, 3DStreetView and IndoorVL.
+
+### Download the metadata of the crops to generate
+
+First, download the metadata and put them in `./data/`:
+```
+mkdir -p data
+cd data/
+wget https://download.europe.naverlabs.com/ComputerVision/CroCo/data/crop_metadata.zip
+unzip crop_metadata.zip
+rm crop_metadata.zip
+cd ..
+```
+
+### Prepare the original datasets
+
+Second, download the original datasets in `./data/original_datasets/`.
+```
+mkdir -p data/original_datasets
+```
+
+##### ARKitScenes
+
+Download the `raw` dataset from https://github.com/apple/ARKitScenes/blob/main/DATA.md and put it in `./data/original_datasets/ARKitScenes/`.
+The resulting file structure should be like:
+```
+./data/original_datasets/ARKitScenes/
+└───Training
+ └───40753679
+ │ │ ultrawide
+ │ │ ...
+ └───40753686
+ │
+ ...
+```
+
+##### MegaDepth
+
+Download `MegaDepth v1 Dataset` from https://www.cs.cornell.edu/projects/megadepth/ and put it in `./data/original_datasets/MegaDepth/`.
+The resulting file structure should be like:
+
+```
+./data/original_datasets/MegaDepth/
+└───0000
+│ └───images
+│ │ │ 1000557903_87fa96b8a4_o.jpg
+│ │ └ ...
+│ └─── ...
+└───0001
+│ │
+│ └ ...
+└─── ...
+```
+
+##### 3DStreetView
+
+Download `3D_Street_View` dataset from https://github.com/amir32002/3D_Street_View and put it in `./data/original_datasets/3DStreetView/`.
+The resulting file structure should be like:
+
+```
+./data/original_datasets/3DStreetView/
+└───dataset_aligned
+│ └───0002
+│ │ │ 0000002_0000001_0000002_0000001.jpg
+│ │ └ ...
+│ └─── ...
+└───dataset_unaligned
+│ └───0003
+│ │ │ 0000003_0000001_0000002_0000001.jpg
+│ │ └ ...
+│ └─── ...
+```
+
+##### IndoorVL
+
+Download the `IndoorVL` datasets using [Kapture](https://github.com/naver/kapture).
+
+```
+pip install kapture
+mkdir -p ./data/original_datasets/IndoorVL
+cd ./data/original_datasets/IndoorVL
+kapture_download_dataset.py update
+kapture_download_dataset.py install "HyundaiDepartmentStore_*"
+kapture_download_dataset.py install "GangnamStation_*"
+cd -
+```
+
+### Extract the crops
+
+Now, extract the crops for each of the dataset:
+```
+for dataset in ARKitScenes MegaDepth 3DStreetView IndoorVL;
+do
+ python3 datasets/crops/extract_crops_from_images.py --crops ./data/crop_metadata/${dataset}/crops_release.txt --root-dir ./data/original_datasets/${dataset}/ --output-dir ./data/${dataset}_crops/ --imsize 256 --nthread 8 --max-subdir-levels 5 --ideal-number-pairs-in-dir 500;
+done
+```
+
+##### Note for IndoorVL
+
+Due to some legal issues, we can only release 144,228 pairs out of the 1,593,689 pairs used in the paper.
+To account for it in terms of number of pre-training iterations, the pre-training command in this repository uses 125 training epochs including 12 warm-up epochs and learning rate cosine schedule of 250, instead of 100, 10 and 200 respectively.
+The impact on the performance is negligible.
diff --git a/third_party/mast3r/dust3r/croco/datasets/crops/extract_crops_from_images.py b/third_party/mast3r/dust3r/croco/datasets/crops/extract_crops_from_images.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb66a0474ce44b54c44c08887cbafdb045b11ff3
--- /dev/null
+++ b/third_party/mast3r/dust3r/croco/datasets/crops/extract_crops_from_images.py
@@ -0,0 +1,159 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# Extracting crops for pre-training
+# --------------------------------------------------------
+
+import os
+import argparse
+from tqdm import tqdm
+from PIL import Image
+import functools
+from multiprocessing import Pool
+import math
+
+
+def arg_parser():
+ parser = argparse.ArgumentParser('Generate cropped image pairs from image crop list')
+
+ parser.add_argument('--crops', type=str, required=True, help='crop file')
+ parser.add_argument('--root-dir', type=str, required=True, help='root directory')
+ parser.add_argument('--output-dir', type=str, required=True, help='output directory')
+ parser.add_argument('--imsize', type=int, default=256, help='size of the crops')
+ parser.add_argument('--nthread', type=int, required=True, help='number of simultaneous threads')
+ parser.add_argument('--max-subdir-levels', type=int, default=5, help='maximum number of subdirectories')
+ parser.add_argument('--ideal-number-pairs-in-dir', type=int, default=500, help='number of pairs stored in a dir')
+ return parser
+
+
+def main(args):
+ listing_path = os.path.join(args.output_dir, 'listing.txt')
+
+ print(f'Loading list of crops ... ({args.nthread} threads)')
+ crops, num_crops_to_generate = load_crop_file(args.crops)
+
+ print(f'Preparing jobs ({len(crops)} candidate image pairs)...')
+ num_levels = min(math.ceil(math.log(num_crops_to_generate, args.ideal_number_pairs_in_dir)), args.max_subdir_levels)
+ num_pairs_in_dir = math.ceil(num_crops_to_generate ** (1/num_levels))
+
+ jobs = prepare_jobs(crops, num_levels, num_pairs_in_dir)
+ del crops
+
+ os.makedirs(args.output_dir, exist_ok=True)
+ mmap = Pool(args.nthread).imap_unordered if args.nthread > 1 else map
+ call = functools.partial(save_image_crops, args)
+
+ print(f"Generating cropped images to {args.output_dir} ...")
+ with open(listing_path, 'w') as listing:
+ listing.write('# pair_path\n')
+ for results in tqdm(mmap(call, jobs), total=len(jobs)):
+ for path in results:
+ listing.write(f'{path}\n')
+ print('Finished writing listing to', listing_path)
+
+
+def load_crop_file(path):
+ data = open(path).read().splitlines()
+ pairs = []
+ num_crops_to_generate = 0
+ for line in tqdm(data):
+ if line.startswith('#'):
+ continue
+ line = line.split(', ')
+ if len(line) < 8:
+ img1, img2, rotation = line
+ pairs.append((img1, img2, int(rotation), []))
+ else:
+ l1, r1, t1, b1, l2, r2, t2, b2 = map(int, line)
+ rect1, rect2 = (l1, t1, r1, b1), (l2, t2, r2, b2)
+ pairs[-1][-1].append((rect1, rect2))
+ num_crops_to_generate += 1
+ return pairs, num_crops_to_generate
+
+
+def prepare_jobs(pairs, num_levels, num_pairs_in_dir):
+ jobs = []
+ powers = [num_pairs_in_dir**level for level in reversed(range(num_levels))]
+
+ def get_path(idx):
+ idx_array = []
+ d = idx
+ for level in range(num_levels - 1):
+ idx_array.append(idx // powers[level])
+ idx = idx % powers[level]
+ idx_array.append(d)
+ return '/'.join(map(lambda x: hex(x)[2:], idx_array))
+
+ idx = 0
+ for pair_data in tqdm(pairs):
+ img1, img2, rotation, crops = pair_data
+ if -60 <= rotation and rotation <= 60:
+ rotation = 0 # most likely not a true rotation
+ paths = [get_path(idx + k) for k in range(len(crops))]
+ idx += len(crops)
+ jobs.append(((img1, img2), rotation, crops, paths))
+ return jobs
+
+
+def load_image(path):
+ try:
+ return Image.open(path).convert('RGB')
+ except Exception as e:
+ print('skipping', path, e)
+ raise OSError()
+
+
+def save_image_crops(args, data):
+ # load images
+ img_pair, rot, crops, paths = data
+ try:
+ img1, img2 = [load_image(os.path.join(args.root_dir, impath)) for impath in img_pair]
+ except OSError as e:
+ return []
+
+ def area(sz):
+ return sz[0] * sz[1]
+
+ tgt_size = (args.imsize, args.imsize)
+
+ def prepare_crop(img, rect, rot=0):
+ # actual crop
+ img = img.crop(rect)
+
+ # resize to desired size
+ interp = Image.Resampling.LANCZOS if area(img.size) > 4*area(tgt_size) else Image.Resampling.BICUBIC
+ img = img.resize(tgt_size, resample=interp)
+
+ # rotate the image
+ rot90 = (round(rot/90) % 4) * 90
+ if rot90 == 90:
+ img = img.transpose(Image.Transpose.ROTATE_90)
+ elif rot90 == 180:
+ img = img.transpose(Image.Transpose.ROTATE_180)
+ elif rot90 == 270:
+ img = img.transpose(Image.Transpose.ROTATE_270)
+ return img
+
+ results = []
+ for (rect1, rect2), path in zip(crops, paths):
+ crop1 = prepare_crop(img1, rect1)
+ crop2 = prepare_crop(img2, rect2, rot)
+
+ fullpath1 = os.path.join(args.output_dir, path+'_1.jpg')
+ fullpath2 = os.path.join(args.output_dir, path+'_2.jpg')
+ os.makedirs(os.path.dirname(fullpath1), exist_ok=True)
+
+ assert not os.path.isfile(fullpath1), fullpath1
+ assert not os.path.isfile(fullpath2), fullpath2
+ crop1.save(fullpath1)
+ crop2.save(fullpath2)
+ results.append(path)
+
+ return results
+
+
+if __name__ == '__main__':
+ args = arg_parser().parse_args()
+ main(args)
+
diff --git a/third_party/mast3r/dust3r/croco/datasets/habitat_sim/README.MD b/third_party/mast3r/dust3r/croco/datasets/habitat_sim/README.MD
new file mode 100644
index 0000000000000000000000000000000000000000..a505781ff9eb91bce7f1d189e848f8ba1c560940
--- /dev/null
+++ b/third_party/mast3r/dust3r/croco/datasets/habitat_sim/README.MD
@@ -0,0 +1,76 @@
+## Generation of synthetic image pairs using Habitat-Sim
+
+These instructions allow to generate pre-training pairs from the Habitat simulator.
+As we did not save metadata of the pairs used in the original paper, they are not strictly the same, but these data use the same setting and are equivalent.
+
+### Download Habitat-Sim scenes
+Download Habitat-Sim scenes:
+- Download links can be found here: https://github.com/facebookresearch/habitat-sim/blob/main/DATASETS.md
+- We used scenes from the HM3D, habitat-test-scenes, Replica, ReplicaCad and ScanNet datasets.
+- Please put the scenes under `./data/habitat-sim-data/scene_datasets/` following the structure below, or update manually paths in `paths.py`.
+```
+./data/
+└──habitat-sim-data/
+ └──scene_datasets/
+ ├──hm3d/
+ ├──gibson/
+ ├──habitat-test-scenes/
+ ├──replica_cad_baked_lighting/
+ ├──replica_cad/
+ ├──ReplicaDataset/
+ └──scannet/
+```
+
+### Image pairs generation
+We provide metadata to generate reproducible images pairs for pretraining and validation.
+Experiments described in the paper used similar data, but whose generation was not reproducible at the time.
+
+Specifications:
+- 256x256 resolution images, with 60 degrees field of view .
+- Up to 1000 image pairs per scene.
+- Number of scenes considered/number of images pairs per dataset:
+ - Scannet: 1097 scenes / 985 209 pairs
+ - HM3D:
+ - hm3d/train: 800 / 800k pairs
+ - hm3d/val: 100 scenes / 100k pairs
+ - hm3d/minival: 10 scenes / 10k pairs
+ - habitat-test-scenes: 3 scenes / 3k pairs
+ - replica_cad_baked_lighting: 13 scenes / 13k pairs
+
+- Scenes from hm3d/val and hm3d/minival pairs were not used for the pre-training but kept for validation purposes.
+
+Download metadata and extract it:
+```bash
+mkdir -p data/habitat_release_metadata/
+cd data/habitat_release_metadata/
+wget https://download.europe.naverlabs.com/ComputerVision/CroCo/data/habitat_release_metadata/multiview_habitat_metadata.tar.gz
+tar -xvf multiview_habitat_metadata.tar.gz
+cd ../..
+# Location of the metadata
+METADATA_DIR="./data/habitat_release_metadata/multiview_habitat_metadata"
+```
+
+Generate image pairs from metadata:
+- The following command will print a list of commandlines to generate image pairs for each scene:
+```bash
+# Target output directory
+PAIRS_DATASET_DIR="./data/habitat_release/"
+python datasets/habitat_sim/generate_from_metadata_files.py --input_dir=$METADATA_DIR --output_dir=$PAIRS_DATASET_DIR
+```
+- One can launch multiple of such commands in parallel e.g. using GNU Parallel:
+```bash
+python datasets/habitat_sim/generate_from_metadata_files.py --input_dir=$METADATA_DIR --output_dir=$PAIRS_DATASET_DIR | parallel -j 16
+```
+
+## Metadata generation
+
+Image pairs were randomly sampled using the following commands, whose outputs contain randomness and are thus not exactly reproducible:
+```bash
+# Print commandlines to generate image pairs from the different scenes available.
+PAIRS_DATASET_DIR=MY_CUSTOM_PATH
+python datasets/habitat_sim/generate_multiview_images.py --list_commands --output_dir=$PAIRS_DATASET_DIR
+
+# Once a dataset is generated, pack metadata files for reproducibility.
+METADATA_DIR=MY_CUSTON_PATH
+python datasets/habitat_sim/pack_metadata_files.py $PAIRS_DATASET_DIR $METADATA_DIR
+```
diff --git a/third_party/mast3r/dust3r/croco/datasets/habitat_sim/__init__.py b/third_party/mast3r/dust3r/croco/datasets/habitat_sim/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/third_party/mast3r/dust3r/croco/datasets/habitat_sim/generate_from_metadata.py b/third_party/mast3r/dust3r/croco/datasets/habitat_sim/generate_from_metadata.py
new file mode 100644
index 0000000000000000000000000000000000000000..fbe0d399084359495250dc8184671ff498adfbf2
--- /dev/null
+++ b/third_party/mast3r/dust3r/croco/datasets/habitat_sim/generate_from_metadata.py
@@ -0,0 +1,92 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+"""
+Script to generate image pairs for a given scene reproducing poses provided in a metadata file.
+"""
+import os
+from datasets.habitat_sim.multiview_habitat_sim_generator import MultiviewHabitatSimGenerator
+from datasets.habitat_sim.paths import SCENES_DATASET
+import argparse
+import quaternion
+import PIL.Image
+import cv2
+import json
+from tqdm import tqdm
+
+def generate_multiview_images_from_metadata(metadata_filename,
+ output_dir,
+ overload_params = dict(),
+ scene_datasets_paths=None,
+ exist_ok=False):
+ """
+ Generate images from a metadata file for reproducibility purposes.
+ """
+ # Reorder paths by decreasing label length, to avoid collisions when testing if a string by such label
+ if scene_datasets_paths is not None:
+ scene_datasets_paths = dict(sorted(scene_datasets_paths.items(), key= lambda x: len(x[0]), reverse=True))
+
+ with open(metadata_filename, 'r') as f:
+ input_metadata = json.load(f)
+ metadata = dict()
+ for key, value in input_metadata.items():
+ # Optionally replace some paths
+ if key in ("scene_dataset_config_file", "scene", "navmesh") and value != "":
+ if scene_datasets_paths is not None:
+ for dataset_label, dataset_path in scene_datasets_paths.items():
+ if value.startswith(dataset_label):
+ value = os.path.normpath(os.path.join(dataset_path, os.path.relpath(value, dataset_label)))
+ break
+ metadata[key] = value
+
+ # Overload some parameters
+ for key, value in overload_params.items():
+ metadata[key] = value
+
+ generation_entries = dict([(key, value) for key, value in metadata.items() if not (key in ('multiviews', 'output_dir', 'generate_depth'))])
+ generate_depth = metadata["generate_depth"]
+
+ os.makedirs(output_dir, exist_ok=exist_ok)
+
+ generator = MultiviewHabitatSimGenerator(**generation_entries)
+
+ # Generate views
+ for idx_label, data in tqdm(metadata['multiviews'].items()):
+ positions = data["positions"]
+ orientations = data["orientations"]
+ n = len(positions)
+ for oidx in range(n):
+ observation = generator.render_viewpoint(positions[oidx], quaternion.from_float_array(orientations[oidx]))
+ observation_label = f"{oidx + 1}" # Leonid is indexing starting from 1
+ # Color image saved using PIL
+ img = PIL.Image.fromarray(observation['color'][:,:,:3])
+ filename = os.path.join(output_dir, f"{idx_label}_{observation_label}.jpeg")
+ img.save(filename)
+ if generate_depth:
+ # Depth image as EXR file
+ filename = os.path.join(output_dir, f"{idx_label}_{observation_label}_depth.exr")
+ cv2.imwrite(filename, observation['depth'], [cv2.IMWRITE_EXR_TYPE, cv2.IMWRITE_EXR_TYPE_HALF])
+ # Camera parameters
+ camera_params = dict([(key, observation[key].tolist()) for key in ("camera_intrinsics", "R_cam2world", "t_cam2world")])
+ filename = os.path.join(output_dir, f"{idx_label}_{observation_label}_camera_params.json")
+ with open(filename, "w") as f:
+ json.dump(camera_params, f)
+ # Save metadata
+ with open(os.path.join(output_dir, "metadata.json"), "w") as f:
+ json.dump(metadata, f)
+
+ generator.close()
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--metadata_filename", required=True)
+ parser.add_argument("--output_dir", required=True)
+ args = parser.parse_args()
+
+ generate_multiview_images_from_metadata(metadata_filename=args.metadata_filename,
+ output_dir=args.output_dir,
+ scene_datasets_paths=SCENES_DATASET,
+ overload_params=dict(),
+ exist_ok=True)
+
+
\ No newline at end of file
diff --git a/third_party/mast3r/dust3r/croco/datasets/habitat_sim/generate_from_metadata_files.py b/third_party/mast3r/dust3r/croco/datasets/habitat_sim/generate_from_metadata_files.py
new file mode 100644
index 0000000000000000000000000000000000000000..962ef849d8c31397b8622df4f2d9140175d78873
--- /dev/null
+++ b/third_party/mast3r/dust3r/croco/datasets/habitat_sim/generate_from_metadata_files.py
@@ -0,0 +1,27 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+"""
+Script generating commandlines to generate image pairs from metadata files.
+"""
+import os
+import glob
+from tqdm import tqdm
+import argparse
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--input_dir", required=True)
+ parser.add_argument("--output_dir", required=True)
+ parser.add_argument("--prefix", default="", help="Commanline prefix, useful e.g. to setup environment.")
+ args = parser.parse_args()
+
+ input_metadata_filenames = glob.iglob(f"{args.input_dir}/**/metadata.json", recursive=True)
+
+ for metadata_filename in tqdm(input_metadata_filenames):
+ output_dir = os.path.join(args.output_dir, os.path.relpath(os.path.dirname(metadata_filename), args.input_dir))
+ # Do not process the scene if the metadata file already exists
+ if os.path.exists(os.path.join(output_dir, "metadata.json")):
+ continue
+ commandline = f"{args.prefix}python datasets/habitat_sim/generate_from_metadata.py --metadata_filename={metadata_filename} --output_dir={output_dir}"
+ print(commandline)
diff --git a/third_party/mast3r/dust3r/croco/datasets/habitat_sim/generate_multiview_images.py b/third_party/mast3r/dust3r/croco/datasets/habitat_sim/generate_multiview_images.py
new file mode 100644
index 0000000000000000000000000000000000000000..421d49a1696474415940493296b3f2d982398850
--- /dev/null
+++ b/third_party/mast3r/dust3r/croco/datasets/habitat_sim/generate_multiview_images.py
@@ -0,0 +1,177 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+import os
+from tqdm import tqdm
+import argparse
+import PIL.Image
+import numpy as np
+import json
+from datasets.habitat_sim.multiview_habitat_sim_generator import MultiviewHabitatSimGenerator, NoNaviguableSpaceError
+from datasets.habitat_sim.paths import list_scenes_available
+import cv2
+import quaternion
+import shutil
+
+def generate_multiview_images_for_scene(scene_dataset_config_file,
+ scene,
+ navmesh,
+ output_dir,
+ views_count,
+ size,
+ exist_ok=False,
+ generate_depth=False,
+ **kwargs):
+ """
+ Generate tuples of overlapping views for a given scene.
+ generate_depth: generate depth images and camera parameters.
+ """
+ if os.path.exists(output_dir) and not exist_ok:
+ print(f"Scene {scene}: data already generated. Ignoring generation.")
+ return
+ try:
+ print(f"Scene {scene}: {size} multiview acquisitions to generate...")
+ os.makedirs(output_dir, exist_ok=exist_ok)
+
+ metadata_filename = os.path.join(output_dir, "metadata.json")
+
+ metadata_template = dict(scene_dataset_config_file=scene_dataset_config_file,
+ scene=scene,
+ navmesh=navmesh,
+ views_count=views_count,
+ size=size,
+ generate_depth=generate_depth,
+ **kwargs)
+ metadata_template["multiviews"] = dict()
+
+ if os.path.exists(metadata_filename):
+ print("Metadata file already exists:", metadata_filename)
+ print("Loading already generated metadata file...")
+ with open(metadata_filename, "r") as f:
+ metadata = json.load(f)
+
+ for key in metadata_template.keys():
+ if key != "multiviews":
+ assert metadata_template[key] == metadata[key], f"existing file is inconsistent with the input parameters:\nKey: {key}\nmetadata: {metadata[key]}\ntemplate: {metadata_template[key]}."
+ else:
+ print("No temporary file found. Starting generation from scratch...")
+ metadata = metadata_template
+
+ starting_id = len(metadata["multiviews"])
+ print(f"Starting generation from index {starting_id}/{size}...")
+ if starting_id >= size:
+ print("Generation already done.")
+ return
+
+ generator = MultiviewHabitatSimGenerator(scene_dataset_config_file=scene_dataset_config_file,
+ scene=scene,
+ navmesh=navmesh,
+ views_count = views_count,
+ size = size,
+ **kwargs)
+
+ for idx in tqdm(range(starting_id, size)):
+ # Generate / re-generate the observations
+ try:
+ data = generator[idx]
+ observations = data["observations"]
+ positions = data["positions"]
+ orientations = data["orientations"]
+
+ idx_label = f"{idx:08}"
+ for oidx, observation in enumerate(observations):
+ observation_label = f"{oidx + 1}" # Leonid is indexing starting from 1
+ # Color image saved using PIL
+ img = PIL.Image.fromarray(observation['color'][:,:,:3])
+ filename = os.path.join(output_dir, f"{idx_label}_{observation_label}.jpeg")
+ img.save(filename)
+ if generate_depth:
+ # Depth image as EXR file
+ filename = os.path.join(output_dir, f"{idx_label}_{observation_label}_depth.exr")
+ cv2.imwrite(filename, observation['depth'], [cv2.IMWRITE_EXR_TYPE, cv2.IMWRITE_EXR_TYPE_HALF])
+ # Camera parameters
+ camera_params = dict([(key, observation[key].tolist()) for key in ("camera_intrinsics", "R_cam2world", "t_cam2world")])
+ filename = os.path.join(output_dir, f"{idx_label}_{observation_label}_camera_params.json")
+ with open(filename, "w") as f:
+ json.dump(camera_params, f)
+ metadata["multiviews"][idx_label] = {"positions": positions.tolist(),
+ "orientations": orientations.tolist(),
+ "covisibility_ratios": data["covisibility_ratios"].tolist(),
+ "valid_fractions": data["valid_fractions"].tolist(),
+ "pairwise_visibility_ratios": data["pairwise_visibility_ratios"].tolist()}
+ except RecursionError:
+ print("Recursion error: unable to sample observations for this scene. We will stop there.")
+ break
+
+ # Regularly save a temporary metadata file, in case we need to restart the generation
+ if idx % 10 == 0:
+ with open(metadata_filename, "w") as f:
+ json.dump(metadata, f)
+
+ # Save metadata
+ with open(metadata_filename, "w") as f:
+ json.dump(metadata, f)
+
+ generator.close()
+ except NoNaviguableSpaceError:
+ pass
+
+def create_commandline(scene_data, generate_depth, exist_ok=False):
+ """
+ Create a commandline string to generate a scene.
+ """
+ def my_formatting(val):
+ if val is None or val == "":
+ return '""'
+ else:
+ return val
+ commandline = f"""python {__file__} --scene {my_formatting(scene_data.scene)}
+ --scene_dataset_config_file {my_formatting(scene_data.scene_dataset_config_file)}
+ --navmesh {my_formatting(scene_data.navmesh)}
+ --output_dir {my_formatting(scene_data.output_dir)}
+ --generate_depth {int(generate_depth)}
+ --exist_ok {int(exist_ok)}
+ """
+ commandline = " ".join(commandline.split())
+ return commandline
+
+if __name__ == "__main__":
+ os.umask(2)
+
+ parser = argparse.ArgumentParser(description="""Example of use -- listing commands to generate data for scenes available:
+ > python datasets/habitat_sim/generate_multiview_habitat_images.py --list_commands
+ """)
+
+ parser.add_argument("--output_dir", type=str, required=True)
+ parser.add_argument("--list_commands", action='store_true', help="list commandlines to run if true")
+ parser.add_argument("--scene", type=str, default="")
+ parser.add_argument("--scene_dataset_config_file", type=str, default="")
+ parser.add_argument("--navmesh", type=str, default="")
+
+ parser.add_argument("--generate_depth", type=int, default=1)
+ parser.add_argument("--exist_ok", type=int, default=0)
+
+ kwargs = dict(resolution=(256,256), hfov=60, views_count = 2, size=1000)
+
+ args = parser.parse_args()
+ generate_depth=bool(args.generate_depth)
+ exist_ok = bool(args.exist_ok)
+
+ if args.list_commands:
+ # Listing scenes available...
+ scenes_data = list_scenes_available(base_output_dir=args.output_dir)
+
+ for scene_data in scenes_data:
+ print(create_commandline(scene_data, generate_depth=generate_depth, exist_ok=exist_ok))
+ else:
+ if args.scene == "" or args.output_dir == "":
+ print("Missing scene or output dir argument!")
+ print(parser.format_help())
+ else:
+ generate_multiview_images_for_scene(scene=args.scene,
+ scene_dataset_config_file = args.scene_dataset_config_file,
+ navmesh = args.navmesh,
+ output_dir = args.output_dir,
+ exist_ok=exist_ok,
+ generate_depth=generate_depth,
+ **kwargs)
\ No newline at end of file
diff --git a/third_party/mast3r/dust3r/croco/datasets/habitat_sim/multiview_habitat_sim_generator.py b/third_party/mast3r/dust3r/croco/datasets/habitat_sim/multiview_habitat_sim_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..91e5f923b836a645caf5d8e4aacc425047e3c144
--- /dev/null
+++ b/third_party/mast3r/dust3r/croco/datasets/habitat_sim/multiview_habitat_sim_generator.py
@@ -0,0 +1,390 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+import os
+import numpy as np
+import quaternion
+import habitat_sim
+import json
+from sklearn.neighbors import NearestNeighbors
+import cv2
+
+# OpenCV to habitat camera convention transformation
+R_OPENCV2HABITAT = np.stack((habitat_sim.geo.RIGHT, -habitat_sim.geo.UP, habitat_sim.geo.FRONT), axis=0)
+R_HABITAT2OPENCV = R_OPENCV2HABITAT.T
+DEG2RAD = np.pi / 180
+
+def compute_camera_intrinsics(height, width, hfov):
+ f = width/2 / np.tan(hfov/2 * np.pi/180)
+ cu, cv = width/2, height/2
+ return f, cu, cv
+
+def compute_camera_pose_opencv_convention(camera_position, camera_orientation):
+ R_cam2world = quaternion.as_rotation_matrix(camera_orientation) @ R_OPENCV2HABITAT
+ t_cam2world = np.asarray(camera_position)
+ return R_cam2world, t_cam2world
+
+def compute_pointmap(depthmap, hfov):
+ """ Compute a HxWx3 pointmap in camera frame from a HxW depth map."""
+ height, width = depthmap.shape
+ f, cu, cv = compute_camera_intrinsics(height, width, hfov)
+ # Cast depth map to point
+ z_cam = depthmap
+ u, v = np.meshgrid(range(width), range(height))
+ x_cam = (u - cu) / f * z_cam
+ y_cam = (v - cv) / f * z_cam
+ X_cam = np.stack((x_cam, y_cam, z_cam), axis=-1)
+ return X_cam
+
+def compute_pointcloud(depthmap, hfov, camera_position, camera_rotation):
+ """Return a 3D point cloud corresponding to valid pixels of the depth map"""
+ R_cam2world, t_cam2world = compute_camera_pose_opencv_convention(camera_position, camera_rotation)
+
+ X_cam = compute_pointmap(depthmap=depthmap, hfov=hfov)
+ valid_mask = (X_cam[:,:,2] != 0.0)
+
+ X_cam = X_cam.reshape(-1, 3)[valid_mask.flatten()]
+ X_world = X_cam @ R_cam2world.T + t_cam2world.reshape(1, 3)
+ return X_world
+
+def compute_pointcloud_overlaps_scikit(pointcloud1, pointcloud2, distance_threshold, compute_symmetric=False):
+ """
+ Compute 'overlapping' metrics based on a distance threshold between two point clouds.
+ """
+ nbrs = NearestNeighbors(n_neighbors=1, algorithm = 'kd_tree').fit(pointcloud2)
+ distances, indices = nbrs.kneighbors(pointcloud1)
+ intersection1 = np.count_nonzero(distances.flatten() < distance_threshold)
+
+ data = {"intersection1": intersection1,
+ "size1": len(pointcloud1)}
+ if compute_symmetric:
+ nbrs = NearestNeighbors(n_neighbors=1, algorithm = 'kd_tree').fit(pointcloud1)
+ distances, indices = nbrs.kneighbors(pointcloud2)
+ intersection2 = np.count_nonzero(distances.flatten() < distance_threshold)
+ data["intersection2"] = intersection2
+ data["size2"] = len(pointcloud2)
+
+ return data
+
+def _append_camera_parameters(observation, hfov, camera_location, camera_rotation):
+ """
+ Add camera parameters to the observation dictionnary produced by Habitat-Sim
+ In-place modifications.
+ """
+ R_cam2world, t_cam2world = compute_camera_pose_opencv_convention(camera_location, camera_rotation)
+ height, width = observation['depth'].shape
+ f, cu, cv = compute_camera_intrinsics(height, width, hfov)
+ K = np.asarray([[f, 0, cu],
+ [0, f, cv],
+ [0, 0, 1.0]])
+ observation["camera_intrinsics"] = K
+ observation["t_cam2world"] = t_cam2world
+ observation["R_cam2world"] = R_cam2world
+
+def look_at(eye, center, up, return_cam2world=True):
+ """
+ Return camera pose looking at a given center point.
+ Analogous of gluLookAt function, using OpenCV camera convention.
+ """
+ z = center - eye
+ z /= np.linalg.norm(z, axis=-1, keepdims=True)
+ y = -up
+ y = y - np.sum(y * z, axis=-1, keepdims=True) * z
+ y /= np.linalg.norm(y, axis=-1, keepdims=True)
+ x = np.cross(y, z, axis=-1)
+
+ if return_cam2world:
+ R = np.stack((x, y, z), axis=-1)
+ t = eye
+ else:
+ # World to camera transformation
+ # Transposed matrix
+ R = np.stack((x, y, z), axis=-2)
+ t = - np.einsum('...ij, ...j', R, eye)
+ return R, t
+
+def look_at_for_habitat(eye, center, up, return_cam2world=True):
+ R, t = look_at(eye, center, up)
+ orientation = quaternion.from_rotation_matrix(R @ R_OPENCV2HABITAT.T)
+ return orientation, t
+
+def generate_orientation_noise(pan_range, tilt_range, roll_range):
+ return (quaternion.from_rotation_vector(np.random.uniform(*pan_range) * DEG2RAD * habitat_sim.geo.UP)
+ * quaternion.from_rotation_vector(np.random.uniform(*tilt_range) * DEG2RAD * habitat_sim.geo.RIGHT)
+ * quaternion.from_rotation_vector(np.random.uniform(*roll_range) * DEG2RAD * habitat_sim.geo.FRONT))
+
+
+class NoNaviguableSpaceError(RuntimeError):
+ def __init__(self, *args):
+ super().__init__(*args)
+
+class MultiviewHabitatSimGenerator:
+ def __init__(self,
+ scene,
+ navmesh,
+ scene_dataset_config_file,
+ resolution = (240, 320),
+ views_count=2,
+ hfov = 60,
+ gpu_id = 0,
+ size = 10000,
+ minimum_covisibility = 0.5,
+ transform = None):
+ self.scene = scene
+ self.navmesh = navmesh
+ self.scene_dataset_config_file = scene_dataset_config_file
+ self.resolution = resolution
+ self.views_count = views_count
+ assert(self.views_count >= 1)
+ self.hfov = hfov
+ self.gpu_id = gpu_id
+ self.size = size
+ self.transform = transform
+
+ # Noise added to camera orientation
+ self.pan_range = (-3, 3)
+ self.tilt_range = (-10, 10)
+ self.roll_range = (-5, 5)
+
+ # Height range to sample cameras
+ self.height_range = (1.2, 1.8)
+
+ # Random steps between the camera views
+ self.random_steps_count = 5
+ self.random_step_variance = 2.0
+
+ # Minimum fraction of the scene which should be valid (well defined depth)
+ self.minimum_valid_fraction = 0.7
+
+ # Distance threshold to see to select pairs
+ self.distance_threshold = 0.05
+ # Minimum IoU of a view point cloud with respect to the reference view to be kept.
+ self.minimum_covisibility = minimum_covisibility
+
+ # Maximum number of retries.
+ self.max_attempts_count = 100
+
+ self.seed = None
+ self._lazy_initialization()
+
+ def _lazy_initialization(self):
+ # Lazy random seeding and instantiation of the simulator to deal with multiprocessing properly
+ if self.seed == None:
+ # Re-seed numpy generator
+ np.random.seed()
+ self.seed = np.random.randint(2**32-1)
+ sim_cfg = habitat_sim.SimulatorConfiguration()
+ sim_cfg.scene_id = self.scene
+ if self.scene_dataset_config_file is not None and self.scene_dataset_config_file != "":
+ sim_cfg.scene_dataset_config_file = self.scene_dataset_config_file
+ sim_cfg.random_seed = self.seed
+ sim_cfg.load_semantic_mesh = False
+ sim_cfg.gpu_device_id = self.gpu_id
+
+ depth_sensor_spec = habitat_sim.CameraSensorSpec()
+ depth_sensor_spec.uuid = "depth"
+ depth_sensor_spec.sensor_type = habitat_sim.SensorType.DEPTH
+ depth_sensor_spec.resolution = self.resolution
+ depth_sensor_spec.hfov = self.hfov
+ depth_sensor_spec.position = [0.0, 0.0, 0]
+ depth_sensor_spec.orientation
+
+ rgb_sensor_spec = habitat_sim.CameraSensorSpec()
+ rgb_sensor_spec.uuid = "color"
+ rgb_sensor_spec.sensor_type = habitat_sim.SensorType.COLOR
+ rgb_sensor_spec.resolution = self.resolution
+ rgb_sensor_spec.hfov = self.hfov
+ rgb_sensor_spec.position = [0.0, 0.0, 0]
+ agent_cfg = habitat_sim.agent.AgentConfiguration(sensor_specifications=[rgb_sensor_spec, depth_sensor_spec])
+
+ cfg = habitat_sim.Configuration(sim_cfg, [agent_cfg])
+ self.sim = habitat_sim.Simulator(cfg)
+ if self.navmesh is not None and self.navmesh != "":
+ # Use pre-computed navmesh when available (usually better than those generated automatically)
+ self.sim.pathfinder.load_nav_mesh(self.navmesh)
+
+ if not self.sim.pathfinder.is_loaded:
+ # Try to compute a navmesh
+ navmesh_settings = habitat_sim.NavMeshSettings()
+ navmesh_settings.set_defaults()
+ self.sim.recompute_navmesh(self.sim.pathfinder, navmesh_settings, True)
+
+ # Ensure that the navmesh is not empty
+ if not self.sim.pathfinder.is_loaded:
+ raise NoNaviguableSpaceError(f"No naviguable location (scene: {self.scene} -- navmesh: {self.navmesh})")
+
+ self.agent = self.sim.initialize_agent(agent_id=0)
+
+ def close(self):
+ self.sim.close()
+
+ def __del__(self):
+ self.sim.close()
+
+ def __len__(self):
+ return self.size
+
+ def sample_random_viewpoint(self):
+ """ Sample a random viewpoint using the navmesh """
+ nav_point = self.sim.pathfinder.get_random_navigable_point()
+
+ # Sample a random viewpoint height
+ viewpoint_height = np.random.uniform(*self.height_range)
+ viewpoint_position = nav_point + viewpoint_height * habitat_sim.geo.UP
+ viewpoint_orientation = quaternion.from_rotation_vector(np.random.uniform(0, 2 * np.pi) * habitat_sim.geo.UP) * generate_orientation_noise(self.pan_range, self.tilt_range, self.roll_range)
+ return viewpoint_position, viewpoint_orientation, nav_point
+
+ def sample_other_random_viewpoint(self, observed_point, nav_point):
+ """ Sample a random viewpoint close to an existing one, using the navmesh and a reference observed point."""
+ other_nav_point = nav_point
+
+ walk_directions = self.random_step_variance * np.asarray([1,0,1])
+ for i in range(self.random_steps_count):
+ temp = self.sim.pathfinder.snap_point(other_nav_point + walk_directions * np.random.normal(size=3))
+ # Snapping may return nan when it fails
+ if not np.isnan(temp[0]):
+ other_nav_point = temp
+
+ other_viewpoint_height = np.random.uniform(*self.height_range)
+ other_viewpoint_position = other_nav_point + other_viewpoint_height * habitat_sim.geo.UP
+
+ # Set viewing direction towards the central point
+ rotation, position = look_at_for_habitat(eye=other_viewpoint_position, center=observed_point, up=habitat_sim.geo.UP, return_cam2world=True)
+ rotation = rotation * generate_orientation_noise(self.pan_range, self.tilt_range, self.roll_range)
+ return position, rotation, other_nav_point
+
+ def is_other_pointcloud_overlapping(self, ref_pointcloud, other_pointcloud):
+ """ Check if a viewpoint is valid and overlaps significantly with a reference one. """
+ # Observation
+ pixels_count = self.resolution[0] * self.resolution[1]
+ valid_fraction = len(other_pointcloud) / pixels_count
+ assert valid_fraction <= 1.0 and valid_fraction >= 0.0
+ overlap = compute_pointcloud_overlaps_scikit(ref_pointcloud, other_pointcloud, self.distance_threshold, compute_symmetric=True)
+ covisibility = min(overlap["intersection1"] / pixels_count, overlap["intersection2"] / pixels_count)
+ is_valid = (valid_fraction >= self.minimum_valid_fraction) and (covisibility >= self.minimum_covisibility)
+ return is_valid, valid_fraction, covisibility
+
+ def is_other_viewpoint_overlapping(self, ref_pointcloud, observation, position, rotation):
+ """ Check if a viewpoint is valid and overlaps significantly with a reference one. """
+ # Observation
+ other_pointcloud = compute_pointcloud(observation['depth'], self.hfov, position, rotation)
+ return self.is_other_pointcloud_overlapping(ref_pointcloud, other_pointcloud)
+
+ def render_viewpoint(self, viewpoint_position, viewpoint_orientation):
+ agent_state = habitat_sim.AgentState()
+ agent_state.position = viewpoint_position
+ agent_state.rotation = viewpoint_orientation
+ self.agent.set_state(agent_state)
+ viewpoint_observations = self.sim.get_sensor_observations(agent_ids=0)
+ _append_camera_parameters(viewpoint_observations, self.hfov, viewpoint_position, viewpoint_orientation)
+ return viewpoint_observations
+
+ def __getitem__(self, useless_idx):
+ ref_position, ref_orientation, nav_point = self.sample_random_viewpoint()
+ ref_observations = self.render_viewpoint(ref_position, ref_orientation)
+ # Extract point cloud
+ ref_pointcloud = compute_pointcloud(depthmap=ref_observations['depth'], hfov=self.hfov,
+ camera_position=ref_position, camera_rotation=ref_orientation)
+
+ pixels_count = self.resolution[0] * self.resolution[1]
+ ref_valid_fraction = len(ref_pointcloud) / pixels_count
+ assert ref_valid_fraction <= 1.0 and ref_valid_fraction >= 0.0
+ if ref_valid_fraction < self.minimum_valid_fraction:
+ # This should produce a recursion error at some point when something is very wrong.
+ return self[0]
+ # Pick an reference observed point in the point cloud
+ observed_point = np.mean(ref_pointcloud, axis=0)
+
+ # Add the first image as reference
+ viewpoints_observations = [ref_observations]
+ viewpoints_covisibility = [ref_valid_fraction]
+ viewpoints_positions = [ref_position]
+ viewpoints_orientations = [quaternion.as_float_array(ref_orientation)]
+ viewpoints_clouds = [ref_pointcloud]
+ viewpoints_valid_fractions = [ref_valid_fraction]
+
+ for _ in range(self.views_count - 1):
+ # Generate an other viewpoint using some dummy random walk
+ successful_sampling = False
+ for sampling_attempt in range(self.max_attempts_count):
+ position, rotation, _ = self.sample_other_random_viewpoint(observed_point, nav_point)
+ # Observation
+ other_viewpoint_observations = self.render_viewpoint(position, rotation)
+ other_pointcloud = compute_pointcloud(other_viewpoint_observations['depth'], self.hfov, position, rotation)
+
+ is_valid, valid_fraction, covisibility = self.is_other_pointcloud_overlapping(ref_pointcloud, other_pointcloud)
+ if is_valid:
+ successful_sampling = True
+ break
+ if not successful_sampling:
+ print("WARNING: Maximum number of attempts reached.")
+ # Dirty hack, try using a novel original viewpoint
+ return self[0]
+ viewpoints_observations.append(other_viewpoint_observations)
+ viewpoints_covisibility.append(covisibility)
+ viewpoints_positions.append(position)
+ viewpoints_orientations.append(quaternion.as_float_array(rotation)) # WXYZ convention for the quaternion encoding.
+ viewpoints_clouds.append(other_pointcloud)
+ viewpoints_valid_fractions.append(valid_fraction)
+
+ # Estimate relations between all pairs of images
+ pairwise_visibility_ratios = np.ones((len(viewpoints_observations), len(viewpoints_observations)))
+ for i in range(len(viewpoints_observations)):
+ pairwise_visibility_ratios[i,i] = viewpoints_valid_fractions[i]
+ for j in range(i+1, len(viewpoints_observations)):
+ overlap = compute_pointcloud_overlaps_scikit(viewpoints_clouds[i], viewpoints_clouds[j], self.distance_threshold, compute_symmetric=True)
+ pairwise_visibility_ratios[i,j] = overlap['intersection1'] / pixels_count
+ pairwise_visibility_ratios[j,i] = overlap['intersection2'] / pixels_count
+
+ # IoU is relative to the image 0
+ data = {"observations": viewpoints_observations,
+ "positions": np.asarray(viewpoints_positions),
+ "orientations": np.asarray(viewpoints_orientations),
+ "covisibility_ratios": np.asarray(viewpoints_covisibility),
+ "valid_fractions": np.asarray(viewpoints_valid_fractions, dtype=float),
+ "pairwise_visibility_ratios": np.asarray(pairwise_visibility_ratios, dtype=float),
+ }
+
+ if self.transform is not None:
+ data = self.transform(data)
+ return data
+
+ def generate_random_spiral_trajectory(self, images_count = 100, max_radius=0.5, half_turns=5, use_constant_orientation=False):
+ """
+ Return a list of images corresponding to a spiral trajectory from a random starting point.
+ Useful to generate nice visualisations.
+ Use an even number of half turns to get a nice "C1-continuous" loop effect
+ """
+ ref_position, ref_orientation, navpoint = self.sample_random_viewpoint()
+ ref_observations = self.render_viewpoint(ref_position, ref_orientation)
+ ref_pointcloud = compute_pointcloud(depthmap=ref_observations['depth'], hfov=self.hfov,
+ camera_position=ref_position, camera_rotation=ref_orientation)
+ pixels_count = self.resolution[0] * self.resolution[1]
+ if len(ref_pointcloud) / pixels_count < self.minimum_valid_fraction:
+ # Dirty hack: ensure that the valid part of the image is significant
+ return self.generate_random_spiral_trajectory(images_count, max_radius, half_turns, use_constant_orientation)
+
+ # Pick an observed point in the point cloud
+ observed_point = np.mean(ref_pointcloud, axis=0)
+ ref_R, ref_t = compute_camera_pose_opencv_convention(ref_position, ref_orientation)
+
+ images = []
+ is_valid = []
+ # Spiral trajectory, use_constant orientation
+ for i, alpha in enumerate(np.linspace(0, 1, images_count)):
+ r = max_radius * np.abs(np.sin(alpha * np.pi)) # Increase then decrease the radius
+ theta = alpha * half_turns * np.pi
+ x = r * np.cos(theta)
+ y = r * np.sin(theta)
+ z = 0.0
+ position = ref_position + (ref_R @ np.asarray([x, y, z]).reshape(3,1)).flatten()
+ if use_constant_orientation:
+ orientation = ref_orientation
+ else:
+ # trajectory looking at a mean point in front of the ref observation
+ orientation, position = look_at_for_habitat(eye=position, center=observed_point, up=habitat_sim.geo.UP)
+ observations = self.render_viewpoint(position, orientation)
+ images.append(observations['color'][...,:3])
+ _is_valid, valid_fraction, iou = self.is_other_viewpoint_overlapping(ref_pointcloud, observations, position, orientation)
+ is_valid.append(_is_valid)
+ return images, np.all(is_valid)
\ No newline at end of file
diff --git a/third_party/mast3r/dust3r/croco/datasets/habitat_sim/pack_metadata_files.py b/third_party/mast3r/dust3r/croco/datasets/habitat_sim/pack_metadata_files.py
new file mode 100644
index 0000000000000000000000000000000000000000..10672a01f7dd615d3b4df37781f7f6f97e753ba6
--- /dev/null
+++ b/third_party/mast3r/dust3r/croco/datasets/habitat_sim/pack_metadata_files.py
@@ -0,0 +1,69 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+"""
+Utility script to pack metadata files of the dataset in order to be able to re-generate it elsewhere.
+"""
+import os
+import glob
+from tqdm import tqdm
+import shutil
+import json
+from datasets.habitat_sim.paths import *
+import argparse
+import collections
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("input_dir")
+ parser.add_argument("output_dir")
+ args = parser.parse_args()
+
+ input_dirname = args.input_dir
+ output_dirname = args.output_dir
+
+ input_metadata_filenames = glob.iglob(f"{input_dirname}/**/metadata.json", recursive=True)
+
+ images_count = collections.defaultdict(lambda : 0)
+
+ os.makedirs(output_dirname)
+ for input_filename in tqdm(input_metadata_filenames):
+ # Ignore empty files
+ with open(input_filename, "r") as f:
+ original_metadata = json.load(f)
+ if "multiviews" not in original_metadata or len(original_metadata["multiviews"]) == 0:
+ print("No views in", input_filename)
+ continue
+
+ relpath = os.path.relpath(input_filename, input_dirname)
+ print(relpath)
+
+ # Copy metadata, while replacing scene paths by generic keys depending on the dataset, for portability.
+ # Data paths are sorted by decreasing length to avoid potential bugs due to paths starting by the same string pattern.
+ scenes_dataset_paths = dict(sorted(SCENES_DATASET.items(), key=lambda x: len(x[1]), reverse=True))
+ metadata = dict()
+ for key, value in original_metadata.items():
+ if key in ("scene_dataset_config_file", "scene", "navmesh") and value != "":
+ known_path = False
+ for dataset, dataset_path in scenes_dataset_paths.items():
+ if value.startswith(dataset_path):
+ value = os.path.join(dataset, os.path.relpath(value, dataset_path))
+ known_path = True
+ break
+ if not known_path:
+ raise KeyError("Unknown path:" + value)
+ metadata[key] = value
+
+ # Compile some general statistics while packing data
+ scene_split = metadata["scene"].split("/")
+ upper_level = "/".join(scene_split[:2]) if scene_split[0] == "hm3d" else scene_split[0]
+ images_count[upper_level] += len(metadata["multiviews"])
+
+ output_filename = os.path.join(output_dirname, relpath)
+ os.makedirs(os.path.dirname(output_filename), exist_ok=True)
+ with open(output_filename, "w") as f:
+ json.dump(metadata, f)
+
+ # Print statistics
+ print("Images count:")
+ for upper_level, count in images_count.items():
+ print(f"- {upper_level}: {count}")
\ No newline at end of file
diff --git a/third_party/mast3r/dust3r/croco/datasets/habitat_sim/paths.py b/third_party/mast3r/dust3r/croco/datasets/habitat_sim/paths.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d63b5fa29c274ddfeae084734a35ba66d7edee8
--- /dev/null
+++ b/third_party/mast3r/dust3r/croco/datasets/habitat_sim/paths.py
@@ -0,0 +1,129 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+"""
+Paths to Habitat-Sim scenes
+"""
+
+import os
+import json
+import collections
+from tqdm import tqdm
+
+
+# Hardcoded path to the different scene datasets
+SCENES_DATASET = {
+ "hm3d": "./data/habitat-sim-data/scene_datasets/hm3d/",
+ "gibson": "./data/habitat-sim-data/scene_datasets/gibson/",
+ "habitat-test-scenes": "./data/habitat-sim/scene_datasets/habitat-test-scenes/",
+ "replica_cad_baked_lighting": "./data/habitat-sim/scene_datasets/replica_cad_baked_lighting/",
+ "replica_cad": "./data/habitat-sim/scene_datasets/replica_cad/",
+ "replica": "./data/habitat-sim/scene_datasets/ReplicaDataset/",
+ "scannet": "./data/habitat-sim/scene_datasets/scannet/"
+}
+
+SceneData = collections.namedtuple("SceneData", ["scene_dataset_config_file", "scene", "navmesh", "output_dir"])
+
+def list_replicacad_scenes(base_output_dir, base_path=SCENES_DATASET["replica_cad"]):
+ scene_dataset_config_file = os.path.join(base_path, "replicaCAD.scene_dataset_config.json")
+ scenes = [f"apt_{i}" for i in range(6)] + ["empty_stage"]
+ navmeshes = [f"navmeshes/apt_{i}_static_furniture.navmesh" for i in range(6)] + ["empty_stage.navmesh"]
+ scenes_data = []
+ for idx in range(len(scenes)):
+ output_dir = os.path.join(base_output_dir, "ReplicaCAD", scenes[idx])
+ # Add scene
+ data = SceneData(scene_dataset_config_file=scene_dataset_config_file,
+ scene = scenes[idx] + ".scene_instance.json",
+ navmesh = os.path.join(base_path, navmeshes[idx]),
+ output_dir = output_dir)
+ scenes_data.append(data)
+ return scenes_data
+
+def list_replica_cad_baked_lighting_scenes(base_output_dir, base_path=SCENES_DATASET["replica_cad_baked_lighting"]):
+ scene_dataset_config_file = os.path.join(base_path, "replicaCAD_baked.scene_dataset_config.json")
+ scenes = sum([[f"Baked_sc{i}_staging_{j:02}" for i in range(5)] for j in range(21)], [])
+ navmeshes = ""#[f"navmeshes/apt_{i}_static_furniture.navmesh" for i in range(6)] + ["empty_stage.navmesh"]
+ scenes_data = []
+ for idx in range(len(scenes)):
+ output_dir = os.path.join(base_output_dir, "replica_cad_baked_lighting", scenes[idx])
+ data = SceneData(scene_dataset_config_file=scene_dataset_config_file,
+ scene = scenes[idx],
+ navmesh = "",
+ output_dir = output_dir)
+ scenes_data.append(data)
+ return scenes_data
+
+def list_replica_scenes(base_output_dir, base_path):
+ scenes_data = []
+ for scene_id in os.listdir(base_path):
+ scene = os.path.join(base_path, scene_id, "mesh.ply")
+ navmesh = os.path.join(base_path, scene_id, "habitat/mesh_preseg_semantic.navmesh") # Not sure if I should use it
+ scene_dataset_config_file = ""
+ output_dir = os.path.join(base_output_dir, scene_id)
+ # Add scene only if it does not exist already, or if exist_ok
+ data = SceneData(scene_dataset_config_file = scene_dataset_config_file,
+ scene = scene,
+ navmesh = navmesh,
+ output_dir = output_dir)
+ scenes_data.append(data)
+ return scenes_data
+
+
+def list_scenes(base_output_dir, base_path):
+ """
+ Generic method iterating through a base_path folder to find scenes.
+ """
+ scenes_data = []
+ for root, dirs, files in os.walk(base_path, followlinks=True):
+ folder_scenes_data = []
+ for file in files:
+ name, ext = os.path.splitext(file)
+ if ext == ".glb":
+ scene = os.path.join(root, name + ".glb")
+ navmesh = os.path.join(root, name + ".navmesh")
+ if not os.path.exists(navmesh):
+ navmesh = ""
+ relpath = os.path.relpath(root, base_path)
+ output_dir = os.path.abspath(os.path.join(base_output_dir, relpath, name))
+ data = SceneData(scene_dataset_config_file="",
+ scene = scene,
+ navmesh = navmesh,
+ output_dir = output_dir)
+ folder_scenes_data.append(data)
+
+ # Specific check for HM3D:
+ # When two meshesxxxx.basis.glb and xxxx.glb are present, use the 'basis' version.
+ basis_scenes = [data.scene[:-len(".basis.glb")] for data in folder_scenes_data if data.scene.endswith(".basis.glb")]
+ if len(basis_scenes) != 0:
+ folder_scenes_data = [data for data in folder_scenes_data if not (data.scene[:-len(".glb")] in basis_scenes)]
+
+ scenes_data.extend(folder_scenes_data)
+ return scenes_data
+
+def list_scenes_available(base_output_dir, scenes_dataset_paths=SCENES_DATASET):
+ scenes_data = []
+
+ # HM3D
+ for split in ("minival", "train", "val", "examples"):
+ scenes_data += list_scenes(base_output_dir=os.path.join(base_output_dir, f"hm3d/{split}/"),
+ base_path=f"{scenes_dataset_paths['hm3d']}/{split}")
+
+ # Gibson
+ scenes_data += list_scenes(base_output_dir=os.path.join(base_output_dir, "gibson"),
+ base_path=scenes_dataset_paths["gibson"])
+
+ # Habitat test scenes (just a few)
+ scenes_data += list_scenes(base_output_dir=os.path.join(base_output_dir, "habitat-test-scenes"),
+ base_path=scenes_dataset_paths["habitat-test-scenes"])
+
+ # ReplicaCAD (baked lightning)
+ scenes_data += list_replica_cad_baked_lighting_scenes(base_output_dir=base_output_dir)
+
+ # ScanNet
+ scenes_data += list_scenes(base_output_dir=os.path.join(base_output_dir, "scannet"),
+ base_path=scenes_dataset_paths["scannet"])
+
+ # Replica
+ list_replica_scenes(base_output_dir=os.path.join(base_output_dir, "replica"),
+ base_path=scenes_dataset_paths["replica"])
+ return scenes_data
diff --git a/third_party/mast3r/dust3r/croco/datasets/pairs_dataset.py b/third_party/mast3r/dust3r/croco/datasets/pairs_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f107526b34e154d9013a9a7a0bde3d5ff6f581c
--- /dev/null
+++ b/third_party/mast3r/dust3r/croco/datasets/pairs_dataset.py
@@ -0,0 +1,109 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+import os
+from torch.utils.data import Dataset
+from PIL import Image
+
+from datasets.transforms import get_pair_transforms
+
+def load_image(impath):
+ return Image.open(impath)
+
+def load_pairs_from_cache_file(fname, root=''):
+ assert os.path.isfile(fname), "cannot parse pairs from {:s}, file does not exist".format(fname)
+ with open(fname, 'r') as fid:
+ lines = fid.read().strip().splitlines()
+ pairs = [ (os.path.join(root,l.split()[0]), os.path.join(root,l.split()[1])) for l in lines]
+ return pairs
+
+def load_pairs_from_list_file(fname, root=''):
+ assert os.path.isfile(fname), "cannot parse pairs from {:s}, file does not exist".format(fname)
+ with open(fname, 'r') as fid:
+ lines = fid.read().strip().splitlines()
+ pairs = [ (os.path.join(root,l+'_1.jpg'), os.path.join(root,l+'_2.jpg')) for l in lines if not l.startswith('#')]
+ return pairs
+
+
+def write_cache_file(fname, pairs, root=''):
+ if len(root)>0:
+ if not root.endswith('/'): root+='/'
+ assert os.path.isdir(root)
+ s = ''
+ for im1, im2 in pairs:
+ if len(root)>0:
+ assert im1.startswith(root), im1
+ assert im2.startswith(root), im2
+ s += '{:s} {:s}\n'.format(im1[len(root):], im2[len(root):])
+ with open(fname, 'w') as fid:
+ fid.write(s[:-1])
+
+def parse_and_cache_all_pairs(dname, data_dir='./data/'):
+ if dname=='habitat_release':
+ dirname = os.path.join(data_dir, 'habitat_release')
+ assert os.path.isdir(dirname), "cannot find folder for habitat_release pairs: "+dirname
+ cache_file = os.path.join(dirname, 'pairs.txt')
+ assert not os.path.isfile(cache_file), "cache file already exists: "+cache_file
+
+ print('Parsing pairs for dataset: '+dname)
+ pairs = []
+ for root, dirs, files in os.walk(dirname):
+ if 'val' in root: continue
+ dirs.sort()
+ pairs += [ (os.path.join(root,f), os.path.join(root,f[:-len('_1.jpeg')]+'_2.jpeg')) for f in sorted(files) if f.endswith('_1.jpeg')]
+ print('Found {:,} pairs'.format(len(pairs)))
+ print('Writing cache to: '+cache_file)
+ write_cache_file(cache_file, pairs, root=dirname)
+
+ else:
+ raise NotImplementedError('Unknown dataset: '+dname)
+
+def dnames_to_image_pairs(dnames, data_dir='./data/'):
+ """
+ dnames: list of datasets with image pairs, separated by +
+ """
+ all_pairs = []
+ for dname in dnames.split('+'):
+ if dname=='habitat_release':
+ dirname = os.path.join(data_dir, 'habitat_release')
+ assert os.path.isdir(dirname), "cannot find folder for habitat_release pairs: "+dirname
+ cache_file = os.path.join(dirname, 'pairs.txt')
+ assert os.path.isfile(cache_file), "cannot find cache file for habitat_release pairs, please first create the cache file, see instructions. "+cache_file
+ pairs = load_pairs_from_cache_file(cache_file, root=dirname)
+ elif dname in ['ARKitScenes', 'MegaDepth', '3DStreetView', 'IndoorVL']:
+ dirname = os.path.join(data_dir, dname+'_crops')
+ assert os.path.isdir(dirname), "cannot find folder for {:s} pairs: {:s}".format(dname, dirname)
+ list_file = os.path.join(dirname, 'listing.txt')
+ assert os.path.isfile(list_file), "cannot find list file for {:s} pairs, see instructions. {:s}".format(dname, list_file)
+ pairs = load_pairs_from_list_file(list_file, root=dirname)
+ print(' {:s}: {:,} pairs'.format(dname, len(pairs)))
+ all_pairs += pairs
+ if '+' in dnames: print(' Total: {:,} pairs'.format(len(all_pairs)))
+ return all_pairs
+
+
+class PairsDataset(Dataset):
+
+ def __init__(self, dnames, trfs='', totensor=True, normalize=True, data_dir='./data/'):
+ super().__init__()
+ self.image_pairs = dnames_to_image_pairs(dnames, data_dir=data_dir)
+ self.transforms = get_pair_transforms(transform_str=trfs, totensor=totensor, normalize=normalize)
+
+ def __len__(self):
+ return len(self.image_pairs)
+
+ def __getitem__(self, index):
+ im1path, im2path = self.image_pairs[index]
+ im1 = load_image(im1path)
+ im2 = load_image(im2path)
+ if self.transforms is not None: im1, im2 = self.transforms(im1, im2)
+ return im1, im2
+
+
+if __name__=="__main__":
+ import argparse
+ parser = argparse.ArgumentParser(prog="Computing and caching list of pairs for a given dataset")
+ parser.add_argument('--data_dir', default='./data/', type=str, help="path where data are stored")
+ parser.add_argument('--dataset', default='habitat_release', type=str, help="name of the dataset")
+ args = parser.parse_args()
+ parse_and_cache_all_pairs(dname=args.dataset, data_dir=args.data_dir)
diff --git a/third_party/mast3r/dust3r/croco/datasets/transforms.py b/third_party/mast3r/dust3r/croco/datasets/transforms.py
new file mode 100644
index 0000000000000000000000000000000000000000..216bac61f8254fd50e7f269ee80301f250a2d11e
--- /dev/null
+++ b/third_party/mast3r/dust3r/croco/datasets/transforms.py
@@ -0,0 +1,95 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+import torch
+import torchvision.transforms
+import torchvision.transforms.functional as F
+
+# "Pair": apply a transform on a pair
+# "Both": apply the exact same transform to both images
+
+class ComposePair(torchvision.transforms.Compose):
+ def __call__(self, img1, img2):
+ for t in self.transforms:
+ img1, img2 = t(img1, img2)
+ return img1, img2
+
+class NormalizeBoth(torchvision.transforms.Normalize):
+ def forward(self, img1, img2):
+ img1 = super().forward(img1)
+ img2 = super().forward(img2)
+ return img1, img2
+
+class ToTensorBoth(torchvision.transforms.ToTensor):
+ def __call__(self, img1, img2):
+ img1 = super().__call__(img1)
+ img2 = super().__call__(img2)
+ return img1, img2
+
+class RandomCropPair(torchvision.transforms.RandomCrop):
+ # the crop will be intentionally different for the two images with this class
+ def forward(self, img1, img2):
+ img1 = super().forward(img1)
+ img2 = super().forward(img2)
+ return img1, img2
+
+class ColorJitterPair(torchvision.transforms.ColorJitter):
+ # can be symmetric (same for both images) or assymetric (different jitter params for each image) depending on assymetric_prob
+ def __init__(self, assymetric_prob, **kwargs):
+ super().__init__(**kwargs)
+ self.assymetric_prob = assymetric_prob
+ def jitter_one(self, img, fn_idx, brightness_factor, contrast_factor, saturation_factor, hue_factor):
+ for fn_id in fn_idx:
+ if fn_id == 0 and brightness_factor is not None:
+ img = F.adjust_brightness(img, brightness_factor)
+ elif fn_id == 1 and contrast_factor is not None:
+ img = F.adjust_contrast(img, contrast_factor)
+ elif fn_id == 2 and saturation_factor is not None:
+ img = F.adjust_saturation(img, saturation_factor)
+ elif fn_id == 3 and hue_factor is not None:
+ img = F.adjust_hue(img, hue_factor)
+ return img
+
+ def forward(self, img1, img2):
+
+ fn_idx, brightness_factor, contrast_factor, saturation_factor, hue_factor = self.get_params(
+ self.brightness, self.contrast, self.saturation, self.hue
+ )
+ img1 = self.jitter_one(img1, fn_idx, brightness_factor, contrast_factor, saturation_factor, hue_factor)
+ if torch.rand(1) < self.assymetric_prob: # assymetric:
+ fn_idx, brightness_factor, contrast_factor, saturation_factor, hue_factor = self.get_params(
+ self.brightness, self.contrast, self.saturation, self.hue
+ )
+ img2 = self.jitter_one(img2, fn_idx, brightness_factor, contrast_factor, saturation_factor, hue_factor)
+ return img1, img2
+
+def get_pair_transforms(transform_str, totensor=True, normalize=True):
+ # transform_str is eg crop224+color
+ trfs = []
+ for s in transform_str.split('+'):
+ if s.startswith('crop'):
+ size = int(s[len('crop'):])
+ trfs.append(RandomCropPair(size))
+ elif s=='acolor':
+ trfs.append(ColorJitterPair(assymetric_prob=1.0, brightness=(0.6, 1.4), contrast=(0.6, 1.4), saturation=(0.6, 1.4), hue=0.0))
+ elif s=='': # if transform_str was ""
+ pass
+ else:
+ raise NotImplementedError('Unknown augmentation: '+s)
+
+ if totensor:
+ trfs.append( ToTensorBoth() )
+ if normalize:
+ trfs.append( NormalizeBoth(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) )
+
+ if len(trfs)==0:
+ return None
+ elif len(trfs)==1:
+ return trfs
+ else:
+ return ComposePair(trfs)
+
+
+
+
+
diff --git a/third_party/mast3r/dust3r/croco/demo.py b/third_party/mast3r/dust3r/croco/demo.py
new file mode 100644
index 0000000000000000000000000000000000000000..91b80ccc5c98c18e20d1ce782511aa824ef28f77
--- /dev/null
+++ b/third_party/mast3r/dust3r/croco/demo.py
@@ -0,0 +1,55 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+import torch
+from models.croco import CroCoNet
+from PIL import Image
+import torchvision.transforms
+from torchvision.transforms import ToTensor, Normalize, Compose
+
+def main():
+ device = torch.device('cuda:0' if torch.cuda.is_available() and torch.cuda.device_count()>0 else 'cpu')
+
+ # load 224x224 images and transform them to tensor
+ imagenet_mean = [0.485, 0.456, 0.406]
+ imagenet_mean_tensor = torch.tensor(imagenet_mean).view(1,3,1,1).to(device, non_blocking=True)
+ imagenet_std = [0.229, 0.224, 0.225]
+ imagenet_std_tensor = torch.tensor(imagenet_std).view(1,3,1,1).to(device, non_blocking=True)
+ trfs = Compose([ToTensor(), Normalize(mean=imagenet_mean, std=imagenet_std)])
+ image1 = trfs(Image.open('assets/Chateau1.png').convert('RGB')).to(device, non_blocking=True).unsqueeze(0)
+ image2 = trfs(Image.open('assets/Chateau2.png').convert('RGB')).to(device, non_blocking=True).unsqueeze(0)
+
+ # load model
+ ckpt = torch.load('pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth', 'cpu')
+ model = CroCoNet( **ckpt.get('croco_kwargs',{})).to(device)
+ model.eval()
+ msg = model.load_state_dict(ckpt['model'], strict=True)
+
+ # forward
+ with torch.inference_mode():
+ out, mask, target = model(image1, image2)
+
+ # the output is normalized, thus use the mean/std of the actual image to go back to RGB space
+ patchified = model.patchify(image1)
+ mean = patchified.mean(dim=-1, keepdim=True)
+ var = patchified.var(dim=-1, keepdim=True)
+ decoded_image = model.unpatchify(out * (var + 1.e-6)**.5 + mean)
+ # undo imagenet normalization, prepare masked image
+ decoded_image = decoded_image * imagenet_std_tensor + imagenet_mean_tensor
+ input_image = image1 * imagenet_std_tensor + imagenet_mean_tensor
+ ref_image = image2 * imagenet_std_tensor + imagenet_mean_tensor
+ image_masks = model.unpatchify(model.patchify(torch.ones_like(ref_image)) * mask[:,:,None])
+ masked_input_image = ((1 - image_masks) * input_image)
+
+ # make visualization
+ visualization = torch.cat((ref_image, masked_input_image, decoded_image, input_image), dim=3) # 4*(B, 3, H, W) -> B, 3, H, W*4
+ B, C, H, W = visualization.shape
+ visualization = visualization.permute(1, 0, 2, 3).reshape(C, B*H, W)
+ visualization = torchvision.transforms.functional.to_pil_image(torch.clamp(visualization, 0, 1))
+ fname = "demo_output.png"
+ visualization.save(fname)
+ print('Visualization save in '+fname)
+
+
+if __name__=="__main__":
+ main()
diff --git a/third_party/mast3r/dust3r/croco/interactive_demo.ipynb b/third_party/mast3r/dust3r/croco/interactive_demo.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..6cfc960af5baac9a69029c29a16eea4e24123a71
--- /dev/null
+++ b/third_party/mast3r/dust3r/croco/interactive_demo.ipynb
@@ -0,0 +1,271 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Interactive demo of Cross-view Completion."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Copyright (C) 2022-present Naver Corporation. All rights reserved.\n",
+ "# Licensed under CC BY-NC-SA 4.0 (non-commercial use only)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import torch\n",
+ "import numpy as np\n",
+ "from models.croco import CroCoNet\n",
+ "from ipywidgets import interact, interactive, fixed, interact_manual\n",
+ "import ipywidgets as widgets\n",
+ "import matplotlib.pyplot as plt\n",
+ "import quaternion\n",
+ "import models.masking"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Load CroCo model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ckpt = torch.load('pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth', 'cpu')\n",
+ "model = CroCoNet( **ckpt.get('croco_kwargs',{}))\n",
+ "msg = model.load_state_dict(ckpt['model'], strict=True)\n",
+ "use_gpu = torch.cuda.is_available() and torch.cuda.device_count()>0\n",
+ "device = torch.device('cuda:0' if use_gpu else 'cpu')\n",
+ "model = model.eval()\n",
+ "model = model.to(device=device)\n",
+ "print(msg)\n",
+ "\n",
+ "def process_images(ref_image, target_image, masking_ratio, reconstruct_unmasked_patches=False):\n",
+ " \"\"\"\n",
+ " Perform Cross-View completion using two input images, specified using Numpy arrays.\n",
+ " \"\"\"\n",
+ " # Replace the mask generator\n",
+ " model.mask_generator = models.masking.RandomMask(model.patch_embed.num_patches, masking_ratio)\n",
+ "\n",
+ " # ImageNet-1k color normalization\n",
+ " imagenet_mean = torch.as_tensor([0.485, 0.456, 0.406]).reshape(1,3,1,1).to(device)\n",
+ " imagenet_std = torch.as_tensor([0.229, 0.224, 0.225]).reshape(1,3,1,1).to(device)\n",
+ "\n",
+ " normalize_input_colors = True\n",
+ " is_output_normalized = True\n",
+ " with torch.no_grad():\n",
+ " # Cast data to torch\n",
+ " target_image = (torch.as_tensor(target_image, dtype=torch.float, device=device).permute(2,0,1) / 255)[None]\n",
+ " ref_image = (torch.as_tensor(ref_image, dtype=torch.float, device=device).permute(2,0,1) / 255)[None]\n",
+ "\n",
+ " if normalize_input_colors:\n",
+ " ref_image = (ref_image - imagenet_mean) / imagenet_std\n",
+ " target_image = (target_image - imagenet_mean) / imagenet_std\n",
+ "\n",
+ " out, mask, _ = model(target_image, ref_image)\n",
+ " # # get target\n",
+ " if not is_output_normalized:\n",
+ " predicted_image = model.unpatchify(out)\n",
+ " else:\n",
+ " # The output only contains higher order information,\n",
+ " # we retrieve mean and standard deviation from the actual target image\n",
+ " patchified = model.patchify(target_image)\n",
+ " mean = patchified.mean(dim=-1, keepdim=True)\n",
+ " var = patchified.var(dim=-1, keepdim=True)\n",
+ " pred_renorm = out * (var + 1.e-6)**.5 + mean\n",
+ " predicted_image = model.unpatchify(pred_renorm)\n",
+ "\n",
+ " image_masks = model.unpatchify(model.patchify(torch.ones_like(ref_image)) * mask[:,:,None])\n",
+ " masked_target_image = (1 - image_masks) * target_image\n",
+ " \n",
+ " if not reconstruct_unmasked_patches:\n",
+ " # Replace unmasked patches by their actual values\n",
+ " predicted_image = predicted_image * image_masks + masked_target_image\n",
+ "\n",
+ " # Unapply color normalization\n",
+ " if normalize_input_colors:\n",
+ " predicted_image = predicted_image * imagenet_std + imagenet_mean\n",
+ " masked_target_image = masked_target_image * imagenet_std + imagenet_mean\n",
+ " \n",
+ " # Cast to Numpy\n",
+ " masked_target_image = np.asarray(torch.clamp(masked_target_image.squeeze(0).permute(1,2,0) * 255, 0, 255).cpu().numpy(), dtype=np.uint8)\n",
+ " predicted_image = np.asarray(torch.clamp(predicted_image.squeeze(0).permute(1,2,0) * 255, 0, 255).cpu().numpy(), dtype=np.uint8)\n",
+ " return masked_target_image, predicted_image"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Use the Habitat simulator to render images from arbitrary viewpoints (requires habitat_sim to be installed)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "os.environ[\"MAGNUM_LOG\"]=\"quiet\"\n",
+ "os.environ[\"HABITAT_SIM_LOG\"]=\"quiet\"\n",
+ "import habitat_sim\n",
+ "\n",
+ "scene = \"habitat-sim-data/scene_datasets/habitat-test-scenes/skokloster-castle.glb\"\n",
+ "navmesh = \"habitat-sim-data/scene_datasets/habitat-test-scenes/skokloster-castle.navmesh\"\n",
+ "\n",
+ "sim_cfg = habitat_sim.SimulatorConfiguration()\n",
+ "if use_gpu: sim_cfg.gpu_device_id = 0\n",
+ "sim_cfg.scene_id = scene\n",
+ "sim_cfg.load_semantic_mesh = False\n",
+ "rgb_sensor_spec = habitat_sim.CameraSensorSpec()\n",
+ "rgb_sensor_spec.uuid = \"color\"\n",
+ "rgb_sensor_spec.sensor_type = habitat_sim.SensorType.COLOR\n",
+ "rgb_sensor_spec.resolution = (224,224)\n",
+ "rgb_sensor_spec.hfov = 56.56\n",
+ "rgb_sensor_spec.position = [0.0, 0.0, 0.0]\n",
+ "rgb_sensor_spec.orientation = [0, 0, 0]\n",
+ "agent_cfg = habitat_sim.agent.AgentConfiguration(sensor_specifications=[rgb_sensor_spec])\n",
+ "\n",
+ "\n",
+ "cfg = habitat_sim.Configuration(sim_cfg, [agent_cfg])\n",
+ "sim = habitat_sim.Simulator(cfg)\n",
+ "if navmesh is not None:\n",
+ " sim.pathfinder.load_nav_mesh(navmesh)\n",
+ "agent = sim.initialize_agent(agent_id=0)\n",
+ "\n",
+ "def sample_random_viewpoint():\n",
+ " \"\"\" Sample a random viewpoint using the navmesh \"\"\"\n",
+ " nav_point = sim.pathfinder.get_random_navigable_point()\n",
+ " # Sample a random viewpoint height\n",
+ " viewpoint_height = np.random.uniform(1.0, 1.6)\n",
+ " viewpoint_position = nav_point + viewpoint_height * habitat_sim.geo.UP\n",
+ " viewpoint_orientation = quaternion.from_rotation_vector(np.random.uniform(-np.pi, np.pi) * habitat_sim.geo.UP)\n",
+ " return viewpoint_position, viewpoint_orientation\n",
+ "\n",
+ "def render_viewpoint(position, orientation):\n",
+ " agent_state = habitat_sim.AgentState()\n",
+ " agent_state.position = position\n",
+ " agent_state.rotation = orientation\n",
+ " agent.set_state(agent_state)\n",
+ " viewpoint_observations = sim.get_sensor_observations(agent_ids=0)\n",
+ " image = viewpoint_observations['color'][:,:,:3]\n",
+ " image = np.asarray(np.clip(1.5 * np.asarray(image, dtype=float), 0, 255), dtype=np.uint8)\n",
+ " return image"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Sample a random reference view"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ref_position, ref_orientation = sample_random_viewpoint()\n",
+ "ref_image = render_viewpoint(ref_position, ref_orientation)\n",
+ "plt.clf()\n",
+ "fig, axes = plt.subplots(1,1, squeeze=False, num=1)\n",
+ "axes[0,0].imshow(ref_image)\n",
+ "for ax in axes.flatten():\n",
+ " ax.set_xticks([])\n",
+ " ax.set_yticks([])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Interactive cross-view completion using CroCo"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "reconstruct_unmasked_patches = False\n",
+ "\n",
+ "def show_demo(masking_ratio, x, y, z, panorama, elevation):\n",
+ " R = quaternion.as_rotation_matrix(ref_orientation)\n",
+ " target_position = ref_position + x * R[:,0] + y * R[:,1] + z * R[:,2]\n",
+ " target_orientation = (ref_orientation\n",
+ " * quaternion.from_rotation_vector(-elevation * np.pi/180 * habitat_sim.geo.LEFT) \n",
+ " * quaternion.from_rotation_vector(-panorama * np.pi/180 * habitat_sim.geo.UP))\n",
+ " \n",
+ " ref_image = render_viewpoint(ref_position, ref_orientation)\n",
+ " target_image = render_viewpoint(target_position, target_orientation)\n",
+ "\n",
+ " masked_target_image, predicted_image = process_images(ref_image, target_image, masking_ratio, reconstruct_unmasked_patches)\n",
+ "\n",
+ " fig, axes = plt.subplots(1,4, squeeze=True, dpi=300)\n",
+ " axes[0].imshow(ref_image)\n",
+ " axes[0].set_xlabel(\"Reference\")\n",
+ " axes[1].imshow(masked_target_image)\n",
+ " axes[1].set_xlabel(\"Masked target\")\n",
+ " axes[2].imshow(predicted_image)\n",
+ " axes[2].set_xlabel(\"Reconstruction\") \n",
+ " axes[3].imshow(target_image)\n",
+ " axes[3].set_xlabel(\"Target\")\n",
+ " for ax in axes.flatten():\n",
+ " ax.set_xticks([])\n",
+ " ax.set_yticks([])\n",
+ "\n",
+ "interact(show_demo,\n",
+ " masking_ratio=widgets.FloatSlider(description='masking', value=0.9, min=0.0, max=1.0),\n",
+ " x=widgets.FloatSlider(value=0.0, min=-0.5, max=0.5, step=0.05),\n",
+ " y=widgets.FloatSlider(value=0.0, min=-0.5, max=0.5, step=0.05),\n",
+ " z=widgets.FloatSlider(value=0.0, min=-0.5, max=0.5, step=0.05),\n",
+ " panorama=widgets.FloatSlider(value=0.0, min=-20, max=20, step=0.5),\n",
+ " elevation=widgets.FloatSlider(value=0.0, min=-20, max=20, step=0.5));"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.13"
+ },
+ "vscode": {
+ "interpreter": {
+ "hash": "f9237820cd248d7e07cb4fb9f0e4508a85d642f19d831560c0a4b61f3e907e67"
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/third_party/mast3r/dust3r/croco/models/blocks.py b/third_party/mast3r/dust3r/croco/models/blocks.py
new file mode 100644
index 0000000000000000000000000000000000000000..18133524f0ae265b0bd8d062d7c9eeaa63858a9b
--- /dev/null
+++ b/third_party/mast3r/dust3r/croco/models/blocks.py
@@ -0,0 +1,241 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+
+# --------------------------------------------------------
+# Main encoder/decoder blocks
+# --------------------------------------------------------
+# References:
+# timm
+# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
+# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/helpers.py
+# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/drop.py
+# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/mlp.py
+# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/patch_embed.py
+
+
+import torch
+import torch.nn as nn
+
+from itertools import repeat
+import collections.abc
+
+
+def _ntuple(n):
+ def parse(x):
+ if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
+ return x
+ return tuple(repeat(x, n))
+ return parse
+to_2tuple = _ntuple(2)
+
+def drop_path(x, drop_prob: float = 0., training: bool = False, scale_by_keep: bool = True):
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+ """
+ if drop_prob == 0. or not training:
+ return x
+ keep_prob = 1 - drop_prob
+ shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
+ random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
+ if keep_prob > 0.0 and scale_by_keep:
+ random_tensor.div_(keep_prob)
+ return x * random_tensor
+
+class DropPath(nn.Module):
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+ """
+ def __init__(self, drop_prob: float = 0., scale_by_keep: bool = True):
+ super(DropPath, self).__init__()
+ self.drop_prob = drop_prob
+ self.scale_by_keep = scale_by_keep
+
+ def forward(self, x):
+ return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)
+
+ def extra_repr(self):
+ return f'drop_prob={round(self.drop_prob,3):0.3f}'
+
+class Mlp(nn.Module):
+ """ MLP as used in Vision Transformer, MLP-Mixer and related networks"""
+ def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, bias=True, drop=0.):
+ super().__init__()
+ out_features = out_features or in_features
+ hidden_features = hidden_features or in_features
+ bias = to_2tuple(bias)
+ drop_probs = to_2tuple(drop)
+
+ self.fc1 = nn.Linear(in_features, hidden_features, bias=bias[0])
+ self.act = act_layer()
+ self.drop1 = nn.Dropout(drop_probs[0])
+ self.fc2 = nn.Linear(hidden_features, out_features, bias=bias[1])
+ self.drop2 = nn.Dropout(drop_probs[1])
+
+ def forward(self, x):
+ x = self.fc1(x)
+ x = self.act(x)
+ x = self.drop1(x)
+ x = self.fc2(x)
+ x = self.drop2(x)
+ return x
+
+class Attention(nn.Module):
+
+ def __init__(self, dim, rope=None, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.):
+ super().__init__()
+ self.num_heads = num_heads
+ head_dim = dim // num_heads
+ self.scale = head_dim ** -0.5
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+ self.attn_drop = nn.Dropout(attn_drop)
+ self.proj = nn.Linear(dim, dim)
+ self.proj_drop = nn.Dropout(proj_drop)
+ self.rope = rope
+
+ def forward(self, x, xpos):
+ B, N, C = x.shape
+
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).transpose(1,3)
+ q, k, v = [qkv[:,:,i] for i in range(3)]
+ # q,k,v = qkv.unbind(2) # make torchscript happy (cannot use tensor as tuple)
+
+ if self.rope is not None:
+ q = self.rope(q, xpos)
+ k = self.rope(k, xpos)
+
+ attn = (q @ k.transpose(-2, -1)) * self.scale
+ attn = attn.softmax(dim=-1)
+ attn = self.attn_drop(attn)
+
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
+ x = self.proj(x)
+ x = self.proj_drop(x)
+ return x
+
+class Block(nn.Module):
+
+ def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0.,
+ drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, rope=None):
+ super().__init__()
+ self.norm1 = norm_layer(dim)
+ self.attn = Attention(dim, rope=rope, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
+ # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+ self.norm2 = norm_layer(dim)
+ mlp_hidden_dim = int(dim * mlp_ratio)
+ self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
+
+ def forward(self, x, xpos):
+ x = x + self.drop_path(self.attn(self.norm1(x), xpos))
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
+ return x
+
+class CrossAttention(nn.Module):
+
+ def __init__(self, dim, rope=None, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.):
+ super().__init__()
+ self.num_heads = num_heads
+ head_dim = dim // num_heads
+ self.scale = head_dim ** -0.5
+
+ self.projq = nn.Linear(dim, dim, bias=qkv_bias)
+ self.projk = nn.Linear(dim, dim, bias=qkv_bias)
+ self.projv = nn.Linear(dim, dim, bias=qkv_bias)
+ self.attn_drop = nn.Dropout(attn_drop)
+ self.proj = nn.Linear(dim, dim)
+ self.proj_drop = nn.Dropout(proj_drop)
+
+ self.rope = rope
+
+ def forward(self, query, key, value, qpos, kpos):
+ B, Nq, C = query.shape
+ Nk = key.shape[1]
+ Nv = value.shape[1]
+
+ q = self.projq(query).reshape(B,Nq,self.num_heads, C// self.num_heads).permute(0, 2, 1, 3)
+ k = self.projk(key).reshape(B,Nk,self.num_heads, C// self.num_heads).permute(0, 2, 1, 3)
+ v = self.projv(value).reshape(B,Nv,self.num_heads, C// self.num_heads).permute(0, 2, 1, 3)
+
+ if self.rope is not None:
+ q = self.rope(q, qpos)
+ k = self.rope(k, kpos)
+
+ attn = (q @ k.transpose(-2, -1)) * self.scale
+ attn = attn.softmax(dim=-1)
+ attn = self.attn_drop(attn)
+
+ x = (attn @ v).transpose(1, 2).reshape(B, Nq, C)
+ x = self.proj(x)
+ x = self.proj_drop(x)
+ return x
+
+class DecoderBlock(nn.Module):
+
+ def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0.,
+ drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, norm_mem=True, rope=None):
+ super().__init__()
+ self.norm1 = norm_layer(dim)
+ self.attn = Attention(dim, rope=rope, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
+ self.cross_attn = CrossAttention(dim, rope=rope, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+ self.norm2 = norm_layer(dim)
+ self.norm3 = norm_layer(dim)
+ mlp_hidden_dim = int(dim * mlp_ratio)
+ self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
+ self.norm_y = norm_layer(dim) if norm_mem else nn.Identity()
+
+ def forward(self, x, y, xpos, ypos):
+ x = x + self.drop_path(self.attn(self.norm1(x), xpos))
+ y_ = self.norm_y(y)
+ x = x + self.drop_path(self.cross_attn(self.norm2(x), y_, y_, xpos, ypos))
+ x = x + self.drop_path(self.mlp(self.norm3(x)))
+ return x, y
+
+
+# patch embedding
+class PositionGetter(object):
+ """ return positions of patches """
+
+ def __init__(self):
+ self.cache_positions = {}
+
+ def __call__(self, b, h, w, device):
+ if not (h,w) in self.cache_positions:
+ x = torch.arange(w, device=device)
+ y = torch.arange(h, device=device)
+ self.cache_positions[h,w] = torch.cartesian_prod(y, x) # (h, w, 2)
+ pos = self.cache_positions[h,w].view(1, h*w, 2).expand(b, -1, 2).clone()
+ return pos
+
+class PatchEmbed(nn.Module):
+ """ just adding _init_weights + position getter compared to timm.models.layers.patch_embed.PatchEmbed"""
+
+ def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, norm_layer=None, flatten=True):
+ super().__init__()
+ img_size = to_2tuple(img_size)
+ patch_size = to_2tuple(patch_size)
+ self.img_size = img_size
+ self.patch_size = patch_size
+ self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
+ self.num_patches = self.grid_size[0] * self.grid_size[1]
+ self.flatten = flatten
+
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
+ self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
+
+ self.position_getter = PositionGetter()
+
+ def forward(self, x):
+ B, C, H, W = x.shape
+ torch._assert(H == self.img_size[0], f"Input image height ({H}) doesn't match model ({self.img_size[0]}).")
+ torch._assert(W == self.img_size[1], f"Input image width ({W}) doesn't match model ({self.img_size[1]}).")
+ x = self.proj(x)
+ pos = self.position_getter(B, x.size(2), x.size(3), x.device)
+ if self.flatten:
+ x = x.flatten(2).transpose(1, 2) # BCHW -> BNC
+ x = self.norm(x)
+ return x, pos
+
+ def _init_weights(self):
+ w = self.proj.weight.data
+ torch.nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
+
diff --git a/third_party/mast3r/dust3r/croco/models/criterion.py b/third_party/mast3r/dust3r/croco/models/criterion.py
new file mode 100644
index 0000000000000000000000000000000000000000..11696c40865344490f23796ea45e8fbd5e654731
--- /dev/null
+++ b/third_party/mast3r/dust3r/croco/models/criterion.py
@@ -0,0 +1,37 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# Criterion to train CroCo
+# --------------------------------------------------------
+# References:
+# MAE: https://github.com/facebookresearch/mae
+# --------------------------------------------------------
+
+import torch
+
+class MaskedMSE(torch.nn.Module):
+
+ def __init__(self, norm_pix_loss=False, masked=True):
+ """
+ norm_pix_loss: normalize each patch by their pixel mean and variance
+ masked: compute loss over the masked patches only
+ """
+ super().__init__()
+ self.norm_pix_loss = norm_pix_loss
+ self.masked = masked
+
+ def forward(self, pred, mask, target):
+
+ if self.norm_pix_loss:
+ mean = target.mean(dim=-1, keepdim=True)
+ var = target.var(dim=-1, keepdim=True)
+ target = (target - mean) / (var + 1.e-6)**.5
+
+ loss = (pred - target) ** 2
+ loss = loss.mean(dim=-1) # [N, L], mean loss per patch
+ if self.masked:
+ loss = (loss * mask).sum() / mask.sum() # mean loss on masked patches
+ else:
+ loss = loss.mean() # mean loss
+ return loss
diff --git a/third_party/mast3r/dust3r/croco/models/croco.py b/third_party/mast3r/dust3r/croco/models/croco.py
new file mode 100644
index 0000000000000000000000000000000000000000..14c68634152d75555b4c35c25af268394c5821fe
--- /dev/null
+++ b/third_party/mast3r/dust3r/croco/models/croco.py
@@ -0,0 +1,249 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+
+# --------------------------------------------------------
+# CroCo model during pretraining
+# --------------------------------------------------------
+
+
+
+import torch
+import torch.nn as nn
+torch.backends.cuda.matmul.allow_tf32 = True # for gpu >= Ampere and pytorch >= 1.12
+from functools import partial
+
+from models.blocks import Block, DecoderBlock, PatchEmbed
+from models.pos_embed import get_2d_sincos_pos_embed, RoPE2D
+from models.masking import RandomMask
+
+
+class CroCoNet(nn.Module):
+
+ def __init__(self,
+ img_size=224, # input image size
+ patch_size=16, # patch_size
+ mask_ratio=0.9, # ratios of masked tokens
+ enc_embed_dim=768, # encoder feature dimension
+ enc_depth=12, # encoder depth
+ enc_num_heads=12, # encoder number of heads in the transformer block
+ dec_embed_dim=512, # decoder feature dimension
+ dec_depth=8, # decoder depth
+ dec_num_heads=16, # decoder number of heads in the transformer block
+ mlp_ratio=4,
+ norm_layer=partial(nn.LayerNorm, eps=1e-6),
+ norm_im2_in_dec=True, # whether to apply normalization of the 'memory' = (second image) in the decoder
+ pos_embed='cosine', # positional embedding (either cosine or RoPE100)
+ ):
+
+ super(CroCoNet, self).__init__()
+
+ # patch embeddings (with initialization done as in MAE)
+ self._set_patch_embed(img_size, patch_size, enc_embed_dim)
+
+ # mask generations
+ self._set_mask_generator(self.patch_embed.num_patches, mask_ratio)
+
+ self.pos_embed = pos_embed
+ if pos_embed=='cosine':
+ # positional embedding of the encoder
+ enc_pos_embed = get_2d_sincos_pos_embed(enc_embed_dim, int(self.patch_embed.num_patches**.5), n_cls_token=0)
+ self.register_buffer('enc_pos_embed', torch.from_numpy(enc_pos_embed).float())
+ # positional embedding of the decoder
+ dec_pos_embed = get_2d_sincos_pos_embed(dec_embed_dim, int(self.patch_embed.num_patches**.5), n_cls_token=0)
+ self.register_buffer('dec_pos_embed', torch.from_numpy(dec_pos_embed).float())
+ # pos embedding in each block
+ self.rope = None # nothing for cosine
+ elif pos_embed.startswith('RoPE'): # eg RoPE100
+ self.enc_pos_embed = None # nothing to add in the encoder with RoPE
+ self.dec_pos_embed = None # nothing to add in the decoder with RoPE
+ if RoPE2D is None: raise ImportError("Cannot find cuRoPE2D, please install it following the README instructions")
+ freq = float(pos_embed[len('RoPE'):])
+ self.rope = RoPE2D(freq=freq)
+ else:
+ raise NotImplementedError('Unknown pos_embed '+pos_embed)
+
+ # transformer for the encoder
+ self.enc_depth = enc_depth
+ self.enc_embed_dim = enc_embed_dim
+ self.enc_blocks = nn.ModuleList([
+ Block(enc_embed_dim, enc_num_heads, mlp_ratio, qkv_bias=True, norm_layer=norm_layer, rope=self.rope)
+ for i in range(enc_depth)])
+ self.enc_norm = norm_layer(enc_embed_dim)
+
+ # masked tokens
+ self._set_mask_token(dec_embed_dim)
+
+ # decoder
+ self._set_decoder(enc_embed_dim, dec_embed_dim, dec_num_heads, dec_depth, mlp_ratio, norm_layer, norm_im2_in_dec)
+
+ # prediction head
+ self._set_prediction_head(dec_embed_dim, patch_size)
+
+ # initializer weights
+ self.initialize_weights()
+
+ def _set_patch_embed(self, img_size=224, patch_size=16, enc_embed_dim=768):
+ self.patch_embed = PatchEmbed(img_size, patch_size, 3, enc_embed_dim)
+
+ def _set_mask_generator(self, num_patches, mask_ratio):
+ self.mask_generator = RandomMask(num_patches, mask_ratio)
+
+ def _set_mask_token(self, dec_embed_dim):
+ self.mask_token = nn.Parameter(torch.zeros(1, 1, dec_embed_dim))
+
+ def _set_decoder(self, enc_embed_dim, dec_embed_dim, dec_num_heads, dec_depth, mlp_ratio, norm_layer, norm_im2_in_dec):
+ self.dec_depth = dec_depth
+ self.dec_embed_dim = dec_embed_dim
+ # transfer from encoder to decoder
+ self.decoder_embed = nn.Linear(enc_embed_dim, dec_embed_dim, bias=True)
+ # transformer for the decoder
+ self.dec_blocks = nn.ModuleList([
+ DecoderBlock(dec_embed_dim, dec_num_heads, mlp_ratio=mlp_ratio, qkv_bias=True, norm_layer=norm_layer, norm_mem=norm_im2_in_dec, rope=self.rope)
+ for i in range(dec_depth)])
+ # final norm layer
+ self.dec_norm = norm_layer(dec_embed_dim)
+
+ def _set_prediction_head(self, dec_embed_dim, patch_size):
+ self.prediction_head = nn.Linear(dec_embed_dim, patch_size**2 * 3, bias=True)
+
+
+ def initialize_weights(self):
+ # patch embed
+ self.patch_embed._init_weights()
+ # mask tokens
+ if self.mask_token is not None: torch.nn.init.normal_(self.mask_token, std=.02)
+ # linears and layer norms
+ self.apply(self._init_weights)
+
+ def _init_weights(self, m):
+ if isinstance(m, nn.Linear):
+ # we use xavier_uniform following official JAX ViT:
+ torch.nn.init.xavier_uniform_(m.weight)
+ if isinstance(m, nn.Linear) and m.bias is not None:
+ nn.init.constant_(m.bias, 0)
+ elif isinstance(m, nn.LayerNorm):
+ nn.init.constant_(m.bias, 0)
+ nn.init.constant_(m.weight, 1.0)
+
+ def _encode_image(self, image, do_mask=False, return_all_blocks=False):
+ """
+ image has B x 3 x img_size x img_size
+ do_mask: whether to perform masking or not
+ return_all_blocks: if True, return the features at the end of every block
+ instead of just the features from the last block (eg for some prediction heads)
+ """
+ # embed the image into patches (x has size B x Npatches x C)
+ # and get position if each return patch (pos has size B x Npatches x 2)
+ x, pos = self.patch_embed(image)
+ # add positional embedding without cls token
+ if self.enc_pos_embed is not None:
+ x = x + self.enc_pos_embed[None,...]
+ # apply masking
+ B,N,C = x.size()
+ if do_mask:
+ masks = self.mask_generator(x)
+ x = x[~masks].view(B, -1, C)
+ posvis = pos[~masks].view(B, -1, 2)
+ else:
+ B,N,C = x.size()
+ masks = torch.zeros((B,N), dtype=bool)
+ posvis = pos
+ # now apply the transformer encoder and normalization
+ if return_all_blocks:
+ out = []
+ for blk in self.enc_blocks:
+ x = blk(x, posvis)
+ out.append(x)
+ out[-1] = self.enc_norm(out[-1])
+ return out, pos, masks
+ else:
+ for blk in self.enc_blocks:
+ x = blk(x, posvis)
+ x = self.enc_norm(x)
+ return x, pos, masks
+
+ def _decoder(self, feat1, pos1, masks1, feat2, pos2, return_all_blocks=False):
+ """
+ return_all_blocks: if True, return the features at the end of every block
+ instead of just the features from the last block (eg for some prediction heads)
+
+ masks1 can be None => assume image1 fully visible
+ """
+ # encoder to decoder layer
+ visf1 = self.decoder_embed(feat1)
+ f2 = self.decoder_embed(feat2)
+ # append masked tokens to the sequence
+ B,Nenc,C = visf1.size()
+ if masks1 is None: # downstreams
+ f1_ = visf1
+ else: # pretraining
+ Ntotal = masks1.size(1)
+ f1_ = self.mask_token.repeat(B, Ntotal, 1).to(dtype=visf1.dtype)
+ f1_[~masks1] = visf1.view(B * Nenc, C)
+ # add positional embedding
+ if self.dec_pos_embed is not None:
+ f1_ = f1_ + self.dec_pos_embed
+ f2 = f2 + self.dec_pos_embed
+ # apply Transformer blocks
+ out = f1_
+ out2 = f2
+ if return_all_blocks:
+ _out, out = out, []
+ for blk in self.dec_blocks:
+ _out, out2 = blk(_out, out2, pos1, pos2)
+ out.append(_out)
+ out[-1] = self.dec_norm(out[-1])
+ else:
+ for blk in self.dec_blocks:
+ out, out2 = blk(out, out2, pos1, pos2)
+ out = self.dec_norm(out)
+ return out
+
+ def patchify(self, imgs):
+ """
+ imgs: (B, 3, H, W)
+ x: (B, L, patch_size**2 *3)
+ """
+ p = self.patch_embed.patch_size[0]
+ assert imgs.shape[2] == imgs.shape[3] and imgs.shape[2] % p == 0
+
+ h = w = imgs.shape[2] // p
+ x = imgs.reshape(shape=(imgs.shape[0], 3, h, p, w, p))
+ x = torch.einsum('nchpwq->nhwpqc', x)
+ x = x.reshape(shape=(imgs.shape[0], h * w, p**2 * 3))
+
+ return x
+
+ def unpatchify(self, x, channels=3):
+ """
+ x: (N, L, patch_size**2 *channels)
+ imgs: (N, 3, H, W)
+ """
+ patch_size = self.patch_embed.patch_size[0]
+ h = w = int(x.shape[1]**.5)
+ assert h * w == x.shape[1]
+ x = x.reshape(shape=(x.shape[0], h, w, patch_size, patch_size, channels))
+ x = torch.einsum('nhwpqc->nchpwq', x)
+ imgs = x.reshape(shape=(x.shape[0], channels, h * patch_size, h * patch_size))
+ return imgs
+
+ def forward(self, img1, img2):
+ """
+ img1: tensor of size B x 3 x img_size x img_size
+ img2: tensor of size B x 3 x img_size x img_size
+
+ out will be B x N x (3*patch_size*patch_size)
+ masks are also returned as B x N just in case
+ """
+ # encoder of the masked first image
+ feat1, pos1, mask1 = self._encode_image(img1, do_mask=True)
+ # encoder of the second image
+ feat2, pos2, _ = self._encode_image(img2, do_mask=False)
+ # decoder
+ decfeat = self._decoder(feat1, pos1, mask1, feat2, pos2)
+ # prediction head
+ out = self.prediction_head(decfeat)
+ # get target
+ target = self.patchify(img1)
+ return out, mask1, target
diff --git a/third_party/mast3r/dust3r/croco/models/croco_downstream.py b/third_party/mast3r/dust3r/croco/models/croco_downstream.py
new file mode 100644
index 0000000000000000000000000000000000000000..159dfff4d2c1461bc235e21441b57ce1e2088f76
--- /dev/null
+++ b/third_party/mast3r/dust3r/croco/models/croco_downstream.py
@@ -0,0 +1,122 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+# --------------------------------------------------------
+# CroCo model for downstream tasks
+# --------------------------------------------------------
+
+import torch
+
+from .croco import CroCoNet
+
+
+def croco_args_from_ckpt(ckpt):
+ if 'croco_kwargs' in ckpt: # CroCo v2 released models
+ return ckpt['croco_kwargs']
+ elif 'args' in ckpt and hasattr(ckpt['args'], 'model'): # pretrained using the official code release
+ s = ckpt['args'].model # eg "CroCoNet(enc_embed_dim=1024, enc_num_heads=16, enc_depth=24)"
+ assert s.startswith('CroCoNet(')
+ return eval('dict'+s[len('CroCoNet'):]) # transform it into the string of a dictionary and evaluate it
+ else: # CroCo v1 released models
+ return dict()
+
+class CroCoDownstreamMonocularEncoder(CroCoNet):
+
+ def __init__(self,
+ head,
+ **kwargs):
+ """ Build network for monocular downstream task, only using the encoder.
+ It takes an extra argument head, that is called with the features
+ and a dictionary img_info containing 'width' and 'height' keys
+ The head is setup with the croconet arguments in this init function
+ NOTE: It works by *calling super().__init__() but with redefined setters
+
+ """
+ super(CroCoDownstreamMonocularEncoder, self).__init__(**kwargs)
+ head.setup(self)
+ self.head = head
+
+ def _set_mask_generator(self, *args, **kwargs):
+ """ No mask generator """
+ return
+
+ def _set_mask_token(self, *args, **kwargs):
+ """ No mask token """
+ self.mask_token = None
+ return
+
+ def _set_decoder(self, *args, **kwargs):
+ """ No decoder """
+ return
+
+ def _set_prediction_head(self, *args, **kwargs):
+ """ No 'prediction head' for downstream tasks."""
+ return
+
+ def forward(self, img):
+ """
+ img if of size batch_size x 3 x h x w
+ """
+ B, C, H, W = img.size()
+ img_info = {'height': H, 'width': W}
+ need_all_layers = hasattr(self.head, 'return_all_blocks') and self.head.return_all_blocks
+ out, _, _ = self._encode_image(img, do_mask=False, return_all_blocks=need_all_layers)
+ return self.head(out, img_info)
+
+
+class CroCoDownstreamBinocular(CroCoNet):
+
+ def __init__(self,
+ head,
+ **kwargs):
+ """ Build network for binocular downstream task
+ It takes an extra argument head, that is called with the features
+ and a dictionary img_info containing 'width' and 'height' keys
+ The head is setup with the croconet arguments in this init function
+ """
+ super(CroCoDownstreamBinocular, self).__init__(**kwargs)
+ head.setup(self)
+ self.head = head
+
+ def _set_mask_generator(self, *args, **kwargs):
+ """ No mask generator """
+ return
+
+ def _set_mask_token(self, *args, **kwargs):
+ """ No mask token """
+ self.mask_token = None
+ return
+
+ def _set_prediction_head(self, *args, **kwargs):
+ """ No prediction head for downstream tasks, define your own head """
+ return
+
+ def encode_image_pairs(self, img1, img2, return_all_blocks=False):
+ """ run encoder for a pair of images
+ it is actually ~5% faster to concatenate the images along the batch dimension
+ than to encode them separately
+ """
+ ## the two commented lines below is the naive version with separate encoding
+ #out, pos, _ = self._encode_image(img1, do_mask=False, return_all_blocks=return_all_blocks)
+ #out2, pos2, _ = self._encode_image(img2, do_mask=False, return_all_blocks=False)
+ ## and now the faster version
+ out, pos, _ = self._encode_image( torch.cat( (img1,img2), dim=0), do_mask=False, return_all_blocks=return_all_blocks )
+ if return_all_blocks:
+ out,out2 = list(map(list, zip(*[o.chunk(2, dim=0) for o in out])))
+ out2 = out2[-1]
+ else:
+ out,out2 = out.chunk(2, dim=0)
+ pos,pos2 = pos.chunk(2, dim=0)
+ return out, out2, pos, pos2
+
+ def forward(self, img1, img2):
+ B, C, H, W = img1.size()
+ img_info = {'height': H, 'width': W}
+ return_all_blocks = hasattr(self.head, 'return_all_blocks') and self.head.return_all_blocks
+ out, out2, pos, pos2 = self.encode_image_pairs(img1, img2, return_all_blocks=return_all_blocks)
+ if return_all_blocks:
+ decout = self._decoder(out[-1], pos, None, out2, pos2, return_all_blocks=return_all_blocks)
+ decout = out+decout
+ else:
+ decout = self._decoder(out, pos, None, out2, pos2, return_all_blocks=return_all_blocks)
+ return self.head(decout, img_info)
\ No newline at end of file
diff --git a/third_party/mast3r/dust3r/croco/models/curope/__init__.py b/third_party/mast3r/dust3r/croco/models/curope/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..25e3d48a162760260826080f6366838e83e26878
--- /dev/null
+++ b/third_party/mast3r/dust3r/croco/models/curope/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+from .curope2d import cuRoPE2D
diff --git a/third_party/mast3r/dust3r/croco/models/curope/curope.cpp b/third_party/mast3r/dust3r/croco/models/curope/curope.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8fe9058e05aa1bf3f37b0d970edc7312bc68455b
--- /dev/null
+++ b/third_party/mast3r/dust3r/croco/models/curope/curope.cpp
@@ -0,0 +1,69 @@
+/*
+ Copyright (C) 2022-present Naver Corporation. All rights reserved.
+ Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+*/
+
+#include
+
+// forward declaration
+void rope_2d_cuda( torch::Tensor tokens, const torch::Tensor pos, const float base, const float fwd );
+
+void rope_2d_cpu( torch::Tensor tokens, const torch::Tensor positions, const float base, const float fwd )
+{
+ const int B = tokens.size(0);
+ const int N = tokens.size(1);
+ const int H = tokens.size(2);
+ const int D = tokens.size(3) / 4;
+
+ auto tok = tokens.accessor();
+ auto pos = positions.accessor();
+
+ for (int b = 0; b < B; b++) {
+ for (int x = 0; x < 2; x++) { // y and then x (2d)
+ for (int n = 0; n < N; n++) {
+
+ // grab the token position
+ const int p = pos[b][n][x];
+
+ for (int h = 0; h < H; h++) {
+ for (int d = 0; d < D; d++) {
+ // grab the two values
+ float u = tok[b][n][h][d+0+x*2*D];
+ float v = tok[b][n][h][d+D+x*2*D];
+
+ // grab the cos,sin
+ const float inv_freq = fwd * p / powf(base, d/float(D));
+ float c = cosf(inv_freq);
+ float s = sinf(inv_freq);
+
+ // write the result
+ tok[b][n][h][d+0+x*2*D] = u*c - v*s;
+ tok[b][n][h][d+D+x*2*D] = v*c + u*s;
+ }
+ }
+ }
+ }
+ }
+}
+
+void rope_2d( torch::Tensor tokens, // B,N,H,D
+ const torch::Tensor positions, // B,N,2
+ const float base,
+ const float fwd )
+{
+ TORCH_CHECK(tokens.dim() == 4, "tokens must have 4 dimensions");
+ TORCH_CHECK(positions.dim() == 3, "positions must have 3 dimensions");
+ TORCH_CHECK(tokens.size(0) == positions.size(0), "batch size differs between tokens & positions");
+ TORCH_CHECK(tokens.size(1) == positions.size(1), "seq_length differs between tokens & positions");
+ TORCH_CHECK(positions.size(2) == 2, "positions.shape[2] must be equal to 2");
+ TORCH_CHECK(tokens.is_cuda() == positions.is_cuda(), "tokens and positions are not on the same device" );
+
+ if (tokens.is_cuda())
+ rope_2d_cuda( tokens, positions, base, fwd );
+ else
+ rope_2d_cpu( tokens, positions, base, fwd );
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+ m.def("rope_2d", &rope_2d, "RoPE 2d forward/backward");
+}
diff --git a/third_party/mast3r/dust3r/croco/models/curope/curope2d.py b/third_party/mast3r/dust3r/croco/models/curope/curope2d.py
new file mode 100644
index 0000000000000000000000000000000000000000..a49c12f8c529e9a889b5ac20c5767158f238e17d
--- /dev/null
+++ b/third_party/mast3r/dust3r/croco/models/curope/curope2d.py
@@ -0,0 +1,40 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+import torch
+
+try:
+ import curope as _kernels # run `python setup.py install`
+except ModuleNotFoundError:
+ from . import curope as _kernels # run `python setup.py build_ext --inplace`
+
+
+class cuRoPE2D_func (torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, tokens, positions, base, F0=1):
+ ctx.save_for_backward(positions)
+ ctx.saved_base = base
+ ctx.saved_F0 = F0
+ # tokens = tokens.clone() # uncomment this if inplace doesn't work
+ _kernels.rope_2d( tokens, positions, base, F0 )
+ ctx.mark_dirty(tokens)
+ return tokens
+
+ @staticmethod
+ def backward(ctx, grad_res):
+ positions, base, F0 = ctx.saved_tensors[0], ctx.saved_base, ctx.saved_F0
+ _kernels.rope_2d( grad_res, positions, base, -F0 )
+ ctx.mark_dirty(grad_res)
+ return grad_res, None, None, None
+
+
+class cuRoPE2D(torch.nn.Module):
+ def __init__(self, freq=100.0, F0=1.0):
+ super().__init__()
+ self.base = freq
+ self.F0 = F0
+
+ def forward(self, tokens, positions):
+ cuRoPE2D_func.apply( tokens.transpose(1,2), positions, self.base, self.F0 )
+ return tokens
\ No newline at end of file
diff --git a/third_party/mast3r/dust3r/croco/models/curope/kernels.cu b/third_party/mast3r/dust3r/croco/models/curope/kernels.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7156cd1bb935cb1f0be45e58add53f9c21505c20
--- /dev/null
+++ b/third_party/mast3r/dust3r/croco/models/curope/kernels.cu
@@ -0,0 +1,108 @@
+/*
+ Copyright (C) 2022-present Naver Corporation. All rights reserved.
+ Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+*/
+
+#include
+#include
+#include
+#include
+
+#define CHECK_CUDA(tensor) {\
+ TORCH_CHECK((tensor).is_cuda(), #tensor " is not in cuda memory"); \
+ TORCH_CHECK((tensor).is_contiguous(), #tensor " is not contiguous"); }
+void CHECK_KERNEL() {auto error = cudaGetLastError(); TORCH_CHECK( error == cudaSuccess, cudaGetErrorString(error));}
+
+
+template < typename scalar_t >
+__global__ void rope_2d_cuda_kernel(
+ //scalar_t* __restrict__ tokens,
+ torch::PackedTensorAccessor32 tokens,
+ const int64_t* __restrict__ pos,
+ const float base,
+ const float fwd )
+ // const int N, const int H, const int D )
+{
+ // tokens shape = (B, N, H, D)
+ const int N = tokens.size(1);
+ const int H = tokens.size(2);
+ const int D = tokens.size(3);
+
+ // each block update a single token, for all heads
+ // each thread takes care of a single output
+ extern __shared__ float shared[];
+ float* shared_inv_freq = shared + D;
+
+ const int b = blockIdx.x / N;
+ const int n = blockIdx.x % N;
+
+ const int Q = D / 4;
+ // one token = [0..Q : Q..2Q : 2Q..3Q : 3Q..D]
+ // u_Y v_Y u_X v_X
+
+ // shared memory: first, compute inv_freq
+ if (threadIdx.x < Q)
+ shared_inv_freq[threadIdx.x] = fwd / powf(base, threadIdx.x/float(Q));
+ __syncthreads();
+
+ // start of X or Y part
+ const int X = threadIdx.x < D/2 ? 0 : 1;
+ const int m = (X*D/2) + (threadIdx.x % Q); // index of u_Y or u_X
+
+ // grab the cos,sin appropriate for me
+ const float freq = pos[blockIdx.x*2+X] * shared_inv_freq[threadIdx.x % Q];
+ const float cos = cosf(freq);
+ const float sin = sinf(freq);
+ /*
+ float* shared_cos_sin = shared + D + D/4;
+ if ((threadIdx.x % (D/2)) < Q)
+ shared_cos_sin[m+0] = cosf(freq);
+ else
+ shared_cos_sin[m+Q] = sinf(freq);
+ __syncthreads();
+ const float cos = shared_cos_sin[m+0];
+ const float sin = shared_cos_sin[m+Q];
+ */
+
+ for (int h = 0; h < H; h++)
+ {
+ // then, load all the token for this head in shared memory
+ shared[threadIdx.x] = tokens[b][n][h][threadIdx.x];
+ __syncthreads();
+
+ const float u = shared[m];
+ const float v = shared[m+Q];
+
+ // write output
+ if ((threadIdx.x % (D/2)) < Q)
+ tokens[b][n][h][threadIdx.x] = u*cos - v*sin;
+ else
+ tokens[b][n][h][threadIdx.x] = v*cos + u*sin;
+ }
+}
+
+void rope_2d_cuda( torch::Tensor tokens, const torch::Tensor pos, const float base, const float fwd )
+{
+ const int B = tokens.size(0); // batch size
+ const int N = tokens.size(1); // sequence length
+ const int H = tokens.size(2); // number of heads
+ const int D = tokens.size(3); // dimension per head
+
+ TORCH_CHECK(tokens.stride(3) == 1 && tokens.stride(2) == D, "tokens are not contiguous");
+ TORCH_CHECK(pos.is_contiguous(), "positions are not contiguous");
+ TORCH_CHECK(pos.size(0) == B && pos.size(1) == N && pos.size(2) == 2, "bad pos.shape");
+ TORCH_CHECK(D % 4 == 0, "token dim must be multiple of 4");
+
+ // one block for each layer, one thread per local-max
+ const int THREADS_PER_BLOCK = D;
+ const int N_BLOCKS = B * N; // each block takes care of H*D values
+ const int SHARED_MEM = sizeof(float) * (D + D/4);
+
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(tokens.type(), "rope_2d_cuda", ([&] {
+ rope_2d_cuda_kernel <<>> (
+ //tokens.data_ptr(),
+ tokens.packed_accessor32(),
+ pos.data_ptr(),
+ base, fwd); //, N, H, D );
+ }));
+}
diff --git a/third_party/mast3r/dust3r/croco/models/curope/setup.py b/third_party/mast3r/dust3r/croco/models/curope/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..230632ed05e309200e8f93a3a852072333975009
--- /dev/null
+++ b/third_party/mast3r/dust3r/croco/models/curope/setup.py
@@ -0,0 +1,34 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+from setuptools import setup
+from torch import cuda
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+
+# compile for all possible CUDA architectures
+all_cuda_archs = cuda.get_gencode_flags().replace('compute=','arch=').split()
+# alternatively, you can list cuda archs that you want, eg:
+# all_cuda_archs = [
+ # '-gencode', 'arch=compute_70,code=sm_70',
+ # '-gencode', 'arch=compute_75,code=sm_75',
+ # '-gencode', 'arch=compute_80,code=sm_80',
+ # '-gencode', 'arch=compute_86,code=sm_86'
+# ]
+
+setup(
+ name = 'curope',
+ ext_modules = [
+ CUDAExtension(
+ name='curope',
+ sources=[
+ "curope.cpp",
+ "kernels.cu",
+ ],
+ extra_compile_args = dict(
+ nvcc=['-O3','--ptxas-options=-v',"--use_fast_math"]+all_cuda_archs,
+ cxx=['-O3'])
+ )
+ ],
+ cmdclass = {
+ 'build_ext': BuildExtension
+ })
diff --git a/third_party/mast3r/dust3r/croco/models/dpt_block.py b/third_party/mast3r/dust3r/croco/models/dpt_block.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4ddfb74e2769ceca88720d4c730e00afd71c763
--- /dev/null
+++ b/third_party/mast3r/dust3r/croco/models/dpt_block.py
@@ -0,0 +1,450 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+# --------------------------------------------------------
+# DPT head for ViTs
+# --------------------------------------------------------
+# References:
+# https://github.com/isl-org/DPT
+# https://github.com/EPFL-VILAB/MultiMAE/blob/main/multimae/output_adapters.py
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange, repeat
+from typing import Union, Tuple, Iterable, List, Optional, Dict
+
+def pair(t):
+ return t if isinstance(t, tuple) else (t, t)
+
+def make_scratch(in_shape, out_shape, groups=1, expand=False):
+ scratch = nn.Module()
+
+ out_shape1 = out_shape
+ out_shape2 = out_shape
+ out_shape3 = out_shape
+ out_shape4 = out_shape
+ if expand == True:
+ out_shape1 = out_shape
+ out_shape2 = out_shape * 2
+ out_shape3 = out_shape * 4
+ out_shape4 = out_shape * 8
+
+ scratch.layer1_rn = nn.Conv2d(
+ in_shape[0],
+ out_shape1,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ bias=False,
+ groups=groups,
+ )
+ scratch.layer2_rn = nn.Conv2d(
+ in_shape[1],
+ out_shape2,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ bias=False,
+ groups=groups,
+ )
+ scratch.layer3_rn = nn.Conv2d(
+ in_shape[2],
+ out_shape3,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ bias=False,
+ groups=groups,
+ )
+ scratch.layer4_rn = nn.Conv2d(
+ in_shape[3],
+ out_shape4,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ bias=False,
+ groups=groups,
+ )
+
+ scratch.layer_rn = nn.ModuleList([
+ scratch.layer1_rn,
+ scratch.layer2_rn,
+ scratch.layer3_rn,
+ scratch.layer4_rn,
+ ])
+
+ return scratch
+
+class ResidualConvUnit_custom(nn.Module):
+ """Residual convolution module."""
+
+ def __init__(self, features, activation, bn):
+ """Init.
+ Args:
+ features (int): number of features
+ """
+ super().__init__()
+
+ self.bn = bn
+
+ self.groups = 1
+
+ self.conv1 = nn.Conv2d(
+ features,
+ features,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ bias=not self.bn,
+ groups=self.groups,
+ )
+
+ self.conv2 = nn.Conv2d(
+ features,
+ features,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ bias=not self.bn,
+ groups=self.groups,
+ )
+
+ if self.bn == True:
+ self.bn1 = nn.BatchNorm2d(features)
+ self.bn2 = nn.BatchNorm2d(features)
+
+ self.activation = activation
+
+ self.skip_add = nn.quantized.FloatFunctional()
+
+ def forward(self, x):
+ """Forward pass.
+ Args:
+ x (tensor): input
+ Returns:
+ tensor: output
+ """
+
+ out = self.activation(x)
+ out = self.conv1(out)
+ if self.bn == True:
+ out = self.bn1(out)
+
+ out = self.activation(out)
+ out = self.conv2(out)
+ if self.bn == True:
+ out = self.bn2(out)
+
+ if self.groups > 1:
+ out = self.conv_merge(out)
+
+ return self.skip_add.add(out, x)
+
+class FeatureFusionBlock_custom(nn.Module):
+ """Feature fusion block."""
+
+ def __init__(
+ self,
+ features,
+ activation,
+ deconv=False,
+ bn=False,
+ expand=False,
+ align_corners=True,
+ width_ratio=1,
+ ):
+ """Init.
+ Args:
+ features (int): number of features
+ """
+ super(FeatureFusionBlock_custom, self).__init__()
+ self.width_ratio = width_ratio
+
+ self.deconv = deconv
+ self.align_corners = align_corners
+
+ self.groups = 1
+
+ self.expand = expand
+ out_features = features
+ if self.expand == True:
+ out_features = features // 2
+
+ self.out_conv = nn.Conv2d(
+ features,
+ out_features,
+ kernel_size=1,
+ stride=1,
+ padding=0,
+ bias=True,
+ groups=1,
+ )
+
+ self.resConfUnit1 = ResidualConvUnit_custom(features, activation, bn)
+ self.resConfUnit2 = ResidualConvUnit_custom(features, activation, bn)
+
+ self.skip_add = nn.quantized.FloatFunctional()
+
+ def forward(self, *xs):
+ """Forward pass.
+ Returns:
+ tensor: output
+ """
+ output = xs[0]
+
+ if len(xs) == 2:
+ res = self.resConfUnit1(xs[1])
+ if self.width_ratio != 1:
+ res = F.interpolate(res, size=(output.shape[2], output.shape[3]), mode='bilinear')
+
+ output = self.skip_add.add(output, res)
+ # output += res
+
+ output = self.resConfUnit2(output)
+
+ if self.width_ratio != 1:
+ # and output.shape[3] < self.width_ratio * output.shape[2]
+ #size=(image.shape[])
+ if (output.shape[3] / output.shape[2]) < (2 / 3) * self.width_ratio:
+ shape = 3 * output.shape[3]
+ else:
+ shape = int(self.width_ratio * 2 * output.shape[2])
+ output = F.interpolate(output, size=(2* output.shape[2], shape), mode='bilinear')
+ else:
+ output = nn.functional.interpolate(output, scale_factor=2,
+ mode="bilinear", align_corners=self.align_corners)
+ output = self.out_conv(output)
+ return output
+
+def make_fusion_block(features, use_bn, width_ratio=1):
+ return FeatureFusionBlock_custom(
+ features,
+ nn.ReLU(False),
+ deconv=False,
+ bn=use_bn,
+ expand=False,
+ align_corners=True,
+ width_ratio=width_ratio,
+ )
+
+class Interpolate(nn.Module):
+ """Interpolation module."""
+
+ def __init__(self, scale_factor, mode, align_corners=False):
+ """Init.
+ Args:
+ scale_factor (float): scaling
+ mode (str): interpolation mode
+ """
+ super(Interpolate, self).__init__()
+
+ self.interp = nn.functional.interpolate
+ self.scale_factor = scale_factor
+ self.mode = mode
+ self.align_corners = align_corners
+
+ def forward(self, x):
+ """Forward pass.
+ Args:
+ x (tensor): input
+ Returns:
+ tensor: interpolated data
+ """
+
+ x = self.interp(
+ x,
+ scale_factor=self.scale_factor,
+ mode=self.mode,
+ align_corners=self.align_corners,
+ )
+
+ return x
+
+class DPTOutputAdapter(nn.Module):
+ """DPT output adapter.
+
+ :param num_cahnnels: Number of output channels
+ :param stride_level: tride level compared to the full-sized image.
+ E.g. 4 for 1/4th the size of the image.
+ :param patch_size_full: Int or tuple of the patch size over the full image size.
+ Patch size for smaller inputs will be computed accordingly.
+ :param hooks: Index of intermediate layers
+ :param layer_dims: Dimension of intermediate layers
+ :param feature_dim: Feature dimension
+ :param last_dim: out_channels/in_channels for the last two Conv2d when head_type == regression
+ :param use_bn: If set to True, activates batch norm
+ :param dim_tokens_enc: Dimension of tokens coming from encoder
+ """
+
+ def __init__(self,
+ num_channels: int = 1,
+ stride_level: int = 1,
+ patch_size: Union[int, Tuple[int, int]] = 16,
+ main_tasks: Iterable[str] = ('rgb',),
+ hooks: List[int] = [2, 5, 8, 11],
+ layer_dims: List[int] = [96, 192, 384, 768],
+ feature_dim: int = 256,
+ last_dim: int = 32,
+ use_bn: bool = False,
+ dim_tokens_enc: Optional[int] = None,
+ head_type: str = 'regression',
+ output_width_ratio=1,
+ **kwargs):
+ super().__init__()
+ self.num_channels = num_channels
+ self.stride_level = stride_level
+ self.patch_size = pair(patch_size)
+ self.main_tasks = main_tasks
+ self.hooks = hooks
+ self.layer_dims = layer_dims
+ self.feature_dim = feature_dim
+ self.dim_tokens_enc = dim_tokens_enc * len(self.main_tasks) if dim_tokens_enc is not None else None
+ self.head_type = head_type
+
+ # Actual patch height and width, taking into account stride of input
+ self.P_H = max(1, self.patch_size[0] // stride_level)
+ self.P_W = max(1, self.patch_size[1] // stride_level)
+
+ self.scratch = make_scratch(layer_dims, feature_dim, groups=1, expand=False)
+
+ self.scratch.refinenet1 = make_fusion_block(feature_dim, use_bn, output_width_ratio)
+ self.scratch.refinenet2 = make_fusion_block(feature_dim, use_bn, output_width_ratio)
+ self.scratch.refinenet3 = make_fusion_block(feature_dim, use_bn, output_width_ratio)
+ self.scratch.refinenet4 = make_fusion_block(feature_dim, use_bn, output_width_ratio)
+
+ if self.head_type == 'regression':
+ # The "DPTDepthModel" head
+ self.head = nn.Sequential(
+ nn.Conv2d(feature_dim, feature_dim // 2, kernel_size=3, stride=1, padding=1),
+ Interpolate(scale_factor=2, mode="bilinear", align_corners=True),
+ nn.Conv2d(feature_dim // 2, last_dim, kernel_size=3, stride=1, padding=1),
+ nn.ReLU(True),
+ nn.Conv2d(last_dim, self.num_channels, kernel_size=1, stride=1, padding=0)
+ )
+ elif self.head_type == 'semseg':
+ # The "DPTSegmentationModel" head
+ self.head = nn.Sequential(
+ nn.Conv2d(feature_dim, feature_dim, kernel_size=3, padding=1, bias=False),
+ nn.BatchNorm2d(feature_dim) if use_bn else nn.Identity(),
+ nn.ReLU(True),
+ nn.Dropout(0.1, False),
+ nn.Conv2d(feature_dim, self.num_channels, kernel_size=1),
+ Interpolate(scale_factor=2, mode="bilinear", align_corners=True),
+ )
+ else:
+ raise ValueError('DPT head_type must be "regression" or "semseg".')
+
+ if self.dim_tokens_enc is not None:
+ self.init(dim_tokens_enc=dim_tokens_enc)
+
+ def init(self, dim_tokens_enc=768):
+ """
+ Initialize parts of decoder that are dependent on dimension of encoder tokens.
+ Should be called when setting up MultiMAE.
+
+ :param dim_tokens_enc: Dimension of tokens coming from encoder
+ """
+ #print(dim_tokens_enc)
+
+ # Set up activation postprocessing layers
+ if isinstance(dim_tokens_enc, int):
+ dim_tokens_enc = 4 * [dim_tokens_enc]
+
+ self.dim_tokens_enc = [dt * len(self.main_tasks) for dt in dim_tokens_enc]
+
+ self.act_1_postprocess = nn.Sequential(
+ nn.Conv2d(
+ in_channels=self.dim_tokens_enc[0],
+ out_channels=self.layer_dims[0],
+ kernel_size=1, stride=1, padding=0,
+ ),
+ nn.ConvTranspose2d(
+ in_channels=self.layer_dims[0],
+ out_channels=self.layer_dims[0],
+ kernel_size=4, stride=4, padding=0,
+ bias=True, dilation=1, groups=1,
+ )
+ )
+
+ self.act_2_postprocess = nn.Sequential(
+ nn.Conv2d(
+ in_channels=self.dim_tokens_enc[1],
+ out_channels=self.layer_dims[1],
+ kernel_size=1, stride=1, padding=0,
+ ),
+ nn.ConvTranspose2d(
+ in_channels=self.layer_dims[1],
+ out_channels=self.layer_dims[1],
+ kernel_size=2, stride=2, padding=0,
+ bias=True, dilation=1, groups=1,
+ )
+ )
+
+ self.act_3_postprocess = nn.Sequential(
+ nn.Conv2d(
+ in_channels=self.dim_tokens_enc[2],
+ out_channels=self.layer_dims[2],
+ kernel_size=1, stride=1, padding=0,
+ )
+ )
+
+ self.act_4_postprocess = nn.Sequential(
+ nn.Conv2d(
+ in_channels=self.dim_tokens_enc[3],
+ out_channels=self.layer_dims[3],
+ kernel_size=1, stride=1, padding=0,
+ ),
+ nn.Conv2d(
+ in_channels=self.layer_dims[3],
+ out_channels=self.layer_dims[3],
+ kernel_size=3, stride=2, padding=1,
+ )
+ )
+
+ self.act_postprocess = nn.ModuleList([
+ self.act_1_postprocess,
+ self.act_2_postprocess,
+ self.act_3_postprocess,
+ self.act_4_postprocess
+ ])
+
+ def adapt_tokens(self, encoder_tokens):
+ # Adapt tokens
+ x = []
+ x.append(encoder_tokens[:, :])
+ x = torch.cat(x, dim=-1)
+ return x
+
+ def forward(self, encoder_tokens: List[torch.Tensor], image_size):
+ #input_info: Dict):
+ assert self.dim_tokens_enc is not None, 'Need to call init(dim_tokens_enc) function first'
+ H, W = image_size
+
+ # Number of patches in height and width
+ N_H = H // (self.stride_level * self.P_H)
+ N_W = W // (self.stride_level * self.P_W)
+
+ # Hook decoder onto 4 layers from specified ViT layers
+ layers = [encoder_tokens[hook] for hook in self.hooks]
+
+ # Extract only task-relevant tokens and ignore global tokens.
+ layers = [self.adapt_tokens(l) for l in layers]
+
+ # Reshape tokens to spatial representation
+ layers = [rearrange(l, 'b (nh nw) c -> b c nh nw', nh=N_H, nw=N_W) for l in layers]
+
+ layers = [self.act_postprocess[idx](l) for idx, l in enumerate(layers)]
+ # Project layers to chosen feature dim
+ layers = [self.scratch.layer_rn[idx](l) for idx, l in enumerate(layers)]
+
+ # Fuse layers using refinement stages
+ path_4 = self.scratch.refinenet4(layers[3])
+ path_3 = self.scratch.refinenet3(path_4, layers[2])
+ path_2 = self.scratch.refinenet2(path_3, layers[1])
+ path_1 = self.scratch.refinenet1(path_2, layers[0])
+
+ # Output head
+ out = self.head(path_1)
+
+ return out
diff --git a/third_party/mast3r/dust3r/croco/models/head_downstream.py b/third_party/mast3r/dust3r/croco/models/head_downstream.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd40c91ba244d6c3522c6efd4ed4d724b7bdc650
--- /dev/null
+++ b/third_party/mast3r/dust3r/croco/models/head_downstream.py
@@ -0,0 +1,58 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+# --------------------------------------------------------
+# Heads for downstream tasks
+# --------------------------------------------------------
+
+"""
+A head is a module where the __init__ defines only the head hyperparameters.
+A method setup(croconet) takes a CroCoNet and set all layers according to the head and croconet attributes.
+The forward takes the features as well as a dictionary img_info containing the keys 'width' and 'height'
+"""
+
+import torch
+import torch.nn as nn
+from .dpt_block import DPTOutputAdapter
+
+
+class PixelwiseTaskWithDPT(nn.Module):
+ """ DPT module for CroCo.
+ by default, hooks_idx will be equal to:
+ * for encoder-only: 4 equally spread layers
+ * for encoder+decoder: last encoder + 3 equally spread layers of the decoder
+ """
+
+ def __init__(self, *, hooks_idx=None, layer_dims=[96,192,384,768],
+ output_width_ratio=1, num_channels=1, postprocess=None, **kwargs):
+ super(PixelwiseTaskWithDPT, self).__init__()
+ self.return_all_blocks = True # backbone needs to return all layers
+ self.postprocess = postprocess
+ self.output_width_ratio = output_width_ratio
+ self.num_channels = num_channels
+ self.hooks_idx = hooks_idx
+ self.layer_dims = layer_dims
+
+ def setup(self, croconet):
+ dpt_args = {'output_width_ratio': self.output_width_ratio, 'num_channels': self.num_channels}
+ if self.hooks_idx is None:
+ if hasattr(croconet, 'dec_blocks'): # encoder + decoder
+ step = {8: 3, 12: 4, 24: 8}[croconet.dec_depth]
+ hooks_idx = [croconet.dec_depth+croconet.enc_depth-1-i*step for i in range(3,-1,-1)]
+ else: # encoder only
+ step = croconet.enc_depth//4
+ hooks_idx = [croconet.enc_depth-1-i*step for i in range(3,-1,-1)]
+ self.hooks_idx = hooks_idx
+ print(f' PixelwiseTaskWithDPT: automatically setting hook_idxs={self.hooks_idx}')
+ dpt_args['hooks'] = self.hooks_idx
+ dpt_args['layer_dims'] = self.layer_dims
+ self.dpt = DPTOutputAdapter(**dpt_args)
+ dim_tokens = [croconet.enc_embed_dim if hook0:
+ pos_embed = np.concatenate([np.zeros([n_cls_token, embed_dim]), pos_embed], axis=0)
+ return pos_embed
+
+
+def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
+ assert embed_dim % 2 == 0
+
+ # use half of dimensions to encode grid_h
+ emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
+ emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
+
+ emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
+ return emb
+
+
+def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
+ """
+ embed_dim: output dimension for each position
+ pos: a list of positions to be encoded: size (M,)
+ out: (M, D)
+ """
+ assert embed_dim % 2 == 0
+ omega = np.arange(embed_dim // 2, dtype=float)
+ omega /= embed_dim / 2.
+ omega = 1. / 10000**omega # (D/2,)
+
+ pos = pos.reshape(-1) # (M,)
+ out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product
+
+ emb_sin = np.sin(out) # (M, D/2)
+ emb_cos = np.cos(out) # (M, D/2)
+
+ emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
+ return emb
+
+
+# --------------------------------------------------------
+# Interpolate position embeddings for high-resolution
+# References:
+# MAE: https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py
+# DeiT: https://github.com/facebookresearch/deit
+# --------------------------------------------------------
+def interpolate_pos_embed(model, checkpoint_model):
+ if 'pos_embed' in checkpoint_model:
+ pos_embed_checkpoint = checkpoint_model['pos_embed']
+ embedding_size = pos_embed_checkpoint.shape[-1]
+ num_patches = model.patch_embed.num_patches
+ num_extra_tokens = model.pos_embed.shape[-2] - num_patches
+ # height (== width) for the checkpoint position embedding
+ orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
+ # height (== width) for the new position embedding
+ new_size = int(num_patches ** 0.5)
+ # class_token and dist_token are kept unchanged
+ if orig_size != new_size:
+ print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size))
+ extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
+ # only the position tokens are interpolated
+ pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
+ pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2)
+ pos_tokens = torch.nn.functional.interpolate(
+ pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False)
+ pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
+ new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
+ checkpoint_model['pos_embed'] = new_pos_embed
+
+
+#----------------------------------------------------------
+# RoPE2D: RoPE implementation in 2D
+#----------------------------------------------------------
+
+try:
+ from models.curope import cuRoPE2D
+ RoPE2D = cuRoPE2D
+except ImportError:
+ print('Warning, cannot find cuda-compiled version of RoPE2D, using a slow pytorch version instead')
+
+ class RoPE2D(torch.nn.Module):
+
+ def __init__(self, freq=100.0, F0=1.0):
+ super().__init__()
+ self.base = freq
+ self.F0 = F0
+ self.cache = {}
+
+ def get_cos_sin(self, D, seq_len, device, dtype):
+ if (D,seq_len,device,dtype) not in self.cache:
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, D, 2).float().to(device) / D))
+ t = torch.arange(seq_len, device=device, dtype=inv_freq.dtype)
+ freqs = torch.einsum("i,j->ij", t, inv_freq).to(dtype)
+ freqs = torch.cat((freqs, freqs), dim=-1)
+ cos = freqs.cos() # (Seq, Dim)
+ sin = freqs.sin()
+ self.cache[D,seq_len,device,dtype] = (cos,sin)
+ return self.cache[D,seq_len,device,dtype]
+
+ @staticmethod
+ def rotate_half(x):
+ x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
+ return torch.cat((-x2, x1), dim=-1)
+
+ def apply_rope1d(self, tokens, pos1d, cos, sin):
+ assert pos1d.ndim==2
+ cos = torch.nn.functional.embedding(pos1d, cos)[:, None, :, :]
+ sin = torch.nn.functional.embedding(pos1d, sin)[:, None, :, :]
+ return (tokens * cos) + (self.rotate_half(tokens) * sin)
+
+ def forward(self, tokens, positions):
+ """
+ input:
+ * tokens: batch_size x nheads x ntokens x dim
+ * positions: batch_size x ntokens x 2 (y and x position of each token)
+ output:
+ * tokens after appplying RoPE2D (batch_size x nheads x ntokens x dim)
+ """
+ assert tokens.size(3)%2==0, "number of dimensions should be a multiple of two"
+ D = tokens.size(3) // 2
+ assert positions.ndim==3 and positions.shape[-1] == 2 # Batch, Seq, 2
+ cos, sin = self.get_cos_sin(D, int(positions.max())+1, tokens.device, tokens.dtype)
+ # split features into two along the feature dimension, and apply rope1d on each half
+ y, x = tokens.chunk(2, dim=-1)
+ y = self.apply_rope1d(y, positions[:,:,0], cos, sin)
+ x = self.apply_rope1d(x, positions[:,:,1], cos, sin)
+ tokens = torch.cat((y, x), dim=-1)
+ return tokens
\ No newline at end of file
diff --git a/third_party/mast3r/dust3r/croco/pretrain.py b/third_party/mast3r/dust3r/croco/pretrain.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c45e488015ef5380c71d0381ff453fdb860759e
--- /dev/null
+++ b/third_party/mast3r/dust3r/croco/pretrain.py
@@ -0,0 +1,254 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# Pre-training CroCo
+# --------------------------------------------------------
+# References:
+# MAE: https://github.com/facebookresearch/mae
+# DeiT: https://github.com/facebookresearch/deit
+# BEiT: https://github.com/microsoft/unilm/tree/master/beit
+# --------------------------------------------------------
+import argparse
+import datetime
+import json
+import numpy as np
+import os
+import sys
+import time
+import math
+from pathlib import Path
+from typing import Iterable
+
+import torch
+import torch.distributed as dist
+import torch.backends.cudnn as cudnn
+from torch.utils.tensorboard import SummaryWriter
+import torchvision.transforms as transforms
+import torchvision.datasets as datasets
+
+import utils.misc as misc
+from utils.misc import NativeScalerWithGradNormCount as NativeScaler
+from models.croco import CroCoNet
+from models.criterion import MaskedMSE
+from datasets.pairs_dataset import PairsDataset
+
+
+def get_args_parser():
+ parser = argparse.ArgumentParser('CroCo pre-training', add_help=False)
+ # model and criterion
+ parser.add_argument('--model', default='CroCoNet()', type=str, help="string containing the model to build")
+ parser.add_argument('--norm_pix_loss', default=1, choices=[0,1], help="apply per-patch mean/std normalization before applying the loss")
+ # dataset
+ parser.add_argument('--dataset', default='habitat_release', type=str, help="training set")
+ parser.add_argument('--transforms', default='crop224+acolor', type=str, help="transforms to apply") # in the paper, we also use some homography and rotation, but find later that they were not useful or even harmful
+ # training
+ parser.add_argument('--seed', default=0, type=int, help="Random seed")
+ parser.add_argument('--batch_size', default=64, type=int, help="Batch size per GPU (effective batch size is batch_size * accum_iter * # gpus")
+ parser.add_argument('--epochs', default=800, type=int, help="Maximum number of epochs for the scheduler")
+ parser.add_argument('--max_epoch', default=400, type=int, help="Stop training at this epoch")
+ parser.add_argument('--accum_iter', default=1, type=int, help="Accumulate gradient iterations (for increasing the effective batch size under memory constraints)")
+ parser.add_argument('--weight_decay', type=float, default=0.05, help="weight decay (default: 0.05)")
+ parser.add_argument('--lr', type=float, default=None, metavar='LR', help='learning rate (absolute lr)')
+ parser.add_argument('--blr', type=float, default=1.5e-4, metavar='LR', help='base learning rate: absolute_lr = base_lr * total_batch_size / 256')
+ parser.add_argument('--min_lr', type=float, default=0., metavar='LR', help='lower lr bound for cyclic schedulers that hit 0')
+ parser.add_argument('--warmup_epochs', type=int, default=40, metavar='N', help='epochs to warmup LR')
+ parser.add_argument('--amp', type=int, default=1, choices=[0,1], help="Use Automatic Mixed Precision for pretraining")
+ # others
+ parser.add_argument('--num_workers', default=8, type=int)
+ parser.add_argument('--world_size', default=1, type=int, help='number of distributed processes')
+ parser.add_argument('--local_rank', default=-1, type=int)
+ parser.add_argument('--dist_url', default='env://', help='url used to set up distributed training')
+ parser.add_argument('--save_freq', default=1, type=int, help='frequence (number of epochs) to save checkpoint in checkpoint-last.pth')
+ parser.add_argument('--keep_freq', default=20, type=int, help='frequence (number of epochs) to save checkpoint in checkpoint-%d.pth')
+ parser.add_argument('--print_freq', default=20, type=int, help='frequence (number of iterations) to print infos while training')
+ # paths
+ parser.add_argument('--output_dir', default='./output/', type=str, help="path where to save the output")
+ parser.add_argument('--data_dir', default='./data/', type=str, help="path where data are stored")
+ return parser
+
+
+
+
+def main(args):
+ misc.init_distributed_mode(args)
+ global_rank = misc.get_rank()
+ world_size = misc.get_world_size()
+
+ print("output_dir: "+args.output_dir)
+ if args.output_dir:
+ Path(args.output_dir).mkdir(parents=True, exist_ok=True)
+
+ # auto resume
+ last_ckpt_fname = os.path.join(args.output_dir, f'checkpoint-last.pth')
+ args.resume = last_ckpt_fname if os.path.isfile(last_ckpt_fname) else None
+
+ print('job dir: {}'.format(os.path.dirname(os.path.realpath(__file__))))
+ print("{}".format(args).replace(', ', ',\n'))
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ device = torch.device(device)
+
+ # fix the seed
+ seed = args.seed + misc.get_rank()
+ torch.manual_seed(seed)
+ np.random.seed(seed)
+
+ cudnn.benchmark = True
+
+ ## training dataset and loader
+ print('Building dataset for {:s} with transforms {:s}'.format(args.dataset, args.transforms))
+ dataset = PairsDataset(args.dataset, trfs=args.transforms, data_dir=args.data_dir)
+ if world_size>1:
+ sampler_train = torch.utils.data.DistributedSampler(
+ dataset, num_replicas=world_size, rank=global_rank, shuffle=True
+ )
+ print("Sampler_train = %s" % str(sampler_train))
+ else:
+ sampler_train = torch.utils.data.RandomSampler(dataset)
+ data_loader_train = torch.utils.data.DataLoader(
+ dataset, sampler=sampler_train,
+ batch_size=args.batch_size,
+ num_workers=args.num_workers,
+ pin_memory=True,
+ drop_last=True,
+ )
+
+ ## model
+ print('Loading model: {:s}'.format(args.model))
+ model = eval(args.model)
+ print('Loading criterion: MaskedMSE(norm_pix_loss={:s})'.format(str(bool(args.norm_pix_loss))))
+ criterion = MaskedMSE(norm_pix_loss=bool(args.norm_pix_loss))
+
+ model.to(device)
+ model_without_ddp = model
+ print("Model = %s" % str(model_without_ddp))
+
+ eff_batch_size = args.batch_size * args.accum_iter * misc.get_world_size()
+ if args.lr is None: # only base_lr is specified
+ args.lr = args.blr * eff_batch_size / 256
+ print("base lr: %.2e" % (args.lr * 256 / eff_batch_size))
+ print("actual lr: %.2e" % args.lr)
+ print("accumulate grad iterations: %d" % args.accum_iter)
+ print("effective batch size: %d" % eff_batch_size)
+
+ if args.distributed:
+ model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], find_unused_parameters=True, static_graph=True)
+ model_without_ddp = model.module
+
+ param_groups = misc.get_parameter_groups(model_without_ddp, args.weight_decay) # following timm: set wd as 0 for bias and norm layers
+ optimizer = torch.optim.AdamW(param_groups, lr=args.lr, betas=(0.9, 0.95))
+ print(optimizer)
+ loss_scaler = NativeScaler()
+
+ misc.load_model(args=args, model_without_ddp=model_without_ddp, optimizer=optimizer, loss_scaler=loss_scaler)
+
+ if global_rank == 0 and args.output_dir is not None:
+ log_writer = SummaryWriter(log_dir=args.output_dir)
+ else:
+ log_writer = None
+
+ print(f"Start training until {args.max_epoch} epochs")
+ start_time = time.time()
+ for epoch in range(args.start_epoch, args.max_epoch):
+ if world_size>1:
+ data_loader_train.sampler.set_epoch(epoch)
+
+ train_stats = train_one_epoch(
+ model, criterion, data_loader_train,
+ optimizer, device, epoch, loss_scaler,
+ log_writer=log_writer,
+ args=args
+ )
+
+ if args.output_dir and epoch % args.save_freq == 0 :
+ misc.save_model(
+ args=args, model_without_ddp=model_without_ddp, optimizer=optimizer,
+ loss_scaler=loss_scaler, epoch=epoch, fname='last')
+
+ if args.output_dir and (epoch % args.keep_freq == 0 or epoch + 1 == args.max_epoch) and (epoch>0 or args.max_epoch==1):
+ misc.save_model(
+ args=args, model_without_ddp=model_without_ddp, optimizer=optimizer,
+ loss_scaler=loss_scaler, epoch=epoch)
+
+ log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
+ 'epoch': epoch,}
+
+ if args.output_dir and misc.is_main_process():
+ if log_writer is not None:
+ log_writer.flush()
+ with open(os.path.join(args.output_dir, "log.txt"), mode="a", encoding="utf-8") as f:
+ f.write(json.dumps(log_stats) + "\n")
+
+ total_time = time.time() - start_time
+ total_time_str = str(datetime.timedelta(seconds=int(total_time)))
+ print('Training time {}'.format(total_time_str))
+
+
+
+
+def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module,
+ data_loader: Iterable, optimizer: torch.optim.Optimizer,
+ device: torch.device, epoch: int, loss_scaler,
+ log_writer=None,
+ args=None):
+ model.train(True)
+ metric_logger = misc.MetricLogger(delimiter=" ")
+ metric_logger.add_meter('lr', misc.SmoothedValue(window_size=1, fmt='{value:.6f}'))
+ header = 'Epoch: [{}]'.format(epoch)
+ accum_iter = args.accum_iter
+
+ optimizer.zero_grad()
+
+ if log_writer is not None:
+ print('log_dir: {}'.format(log_writer.log_dir))
+
+ for data_iter_step, (image1, image2) in enumerate(metric_logger.log_every(data_loader, args.print_freq, header)):
+
+ # we use a per iteration lr scheduler
+ if data_iter_step % accum_iter == 0:
+ misc.adjust_learning_rate(optimizer, data_iter_step / len(data_loader) + epoch, args)
+
+ image1 = image1.to(device, non_blocking=True)
+ image2 = image2.to(device, non_blocking=True)
+ with torch.cuda.amp.autocast(enabled=bool(args.amp)):
+ out, mask, target = model(image1, image2)
+ loss = criterion(out, mask, target)
+
+ loss_value = loss.item()
+
+ if not math.isfinite(loss_value):
+ print("Loss is {}, stopping training".format(loss_value))
+ sys.exit(1)
+
+ loss /= accum_iter
+ loss_scaler(loss, optimizer, parameters=model.parameters(),
+ update_grad=(data_iter_step + 1) % accum_iter == 0)
+ if (data_iter_step + 1) % accum_iter == 0:
+ optimizer.zero_grad()
+
+ torch.cuda.synchronize()
+
+ metric_logger.update(loss=loss_value)
+
+ lr = optimizer.param_groups[0]["lr"]
+ metric_logger.update(lr=lr)
+
+ loss_value_reduce = misc.all_reduce_mean(loss_value)
+ if log_writer is not None and ((data_iter_step + 1) % (accum_iter*args.print_freq)) == 0:
+ # x-axis is based on epoch_1000x in the tensorboard, calibrating differences curves when batch size changes
+ epoch_1000x = int((data_iter_step / len(data_loader) + epoch) * 1000)
+ log_writer.add_scalar('train_loss', loss_value_reduce, epoch_1000x)
+ log_writer.add_scalar('lr', lr, epoch_1000x)
+
+ # gather the stats from all processes
+ metric_logger.synchronize_between_processes()
+ print("Averaged stats:", metric_logger)
+ return {k: meter.global_avg for k, meter in metric_logger.meters.items()}
+
+
+
+if __name__ == '__main__':
+ args = get_args_parser()
+ args = args.parse_args()
+ main(args)
diff --git a/third_party/mast3r/dust3r/croco/stereoflow/README.MD b/third_party/mast3r/dust3r/croco/stereoflow/README.MD
new file mode 100644
index 0000000000000000000000000000000000000000..81595380fadd274b523e0cf77921b1b65cbedb34
--- /dev/null
+++ b/third_party/mast3r/dust3r/croco/stereoflow/README.MD
@@ -0,0 +1,318 @@
+## CroCo-Stereo and CroCo-Flow
+
+This README explains how to use CroCo-Stereo and CroCo-Flow as well as how they were trained.
+All commands should be launched from the root directory.
+
+### Simple inference example
+
+We provide a simple inference exemple for CroCo-Stereo and CroCo-Flow in the Totebook `croco-stereo-flow-demo.ipynb`.
+Before running it, please download the trained models with:
+```
+bash stereoflow/download_model.sh crocostereo.pth
+bash stereoflow/download_model.sh crocoflow.pth
+```
+
+### Prepare data for training or evaluation
+
+Put the datasets used for training/evaluation in `./data/stereoflow` (or update the paths at the top of `stereoflow/datasets_stereo.py` and `stereoflow/datasets_flow.py`).
+Please find below on the file structure should look for each dataset:
+
+FlyingChairs
+
+```
+./data/stereoflow/FlyingChairs/
+└───chairs_split.txt
+└───data/
+ └─── ...
+```
+
+
+
+MPI-Sintel
+
+```
+./data/stereoflow/MPI-Sintel/
+└───training/
+│ └───clean/
+│ └───final/
+│ └───flow/
+└───test/
+ └───clean/
+ └───final/
+```
+
+
+
+SceneFlow (including FlyingThings)
+
+```
+./data/stereoflow/SceneFlow/
+└───Driving/
+│ └───disparity/
+│ └───frames_cleanpass/
+│ └───frames_finalpass/
+└───FlyingThings/
+│ └───disparity/
+│ └───frames_cleanpass/
+│ └───frames_finalpass/
+│ └───optical_flow/
+└───Monkaa/
+ └───disparity/
+ └───frames_cleanpass/
+ └───frames_finalpass/
+```
+
+
+
+TartanAir
+
+```
+./data/stereoflow/TartanAir/
+└───abandonedfactory/
+│ └───.../
+└───abandonedfactory_night/
+│ └───.../
+└───.../
+```
+
+
+
+Booster
+
+```
+./data/stereoflow/booster_gt/
+└───train/
+ └───balanced/
+ └───Bathroom/
+ └───Bedroom/
+ └───...
+```
+
+
+
+CREStereo
+
+```
+./data/stereoflow/crenet_stereo_trainset/
+└───stereo_trainset/
+ └───crestereo/
+ └───hole/
+ └───reflective/
+ └───shapenet/
+ └───tree/
+```
+
+
+
+ETH3D Two-view Low-res
+
+```
+./data/stereoflow/eth3d_lowres/
+└───test/
+│ └───lakeside_1l/
+│ └───...
+└───train/
+│ └───delivery_area_1l/
+│ └───...
+└───train_gt/
+ └───delivery_area_1l/
+ └───...
+```
+
+
+
+KITTI 2012
+
+```
+./data/stereoflow/kitti-stereo-2012/
+└───testing/
+│ └───colored_0/
+│ └───colored_1/
+└───training/
+ └───colored_0/
+ └───colored_1/
+ └───disp_occ/
+ └───flow_occ/
+```
+
+
+
+KITTI 2015
+
+```
+./data/stereoflow/kitti-stereo-2015/
+└───testing/
+│ └───image_2/
+│ └───image_3/
+└───training/
+ └───image_2/
+ └───image_3/
+ └───disp_occ_0/
+ └───flow_occ/
+```
+
+
+
+Middlebury
+
+```
+./data/stereoflow/middlebury
+└───2005/
+│ └───train/
+│ └───Art/
+│ └───...
+└───2006/
+│ └───Aloe/
+│ └───Baby1/
+│ └───...
+└───2014/
+│ └───Adirondack-imperfect/
+│ └───Adirondack-perfect/
+│ └───...
+└───2021/
+│ └───data/
+│ └───artroom1/
+│ └───artroom2/
+│ └───...
+└───MiddEval3_F/
+ └───test/
+ │ └───Australia/
+ │ └───...
+ └───train/
+ └───Adirondack/
+ └───...
+```
+
+
+
+Spring
+
+```
+./data/stereoflow/spring/
+└───test/
+│ └───0003/
+│ └───...
+└───train/
+ └───0001/
+ └───...
+```
+
+
+
+### CroCo-Stereo
+
+##### Main model
+
+The main training of CroCo-Stereo was performed on a series of datasets, and it was used as it for Middlebury v3 benchmark.
+
+```
+# Download the model
+bash stereoflow/download_model.sh crocostereo.pth
+# Middlebury v3 submission
+python stereoflow/test.py --model stereoflow_models/crocostereo.pth --dataset "MdEval3('all_full')" --save submission --tile_overlap 0.9
+# Training command that was used, using checkpoint-last.pth
+python -u stereoflow/train.py stereo --criterion "LaplacianLossBounded2()" --dataset "CREStereo('train')+SceneFlow('train_allpass')+30*ETH3DLowRes('train')+50*Md05('train')+50*Md06('train')+50*Md14('train')+50*Md21('train')+50*MdEval3('train_full')+Booster('train_balanced')" --val_dataset "SceneFlow('test1of100_finalpass')+SceneFlow('test1of100_cleanpass')+ETH3DLowRes('subval')+Md05('subval')+Md06('subval')+Md14('subval')+Md21('subval')+MdEval3('subval_full')+Booster('subval_balanced')" --lr 3e-5 --batch_size 6 --epochs 32 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --output_dir xps/crocostereo/main/
+# or it can be launched on multiple gpus (while maintaining the effective batch size), e.g. on 3 gpus:
+torchrun --nproc_per_node 3 stereoflow/train.py stereo --criterion "LaplacianLossBounded2()" --dataset "CREStereo('train')+SceneFlow('train_allpass')+30*ETH3DLowRes('train')+50*Md05('train')+50*Md06('train')+50*Md14('train')+50*Md21('train')+50*MdEval3('train_full')+Booster('train_balanced')" --val_dataset "SceneFlow('test1of100_finalpass')+SceneFlow('test1of100_cleanpass')+ETH3DLowRes('subval')+Md05('subval')+Md06('subval')+Md14('subval')+Md21('subval')+MdEval3('subval_full')+Booster('subval_balanced')" --lr 3e-5 --batch_size 2 --epochs 32 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --output_dir xps/crocostereo/main/
+```
+
+For evaluation of validation set, we also provide the model trained on the `subtrain` subset of the training sets.
+
+```
+# Download the model
+bash stereoflow/download_model.sh crocostereo_subtrain.pth
+# Evaluation on validation sets
+python stereoflow/test.py --model stereoflow_models/crocostereo_subtrain.pth --dataset "MdEval3('subval_full')+ETH3DLowRes('subval')+SceneFlow('test_finalpass')+SceneFlow('test_cleanpass')" --save metrics --tile_overlap 0.9
+# Training command that was used (same as above but on subtrain, using checkpoint-best.pth), can also be launched on multiple gpus
+python -u stereoflow/train.py stereo --criterion "LaplacianLossBounded2()" --dataset "CREStereo('train')+SceneFlow('train_allpass')+30*ETH3DLowRes('subtrain')+50*Md05('subtrain')+50*Md06('subtrain')+50*Md14('subtrain')+50*Md21('subtrain')+50*MdEval3('subtrain_full')+Booster('subtrain_balanced')" --val_dataset "SceneFlow('test1of100_finalpass')+SceneFlow('test1of100_cleanpass')+ETH3DLowRes('subval')+Md05('subval')+Md06('subval')+Md14('subval')+Md21('subval')+MdEval3('subval_full')+Booster('subval_balanced')" --lr 3e-5 --batch_size 6 --epochs 32 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --output_dir xps/crocostereo/main_subtrain/
+```
+
+##### Other models
+
+
+ Model for ETH3D
+ The model used for the submission on ETH3D is trained with the same command but using an unbounded Laplacian loss.
+
+ # Download the model
+ bash stereoflow/download_model.sh crocostereo_eth3d.pth
+ # ETH3D submission
+ python stereoflow/test.py --model stereoflow_models/crocostereo_eth3d.pth --dataset "ETH3DLowRes('all')" --save submission --tile_overlap 0.9
+ # Training command that was used
+ python -u stereoflow/train.py stereo --criterion "LaplacianLoss()" --tile_conf_mode conf_expbeta3 --dataset "CREStereo('train')+SceneFlow('train_allpass')+30*ETH3DLowRes('train')+50*Md05('train')+50*Md06('train')+50*Md14('train')+50*Md21('train')+50*MdEval3('train_full')+Booster('train_balanced')" --val_dataset "SceneFlow('test1of100_finalpass')+SceneFlow('test1of100_cleanpass')+ETH3DLowRes('subval')+Md05('subval')+Md06('subval')+Md14('subval')+Md21('subval')+MdEval3('subval_full')+Booster('subval_balanced')" --lr 3e-5 --batch_size 6 --epochs 32 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --output_dir xps/crocostereo/main_eth3d/
+
+
+
+
+ Main model finetuned on Kitti
+
+ # Download the model
+ bash stereoflow/download_model.sh crocostereo_finetune_kitti.pth
+ # Kitti submission
+ python stereoflow/test.py --model stereoflow_models/crocostereo_finetune_kitti.pth --dataset "Kitti15('test')" --save submission --tile_overlap 0.9
+ # Training that was used
+ python -u stereoflow/train.py stereo --crop 352 1216 --criterion "LaplacianLossBounded2()" --dataset "Kitti12('train')+Kitti15('train')" --lr 3e-5 --batch_size 1 --accum_iter 6 --epochs 20 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --start_from stereoflow_models/crocostereo.pth --output_dir xps/crocostereo/finetune_kitti/ --save_every 5
+
+
+
+ Main model finetuned on Spring
+
+ # Download the model
+ bash stereoflow/download_model.sh crocostereo_finetune_spring.pth
+ # Spring submission
+ python stereoflow/test.py --model stereoflow_models/crocostereo_finetune_spring.pth --dataset "Spring('test')" --save submission --tile_overlap 0.9
+ # Training command that was used
+ python -u stereoflow/train.py stereo --criterion "LaplacianLossBounded2()" --dataset "Spring('train')" --lr 3e-5 --batch_size 6 --epochs 8 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --start_from stereoflow_models/crocostereo.pth --output_dir xps/crocostereo/finetune_spring/
+
+
+
+ Smaller models
+ To train CroCo-Stereo with smaller CroCo pretrained models, simply replace the --pretrained
argument. To download the smaller CroCo-Stereo models based on CroCo v2 pretraining with ViT-Base encoder and Small encoder, use bash stereoflow/download_model.sh crocostereo_subtrain_vitb_smalldecoder.pth
, and for the model with a ViT-Base encoder and a Base decoder, use bash stereoflow/download_model.sh crocostereo_subtrain_vitb_basedecoder.pth
.
+
+
+
+### CroCo-Flow
+
+##### Main model
+
+The main training of CroCo-Flow was performed on the FlyingThings, FlyingChairs, MPI-Sintel and TartanAir datasets.
+It was used for our submission to the MPI-Sintel benchmark.
+
+```
+# Download the model
+bash stereoflow/download_model.sh crocoflow.pth
+# Evaluation
+python stereoflow/test.py --model stereoflow_models/crocoflow.pth --dataset "MPISintel('subval_cleanpass')+MPISintel('subval_finalpass')" --save metrics --tile_overlap 0.9
+# Sintel submission
+python stereoflow/test.py --model stereoflow_models/crocoflow.pth --dataset "MPISintel('test_allpass')" --save submission --tile_overlap 0.9
+# Training command that was used, with checkpoint-best.pth
+python -u stereoflow/train.py flow --criterion "LaplacianLossBounded()" --dataset "40*MPISintel('subtrain_cleanpass')+40*MPISintel('subtrain_finalpass')+4*FlyingThings('train_allpass')+4*FlyingChairs('train')+TartanAir('train')" --val_dataset "MPISintel('subval_cleanpass')+MPISintel('subval_finalpass')" --lr 2e-5 --batch_size 8 --epochs 240 --img_per_epoch 30000 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --output_dir xps/crocoflow/main/
+```
+
+##### Other models
+
+
+ Main model finetuned on Kitti
+
+ # Download the model
+ bash stereoflow/download_model.sh crocoflow_finetune_kitti.pth
+ # Kitti submission
+ python stereoflow/test.py --model stereoflow_models/crocoflow_finetune_kitti.pth --dataset "Kitti15('test')" --save submission --tile_overlap 0.99
+ # Training that was used, with checkpoint-last.pth
+ python -u stereoflow/train.py flow --crop 352 1216 --criterion "LaplacianLossBounded()" --dataset "Kitti15('train')+Kitti12('train')" --lr 2e-5 --batch_size 1 --accum_iter 8 --epochs 150 --save_every 5 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --start_from stereoflow_models/crocoflow.pth --output_dir xps/crocoflow/finetune_kitti/
+
+
+
+ Main model finetuned on Spring
+
+ # Download the model
+ bash stereoflow/download_model.sh crocoflow_finetune_spring.pth
+ # Spring submission
+ python stereoflow/test.py --model stereoflow_models/crocoflow_finetune_spring.pth --dataset "Spring('test')" --save submission --tile_overlap 0.9
+ # Training command that was used, with checkpoint-last.pth
+ python -u stereoflow/train.py flow --criterion "LaplacianLossBounded()" --dataset "Spring('train')" --lr 2e-5 --batch_size 8 --epochs 12 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --start_from stereoflow_models/crocoflow.pth --output_dir xps/crocoflow/finetune_spring/
+
+
+
+ Smaller models
+ To train CroCo-Flow with smaller CroCo pretrained models, simply replace the --pretrained
argument. To download the smaller CroCo-Flow models based on CroCo v2 pretraining with ViT-Base encoder and Small encoder, use bash stereoflow/download_model.sh crocoflow_vitb_smalldecoder.pth
, and for the model with a ViT-Base encoder and a Base decoder, use bash stereoflow/download_model.sh crocoflow_vitb_basedecoder.pth
.
+
diff --git a/third_party/mast3r/dust3r/croco/stereoflow/augmentor.py b/third_party/mast3r/dust3r/croco/stereoflow/augmentor.py
new file mode 100644
index 0000000000000000000000000000000000000000..69e6117151988d94cbc4b385e0d88e982133bf10
--- /dev/null
+++ b/third_party/mast3r/dust3r/croco/stereoflow/augmentor.py
@@ -0,0 +1,290 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+# --------------------------------------------------------
+# Data augmentation for training stereo and flow
+# --------------------------------------------------------
+
+# References
+# https://github.com/autonomousvision/unimatch/blob/master/dataloader/stereo/transforms.py
+# https://github.com/autonomousvision/unimatch/blob/master/dataloader/flow/transforms.py
+
+
+import numpy as np
+import random
+from PIL import Image
+
+import cv2
+cv2.setNumThreads(0)
+cv2.ocl.setUseOpenCL(False)
+
+import torch
+from torchvision.transforms import ColorJitter
+import torchvision.transforms.functional as FF
+
+class StereoAugmentor(object):
+
+ def __init__(self, crop_size, scale_prob=0.5, scale_xonly=True, lhth=800., lminscale=0.0, lmaxscale=1.0, hminscale=-0.2, hmaxscale=0.4, scale_interp_nearest=True, rightjitterprob=0.5, v_flip_prob=0.5, color_aug_asym=True, color_choice_prob=0.5):
+ self.crop_size = crop_size
+ self.scale_prob = scale_prob
+ self.scale_xonly = scale_xonly
+ self.lhth = lhth
+ self.lminscale = lminscale
+ self.lmaxscale = lmaxscale
+ self.hminscale = hminscale
+ self.hmaxscale = hmaxscale
+ self.scale_interp_nearest = scale_interp_nearest
+ self.rightjitterprob = rightjitterprob
+ self.v_flip_prob = v_flip_prob
+ self.color_aug_asym = color_aug_asym
+ self.color_choice_prob = color_choice_prob
+
+ def _random_scale(self, img1, img2, disp):
+ ch,cw = self.crop_size
+ h,w = img1.shape[:2]
+ if self.scale_prob>0. and np.random.rand()1.:
+ scale_x = clip_scale
+ scale_y = scale_x if not self.scale_xonly else 1.0
+ img1 = cv2.resize(img1, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR)
+ img2 = cv2.resize(img2, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR)
+ disp = cv2.resize(disp, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR if not self.scale_interp_nearest else cv2.INTER_NEAREST) * scale_x
+ return img1, img2, disp
+
+ def _random_crop(self, img1, img2, disp):
+ h,w = img1.shape[:2]
+ ch,cw = self.crop_size
+ assert ch<=h and cw<=w, (img1.shape, h,w,ch,cw)
+ offset_x = np.random.randint(w - cw + 1)
+ offset_y = np.random.randint(h - ch + 1)
+ img1 = img1[offset_y:offset_y+ch,offset_x:offset_x+cw]
+ img2 = img2[offset_y:offset_y+ch,offset_x:offset_x+cw]
+ disp = disp[offset_y:offset_y+ch,offset_x:offset_x+cw]
+ return img1, img2, disp
+
+ def _random_vflip(self, img1, img2, disp):
+ # vertical flip
+ if self.v_flip_prob>0 and np.random.rand() < self.v_flip_prob:
+ img1 = np.copy(np.flipud(img1))
+ img2 = np.copy(np.flipud(img2))
+ disp = np.copy(np.flipud(disp))
+ return img1, img2, disp
+
+ def _random_rotate_shift_right(self, img2):
+ if self.rightjitterprob>0. and np.random.rand() 0) & (xx < wd1) & (yy > 0) & (yy < ht1)
+ xx = xx[v]
+ yy = yy[v]
+ flow1 = flow1[v]
+
+ flow = np.inf * np.ones([ht1, wd1, 2], dtype=np.float32) # invalid value every where, before we fill it with the correct ones
+ flow[yy, xx] = flow1
+ return flow
+
+ def spatial_transform(self, img1, img2, flow, dname):
+
+ if np.random.rand() < self.spatial_aug_prob:
+ # randomly sample scale
+ ht, wd = img1.shape[:2]
+ clip_min_scale = np.maximum(
+ (self.crop_size[0] + 8) / float(ht),
+ (self.crop_size[1] + 8) / float(wd))
+ min_scale, max_scale = self.min_scale, self.max_scale
+ scale = 2 ** np.random.uniform(self.min_scale, self.max_scale)
+ scale_x = scale
+ scale_y = scale
+ if np.random.rand() < self.stretch_prob:
+ scale_x *= 2 ** np.random.uniform(-self.max_stretch, self.max_stretch)
+ scale_y *= 2 ** np.random.uniform(-self.max_stretch, self.max_stretch)
+ scale_x = np.clip(scale_x, clip_min_scale, None)
+ scale_y = np.clip(scale_y, clip_min_scale, None)
+ # rescale the images
+ img1 = cv2.resize(img1, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR)
+ img2 = cv2.resize(img2, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR)
+ flow = self._resize_flow(flow, scale_x, scale_y, factor=2.0 if dname=='Spring' else 1.0)
+ elif dname=="Spring":
+ flow = self._resize_flow(flow, 1.0, 1.0, factor=2.0)
+
+ if self.h_flip_prob>0. and np.random.rand() < self.h_flip_prob: # h-flip
+ img1 = img1[:, ::-1]
+ img2 = img2[:, ::-1]
+ flow = flow[:, ::-1] * [-1.0, 1.0]
+
+ if self.v_flip_prob>0. and np.random.rand() < self.v_flip_prob: # v-flip
+ img1 = img1[::-1, :]
+ img2 = img2[::-1, :]
+ flow = flow[::-1, :] * [1.0, -1.0]
+
+ # In case no cropping
+ if img1.shape[0] - self.crop_size[0] > 0:
+ y0 = np.random.randint(0, img1.shape[0] - self.crop_size[0])
+ else:
+ y0 = 0
+ if img1.shape[1] - self.crop_size[1] > 0:
+ x0 = np.random.randint(0, img1.shape[1] - self.crop_size[1])
+ else:
+ x0 = 0
+
+ img1 = img1[y0:y0 + self.crop_size[0], x0:x0 + self.crop_size[1]]
+ img2 = img2[y0:y0 + self.crop_size[0], x0:x0 + self.crop_size[1]]
+ flow = flow[y0:y0 + self.crop_size[0], x0:x0 + self.crop_size[1]]
+
+ return img1, img2, flow
+
+ def __call__(self, img1, img2, flow, dname):
+ img1, img2, flow = self.spatial_transform(img1, img2, flow, dname)
+ img1, img2 = self.color_transform(img1, img2)
+ img1 = np.ascontiguousarray(img1)
+ img2 = np.ascontiguousarray(img2)
+ flow = np.ascontiguousarray(flow)
+ return img1, img2, flow
\ No newline at end of file
diff --git a/third_party/mast3r/dust3r/croco/stereoflow/criterion.py b/third_party/mast3r/dust3r/croco/stereoflow/criterion.py
new file mode 100644
index 0000000000000000000000000000000000000000..57792ebeeee34827b317a4d32b7445837bb33f17
--- /dev/null
+++ b/third_party/mast3r/dust3r/croco/stereoflow/criterion.py
@@ -0,0 +1,251 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+# --------------------------------------------------------
+# Losses, metrics per batch, metrics per dataset
+# --------------------------------------------------------
+
+import torch
+from torch import nn
+import torch.nn.functional as F
+
+def _get_gtnorm(gt):
+ if gt.size(1)==1: # stereo
+ return gt
+ # flow
+ return torch.sqrt(torch.sum(gt**2, dim=1, keepdims=True)) # Bx1xHxW
+
+############ losses without confidence
+
+class L1Loss(nn.Module):
+
+ def __init__(self, max_gtnorm=None):
+ super().__init__()
+ self.max_gtnorm = max_gtnorm
+ self.with_conf = False
+
+ def _error(self, gt, predictions):
+ return torch.abs(gt-predictions)
+
+ def forward(self, predictions, gt, inspect=False):
+ mask = torch.isfinite(gt)
+ if self.max_gtnorm is not None:
+ mask *= _get_gtnorm(gt).expand(-1,gt.size(1),-1,-1) which is a constant
+
+
+class LaplacianLossBounded(nn.Module): # used for CroCo-Flow ; in the equation of the paper, we have a=1/b
+ def __init__(self, max_gtnorm=10000., a=0.25, b=4.):
+ super().__init__()
+ self.max_gtnorm = max_gtnorm
+ self.with_conf = True
+ self.a, self.b = a, b
+
+ def forward(self, predictions, gt, conf):
+ mask = torch.isfinite(gt)
+ mask = mask[:,0,:,:]
+ if self.max_gtnorm is not None: mask *= _get_gtnorm(gt)[:,0,:,:] which is a constant
+
+class LaplacianLossBounded2(nn.Module): # used for CroCo-Stereo (except for ETH3D) ; in the equation of the paper, we have a=b
+ def __init__(self, max_gtnorm=None, a=3.0, b=3.0):
+ super().__init__()
+ self.max_gtnorm = max_gtnorm
+ self.with_conf = True
+ self.a, self.b = a, b
+
+ def forward(self, predictions, gt, conf):
+ mask = torch.isfinite(gt)
+ mask = mask[:,0,:,:]
+ if self.max_gtnorm is not None: mask *= _get_gtnorm(gt)[:,0,:,:] which is a constant
+
+############## metrics per batch
+
+class StereoMetrics(nn.Module):
+
+ def __init__(self, do_quantile=False):
+ super().__init__()
+ self.bad_ths = [0.5,1,2,3]
+ self.do_quantile = do_quantile
+
+ def forward(self, predictions, gt):
+ B = predictions.size(0)
+ metrics = {}
+ gtcopy = gt.clone()
+ mask = torch.isfinite(gtcopy)
+ gtcopy[~mask] = 999999.0 # we make a copy and put a non-infinite value, such that it does not become nan once multiplied by the mask value 0
+ Npx = mask.view(B,-1).sum(dim=1)
+ L1error = (torch.abs(gtcopy-predictions)*mask).view(B,-1)
+ L2error = (torch.square(gtcopy-predictions)*mask).view(B,-1)
+ # avgerr
+ metrics['avgerr'] = torch.mean(L1error.sum(dim=1)/Npx )
+ # rmse
+ metrics['rmse'] = torch.sqrt(L2error.sum(dim=1)/Npx).mean(dim=0)
+ # err > t for t in [0.5,1,2,3]
+ for ths in self.bad_ths:
+ metrics['bad@{:.1f}'.format(ths)] = (((L1error>ths)* mask.view(B,-1)).sum(dim=1)/Npx).mean(dim=0) * 100
+ return metrics
+
+class FlowMetrics(nn.Module):
+ def __init__(self):
+ super().__init__()
+ self.bad_ths = [1,3,5]
+
+ def forward(self, predictions, gt):
+ B = predictions.size(0)
+ metrics = {}
+ mask = torch.isfinite(gt[:,0,:,:]) # both x and y would be infinite
+ Npx = mask.view(B,-1).sum(dim=1)
+ gtcopy = gt.clone() # to compute L1/L2 error, we need to have non-infinite value, the error computed at this locations will be ignored
+ gtcopy[:,0,:,:][~mask] = 999999.0
+ gtcopy[:,1,:,:][~mask] = 999999.0
+ L1error = (torch.abs(gtcopy-predictions).sum(dim=1)*mask).view(B,-1)
+ L2error = (torch.sqrt(torch.sum(torch.square(gtcopy-predictions),dim=1))*mask).view(B,-1)
+ metrics['L1err'] = torch.mean(L1error.sum(dim=1)/Npx )
+ metrics['EPE'] = torch.mean(L2error.sum(dim=1)/Npx )
+ for ths in self.bad_ths:
+ metrics['bad@{:.1f}'.format(ths)] = (((L2error>ths)* mask.view(B,-1)).sum(dim=1)/Npx).mean(dim=0) * 100
+ return metrics
+
+############## metrics per dataset
+## we update the average and maintain the number of pixels while adding data batch per batch
+## at the beggining, call reset()
+## after each batch, call add_batch(...)
+## at the end: call get_results()
+
+class StereoDatasetMetrics(nn.Module):
+
+ def __init__(self):
+ super().__init__()
+ self.bad_ths = [0.5,1,2,3]
+
+ def reset(self):
+ self.agg_N = 0 # number of pixels so far
+ self.agg_L1err = torch.tensor(0.0) # L1 error so far
+ self.agg_Nbad = [0 for _ in self.bad_ths] # counter of bad pixels
+ self._metrics = None
+
+ def add_batch(self, predictions, gt):
+ assert predictions.size(1)==1, predictions.size()
+ assert gt.size(1)==1, gt.size()
+ if gt.size(2)==predictions.size(2)*2 and gt.size(3)==predictions.size(3)*2: # special case for Spring ...
+ L1err = torch.minimum( torch.minimum( torch.minimum(
+ torch.sum(torch.abs(gt[:,:,0::2,0::2]-predictions),dim=1),
+ torch.sum(torch.abs(gt[:,:,1::2,0::2]-predictions),dim=1)),
+ torch.sum(torch.abs(gt[:,:,0::2,1::2]-predictions),dim=1)),
+ torch.sum(torch.abs(gt[:,:,1::2,1::2]-predictions),dim=1))
+ valid = torch.isfinite(L1err)
+ else:
+ valid = torch.isfinite(gt[:,0,:,:]) # both x and y would be infinite
+ L1err = torch.sum(torch.abs(gt-predictions),dim=1)
+ N = valid.sum()
+ Nnew = self.agg_N + N
+ self.agg_L1err = float(self.agg_N)/Nnew * self.agg_L1err + L1err[valid].mean().cpu() * float(N)/Nnew
+ self.agg_N = Nnew
+ for i,th in enumerate(self.bad_ths):
+ self.agg_Nbad[i] += (L1err[valid]>th).sum().cpu()
+
+ def _compute_metrics(self):
+ if self._metrics is not None: return
+ out = {}
+ out['L1err'] = self.agg_L1err.item()
+ for i,th in enumerate(self.bad_ths):
+ out['bad@{:.1f}'.format(th)] = (float(self.agg_Nbad[i]) / self.agg_N).item() * 100.0
+ self._metrics = out
+
+ def get_results(self):
+ self._compute_metrics() # to avoid recompute them multiple times
+ return self._metrics
+
+class FlowDatasetMetrics(nn.Module):
+
+ def __init__(self):
+ super().__init__()
+ self.bad_ths = [0.5,1,3,5]
+ self.speed_ths = [(0,10),(10,40),(40,torch.inf)]
+
+ def reset(self):
+ self.agg_N = 0 # number of pixels so far
+ self.agg_L1err = torch.tensor(0.0) # L1 error so far
+ self.agg_L2err = torch.tensor(0.0) # L2 (=EPE) error so far
+ self.agg_Nbad = [0 for _ in self.bad_ths] # counter of bad pixels
+ self.agg_EPEspeed = [torch.tensor(0.0) for _ in self.speed_ths] # EPE per speed bin so far
+ self.agg_Nspeed = [0 for _ in self.speed_ths] # N pixels per speed bin so far
+ self._metrics = None
+ self.pairname_results = {}
+
+ def add_batch(self, predictions, gt):
+ assert predictions.size(1)==2, predictions.size()
+ assert gt.size(1)==2, gt.size()
+ if gt.size(2)==predictions.size(2)*2 and gt.size(3)==predictions.size(3)*2: # special case for Spring ...
+ L1err = torch.minimum( torch.minimum( torch.minimum(
+ torch.sum(torch.abs(gt[:,:,0::2,0::2]-predictions),dim=1),
+ torch.sum(torch.abs(gt[:,:,1::2,0::2]-predictions),dim=1)),
+ torch.sum(torch.abs(gt[:,:,0::2,1::2]-predictions),dim=1)),
+ torch.sum(torch.abs(gt[:,:,1::2,1::2]-predictions),dim=1))
+ L2err = torch.minimum( torch.minimum( torch.minimum(
+ torch.sqrt(torch.sum(torch.square(gt[:,:,0::2,0::2]-predictions),dim=1)),
+ torch.sqrt(torch.sum(torch.square(gt[:,:,1::2,0::2]-predictions),dim=1))),
+ torch.sqrt(torch.sum(torch.square(gt[:,:,0::2,1::2]-predictions),dim=1))),
+ torch.sqrt(torch.sum(torch.square(gt[:,:,1::2,1::2]-predictions),dim=1)))
+ valid = torch.isfinite(L1err)
+ gtspeed = (torch.sqrt(torch.sum(torch.square(gt[:,:,0::2,0::2]),dim=1)) + torch.sqrt(torch.sum(torch.square(gt[:,:,0::2,1::2]),dim=1)) +\
+ torch.sqrt(torch.sum(torch.square(gt[:,:,1::2,0::2]),dim=1)) + torch.sqrt(torch.sum(torch.square(gt[:,:,1::2,1::2]),dim=1)) ) / 4.0 # let's just average them
+ else:
+ valid = torch.isfinite(gt[:,0,:,:]) # both x and y would be infinite
+ L1err = torch.sum(torch.abs(gt-predictions),dim=1)
+ L2err = torch.sqrt(torch.sum(torch.square(gt-predictions),dim=1))
+ gtspeed = torch.sqrt(torch.sum(torch.square(gt),dim=1))
+ N = valid.sum()
+ Nnew = self.agg_N + N
+ self.agg_L1err = float(self.agg_N)/Nnew * self.agg_L1err + L1err[valid].mean().cpu() * float(N)/Nnew
+ self.agg_L2err = float(self.agg_N)/Nnew * self.agg_L2err + L2err[valid].mean().cpu() * float(N)/Nnew
+ self.agg_N = Nnew
+ for i,th in enumerate(self.bad_ths):
+ self.agg_Nbad[i] += (L2err[valid]>th).sum().cpu()
+ for i,(th1,th2) in enumerate(self.speed_ths):
+ vv = (gtspeed[valid]>=th1) * (gtspeed[valid] don't use batch_size>1 at test time)
+ self._prepare_data()
+ self._load_or_build_cache()
+
+ def prepare_data(self):
+ """
+ to be defined for each dataset
+ """
+ raise NotImplementedError
+
+ def __len__(self):
+ return len(self.pairnames) # each pairname is typically of the form (str, int1, int2)
+
+ def __getitem__(self, index):
+ pairname = self.pairnames[index]
+
+ # get filenames
+ img1name = self.pairname_to_img1name(pairname)
+ img2name = self.pairname_to_img2name(pairname)
+ flowname = self.pairname_to_flowname(pairname) if self.pairname_to_flowname is not None else None
+
+ # load images and disparities
+ img1 = _read_img(img1name)
+ img2 = _read_img(img2name)
+ flow = self.load_flow(flowname) if flowname is not None else None
+
+ # apply augmentations
+ if self.augmentor is not None:
+ img1, img2, flow = self.augmentor(img1, img2, flow, self.name)
+
+ if self.totensor:
+ img1 = img_to_tensor(img1)
+ img2 = img_to_tensor(img2)
+ if flow is not None:
+ flow = flow_to_tensor(flow)
+ else:
+ flow = torch.tensor([]) # to allow dataloader batching with default collate_gn
+ pairname = str(pairname) # transform potential tuple to str to be able to batch it
+
+ return img1, img2, flow, pairname
+
+ def __rmul__(self, v):
+ self.rmul *= v
+ self.pairnames = v * self.pairnames
+ return self
+
+ def __str__(self):
+ return f'{self.__class__.__name__}_{self.split}'
+
+ def __repr__(self):
+ s = f'{self.__class__.__name__}(split={self.split}, augmentor={self.augmentor_str}, crop_size={str(self.crop_size)}, totensor={self.totensor})'
+ if self.rmul==1:
+ s+=f'\n\tnum pairs: {len(self.pairnames)}'
+ else:
+ s+=f'\n\tnum pairs: {len(self.pairnames)} ({len(self.pairnames)//self.rmul}x{self.rmul})'
+ return s
+
+ def _set_root(self):
+ self.root = dataset_to_root[self.name]
+ assert os.path.isdir(self.root), f"could not find root directory for dataset {self.name}: {self.root}"
+
+ def _load_or_build_cache(self):
+ cache_file = osp.join(cache_dir, self.name+'.pkl')
+ if osp.isfile(cache_file):
+ with open(cache_file, 'rb') as fid:
+ self.pairnames = pickle.load(fid)[self.split]
+ else:
+ tosave = self._build_cache()
+ os.makedirs(cache_dir, exist_ok=True)
+ with open(cache_file, 'wb') as fid:
+ pickle.dump(tosave, fid)
+ self.pairnames = tosave[self.split]
+
+class TartanAirDataset(FlowDataset):
+
+ def _prepare_data(self):
+ self.name = "TartanAir"
+ self._set_root()
+ assert self.split in ['train']
+ self.pairname_to_img1name = lambda pairname: osp.join(self.root, pairname[0], 'image_left/{:06d}_left.png'.format(pairname[1]))
+ self.pairname_to_img2name = lambda pairname: osp.join(self.root, pairname[0], 'image_left/{:06d}_left.png'.format(pairname[2]))
+ self.pairname_to_flowname = lambda pairname: osp.join(self.root, pairname[0], 'flow/{:06d}_{:06d}_flow.npy'.format(pairname[1],pairname[2]))
+ self.pairname_to_str = lambda pairname: os.path.join(pairname[0][pairname[0].find('/')+1:], '{:06d}_{:06d}'.format(pairname[1], pairname[2]))
+ self.load_flow = _read_numpy_flow
+
+ def _build_cache(self):
+ seqs = sorted(os.listdir(self.root))
+ pairs = [(osp.join(s,s,difficulty,Pxxx),int(a[:6]),int(a[:6])+1) for s in seqs for difficulty in ['Easy','Hard'] for Pxxx in sorted(os.listdir(osp.join(self.root,s,s,difficulty))) for a in sorted(os.listdir(osp.join(self.root,s,s,difficulty,Pxxx,'image_left/')))[:-1]]
+ assert len(pairs)==306268, "incorrect parsing of pairs in TartanAir"
+ tosave = {'train': pairs}
+ return tosave
+
+class FlyingChairsDataset(FlowDataset):
+
+ def _prepare_data(self):
+ self.name = "FlyingChairs"
+ self._set_root()
+ assert self.split in ['train','val']
+ self.pairname_to_img1name = lambda pairname: osp.join(self.root, 'data', pairname+'_img1.ppm')
+ self.pairname_to_img2name = lambda pairname: osp.join(self.root, 'data', pairname+'_img2.ppm')
+ self.pairname_to_flowname = lambda pairname: osp.join(self.root, 'data', pairname+'_flow.flo')
+ self.pairname_to_str = lambda pairname: pairname
+ self.load_flow = _read_flo_file
+
+ def _build_cache(self):
+ split_file = osp.join(self.root, 'chairs_split.txt')
+ split_list = np.loadtxt(split_file, dtype=np.int32)
+ trainpairs = ['{:05d}'.format(i) for i in np.where(split_list==1)[0]+1]
+ valpairs = ['{:05d}'.format(i) for i in np.where(split_list==2)[0]+1]
+ assert len(trainpairs)==22232 and len(valpairs)==640, "incorrect parsing of pairs in MPI-Sintel"
+ tosave = {'train': trainpairs, 'val': valpairs}
+ return tosave
+
+class FlyingThingsDataset(FlowDataset):
+
+ def _prepare_data(self):
+ self.name = "FlyingThings"
+ self._set_root()
+ assert self.split in [f'{set_}_{pass_}pass{camstr}' for set_ in ['train','test','test1024'] for camstr in ['','_rightcam'] for pass_ in ['clean','final','all']]
+ self.pairname_to_img1name = lambda pairname: osp.join(self.root, f'frames_{pairname[3]}pass', pairname[0].replace('into_future','').replace('into_past',''), '{:04d}.png'.format(pairname[1]))
+ self.pairname_to_img2name = lambda pairname: osp.join(self.root, f'frames_{pairname[3]}pass', pairname[0].replace('into_future','').replace('into_past',''), '{:04d}.png'.format(pairname[2]))
+ self.pairname_to_flowname = lambda pairname: osp.join(self.root, 'optical_flow', pairname[0], 'OpticalFlowInto{f:s}_{i:04d}_{c:s}.pfm'.format(f='Future' if 'future' in pairname[0] else 'Past', i=pairname[1], c='L' if 'left' in pairname[0] else 'R' ))
+ self.pairname_to_str = lambda pairname: os.path.join(pairname[3]+'pass', pairname[0], 'Into{f:s}_{i:04d}_{c:s}'.format(f='Future' if 'future' in pairname[0] else 'Past', i=pairname[1], c='L' if 'left' in pairname[0] else 'R' ))
+ self.load_flow = _read_pfm_flow
+
+ def _build_cache(self):
+ tosave = {}
+ # train and test splits for the different passes
+ for set_ in ['train', 'test']:
+ sroot = osp.join(self.root, 'optical_flow', set_.upper())
+ fname_to_i = lambda f: int(f[len('OpticalFlowIntoFuture_'):-len('_L.pfm')])
+ pp = [(osp.join(set_.upper(), d, s, 'into_future/left'),fname_to_i(fname)) for d in sorted(os.listdir(sroot)) for s in sorted(os.listdir(osp.join(sroot,d))) for fname in sorted(os.listdir(osp.join(sroot,d, s, 'into_future/left')))[:-1]]
+ pairs = [(a,i,i+1) for a,i in pp]
+ pairs += [(a.replace('into_future','into_past'),i+1,i) for a,i in pp]
+ assert len(pairs)=={'train': 40302, 'test': 7866}[set_], "incorrect parsing of pairs Flying Things"
+ for cam in ['left','right']:
+ camstr = '' if cam=='left' else f'_{cam}cam'
+ for pass_ in ['final', 'clean']:
+ tosave[f'{set_}_{pass_}pass{camstr}'] = [(a.replace('left',cam),i,j,pass_) for a,i,j in pairs]
+ tosave[f'{set_}_allpass{camstr}'] = tosave[f'{set_}_cleanpass{camstr}'] + tosave[f'{set_}_finalpass{camstr}']
+ # test1024: this is the same split as unimatch 'validation' split
+ # see https://github.com/autonomousvision/unimatch/blob/master/dataloader/flow/datasets.py#L229
+ test1024_nsamples = 1024
+ alltest_nsamples = len(tosave['test_cleanpass']) # 7866
+ stride = alltest_nsamples // test1024_nsamples
+ remove = alltest_nsamples % test1024_nsamples
+ for cam in ['left','right']:
+ camstr = '' if cam=='left' else f'_{cam}cam'
+ for pass_ in ['final','clean']:
+ tosave[f'test1024_{pass_}pass{camstr}'] = sorted(tosave[f'test_{pass_}pass{camstr}'])[:-remove][::stride] # warning, it was not sorted before
+ assert len(tosave['test1024_cleanpass'])==1024, "incorrect parsing of pairs in Flying Things"
+ tosave[f'test1024_allpass{camstr}'] = tosave[f'test1024_cleanpass{camstr}'] + tosave[f'test1024_finalpass{camstr}']
+ return tosave
+
+
+class MPISintelDataset(FlowDataset):
+
+ def _prepare_data(self):
+ self.name = "MPISintel"
+ self._set_root()
+ assert self.split in [s+'_'+p for s in ['train','test','subval','subtrain'] for p in ['cleanpass','finalpass','allpass']]
+ self.pairname_to_img1name = lambda pairname: osp.join(self.root, pairname[0], 'frame_{:04d}.png'.format(pairname[1]))
+ self.pairname_to_img2name = lambda pairname: osp.join(self.root, pairname[0], 'frame_{:04d}.png'.format(pairname[1]+1))
+ self.pairname_to_flowname = lambda pairname: None if pairname[0].startswith('test/') else osp.join(self.root, pairname[0].replace('/clean/','/flow/').replace('/final/','/flow/'), 'frame_{:04d}.flo'.format(pairname[1]))
+ self.pairname_to_str = lambda pairname: osp.join(pairname[0], 'frame_{:04d}'.format(pairname[1]))
+ self.load_flow = _read_flo_file
+
+ def _build_cache(self):
+ trainseqs = sorted(os.listdir(self.root+'training/clean'))
+ trainpairs = [ (osp.join('training/clean', s),i) for s in trainseqs for i in range(1, len(os.listdir(self.root+'training/clean/'+s)))]
+ subvalseqs = ['temple_2','temple_3']
+ subtrainseqs = [s for s in trainseqs if s not in subvalseqs]
+ subvalpairs = [ (p,i) for p,i in trainpairs if any(s in p for s in subvalseqs)]
+ subtrainpairs = [ (p,i) for p,i in trainpairs if any(s in p for s in subtrainseqs)]
+ testseqs = sorted(os.listdir(self.root+'test/clean'))
+ testpairs = [ (osp.join('test/clean', s),i) for s in testseqs for i in range(1, len(os.listdir(self.root+'test/clean/'+s)))]
+ assert len(trainpairs)==1041 and len(testpairs)==552 and len(subvalpairs)==98 and len(subtrainpairs)==943, "incorrect parsing of pairs in MPI-Sintel"
+ tosave = {}
+ tosave['train_cleanpass'] = trainpairs
+ tosave['test_cleanpass'] = testpairs
+ tosave['subval_cleanpass'] = subvalpairs
+ tosave['subtrain_cleanpass'] = subtrainpairs
+ for t in ['train','test','subval','subtrain']:
+ tosave[t+'_finalpass'] = [(p.replace('/clean/','/final/'),i) for p,i in tosave[t+'_cleanpass']]
+ tosave[t+'_allpass'] = tosave[t+'_cleanpass'] + tosave[t+'_finalpass']
+ return tosave
+
+ def submission_save_pairname(self, pairname, prediction, outdir, _time):
+ assert prediction.shape[2]==2
+ outfile = os.path.join(outdir, 'submission', self.pairname_to_str(pairname)+'.flo')
+ os.makedirs( os.path.dirname(outfile), exist_ok=True)
+ writeFlowFile(prediction, outfile)
+
+ def finalize_submission(self, outdir):
+ assert self.split == 'test_allpass'
+ bundle_exe = "/nfs/data/ffs-3d/datasets/StereoFlow/MPI-Sintel/bundler/linux-x64/bundler" # eg