Upload processor

Browse files

Files changed (6) hide show

image_processing_custom_clip.py +94 -0
preprocessor_config.json +31 -0
special_tokens_map.json +27 -0
spiece.model +3 -0
tokenization_custom_clip.py +74 -0
tokenizer_config.json +83 -0

image_processing_custom_clip.py ADDED Viewed

	@@ -0,0 +1,94 @@

+"""
+Use torchvision instead of transformers to perform resize and center crop.
+This is because transformers' version is sometimes 1-pixel off.
+For example, if the image size is 640x480, both results are consistent.
+(e.g., "http://images.cocodataset.org/val2017/000000039769.jpg")
+However, if the image size is 500x334, the following happens:
+(e.g., "http://images.cocodataset.org/val2014/COCO_val2014_000000324158.jpg")
+>>> # Results' shape: (h, w)
+>>> torch.allclose(torchvision_result[:, :-1], transformers_result[:, 1:])
+True
+Note that if only resize is performed with torchvision,
+the inconsistency remains.
+Therefore, center crop must also be done with torchvision.
+"""
+import PIL
+from torchvision.transforms import CenterCrop, InterpolationMode, Resize
+from transformers import AutoImageProcessor, CLIPImageProcessor
+from transformers.image_processing_utils import BatchFeature, get_size_dict
+from transformers.image_utils import ImageInput, PILImageResampling, make_list_of_images
+def PILImageResampling_to_InterpolationMode(
+    resample: PILImageResampling,
+) -> InterpolationMode:
+    return getattr(InterpolationMode, PILImageResampling(resample).name)
+class CustomCLIPImageProcessor(CLIPImageProcessor):
+    def preprocess(
+        self,
+        images: ImageInput,
+        do_resize: bool = None,
+        size: dict[str, int] = None,
+        resample: PILImageResampling = None,
+        do_center_crop: bool = None,
+        crop_size: int = None,
+        **kwargs,
+    ) -> PIL.Image.Image:
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        size = size if size is not None else self.size
+        resample = resample if resample is not None else self.resample
+        do_center_crop = (
+            do_center_crop if do_center_crop is not None else self.do_center_crop
+        )
+        crop_size = crop_size if crop_size is not None else self.crop_size
+        images = make_list_of_images(images)
+        if do_resize:
+            # TODO input_data_format is ignored
+            _size = get_size_dict(
+                size,
+                param_name="size",
+                default_to_square=getattr(self, "use_square_size", False),
+            )
+            if set(_size) == {"shortest_edge"}:
+                # Corresponds to `image_transform.transforms[0]`
+                resize = Resize(
+                    size=_size["shortest_edge"],
+                    interpolation=PILImageResampling_to_InterpolationMode(resample),
+                )
+                images = [resize(image) for image in images]
+                do_resize = False
+        if do_center_crop:
+            # TODO input_data_format is ignored
+            _crop_size = get_size_dict(
+                crop_size, param_name="crop_size", default_to_square=True
+            )
+            # Corresponds to `image_transform.transforms[1]`
+            center_crop = CenterCrop(
+                size=tuple(map(_crop_size.get, ["height", "width"]))
+            )
+            images = [center_crop(image) for image in images]
+            do_center_crop = False
+        return super().preprocess(
+            images=images,
+            do_resize=do_resize,
+            size=size,
+            resample=resample,
+            do_center_crop=do_center_crop,
+            crop_size=crop_size,
+            **kwargs,
+        )
+AutoImageProcessor.register("CustomCLIPImageProcessor", CustomCLIPImageProcessor)

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "auto_map": {
+    "AutoImageProcessor": "image_processing_custom_clip.CustomCLIPImageProcessor"
+  },
+  "crop_size": {
+    "height": 224,
+    "width": 224
+  },
+  "do_center_crop": true,
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.48145466,
+    0.4578275,
+    0.40821073
+  ],
+  "image_processor_type": "CustomCLIPImageProcessor",
+  "image_std": [
+    0.26862954,
+    0.26130258,
+    0.27577711
+  ],
+  "processor_class": "VisionTextDualEncoderProcessor",
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "shortest_edge": 224
+  }
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,27 @@

+{
+  "bos_token": "<s>",
+  "cls_token": "[CLS]",
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": "[MASK]",
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": "[SEP]",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

spiece.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b5cbdfa8aa7c54c8c5af85b78c309c54a5f2749a20468bf6f60eee007fe6fec1
+size 805634

tokenization_custom_clip.py ADDED Viewed

	@@ -0,0 +1,74 @@

+# coding=utf-8
+# Modified from rinna
+# https://github.com/rinnakk/japanese-clip/blob/master/src/japanese_clip/tokenizer.py
+# ################################## COPIED ##################################
+# Copyright 2022 rinna Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ################################## COPIED ##################################
+from typing import Union
+import torch
+from transformers import AutoTokenizer, T5Tokenizer
+class CustomCLIPTokenizer(T5Tokenizer):
+    model_input_names = ["input_ids", "attention_mask", "position_ids"]
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.do_lower_case = True  # due to some bug of tokenizer config loading
+    def __call__(
+        self,
+        texts: Union[str, list[str]],
+        tokenizer: T5Tokenizer = None,
+        max_seq_len: int = 77,
+        device: Union[str, torch.device] = (
+            "cuda" if torch.cuda.is_available() else "cpu"
+        ),
+        **kwargs,
+    ):
+        if isinstance(texts, str):
+            texts = [texts]
+        if tokenizer is None:
+            tokenizer = self
+            tokenizer_call = super().__call__
+        else:
+            tokenizer_call = tokenizer
+        inputs = tokenizer_call(
+            texts,
+            max_length=max_seq_len - 1,
+            padding="max_length",
+            truncation=True,
+            add_special_tokens=False,
+        )
+        # add cls token at first place
+        input_ids = [[tokenizer.cls_token_id] + ids for ids in inputs["input_ids"]]
+        attention_mask = [[1] + am for am in inputs["attention_mask"]]
+        position_ids = [list(range(0, len(input_ids[0])))] * len(texts)
+        input_ids = torch.tensor(input_ids, dtype=torch.long)
+        attention_mask = torch.tensor(attention_mask, dtype=torch.long)
+        position_ids = torch.tensor(position_ids, dtype=torch.long)
+        return {
+            "input_ids": input_ids.to(device),
+            "attention_mask": attention_mask.to(device),
+            "position_ids": position_ids.to(device),
+        }
+AutoTokenizer.register("CustomCLIPTokenizer", CustomCLIPTokenizer)

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,83 @@

+{
+  "add_prefix_space": true,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [],
+  "auto_map": {
+    "AutoTokenizer": [
+      "tokenization_custom_clip.CustomCLIPTokenizer",
+      null
+    ]
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_lower_case": true,
+  "eos_token": "</s>",
+  "extra_ids": 0,
+  "legacy": true,
+  "mask_token": "[MASK]",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "processor_class": "VisionTextDualEncoderProcessor",
+  "sep_token": "[SEP]",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "CustomCLIPTokenizer",
+  "unk_token": "<unk>"
+}