bsyx001 committed on
Commit: 5af068e
1 Parent(s): 8b26dd7

Upload processor

image_processing_custom_clip.py ADDED
@@ -0,0 +1,94 @@
+ """
+ Use torchvision instead of transformers to perform resize and center crop.
+
+ This is because transformers' version is sometimes 1-pixel off.
+
+ For example, if the image size is 640x480, both results are consistent.
+ (e.g., "http://images.cocodataset.org/val2017/000000039769.jpg")
+
+ However, if the image size is 500x334, the following happens:
+ (e.g., "http://images.cocodataset.org/val2014/COCO_val2014_000000324158.jpg")
+
+ >>> # Results' shape: (h, w)
+ >>> torch.allclose(torchvision_result[:, :-1], transformers_result[:, 1:])
+ ... True
+
+ Note that if only resize is performed with torchvision,
+ the inconsistency remains.
+ Therefore, center crop must also be done with torchvision.
+ """
+
+ import PIL
+ from torchvision.transforms import CenterCrop, InterpolationMode, Resize
+ from transformers import AutoImageProcessor, CLIPImageProcessor
+ from transformers.image_processing_utils import get_size_dict
+ from transformers.image_utils import ImageInput, PILImageResampling, make_list_of_images
+
+
+ def PILImageResampling_to_InterpolationMode(
+     resample: PILImageResampling,
+ ) -> InterpolationMode:
+     return getattr(InterpolationMode, PILImageResampling(resample).name)
+
+
+ class CustomCLIPImageProcessor(CLIPImageProcessor):
+     def preprocess(
+         self,
+         images: ImageInput,
+         do_resize: bool = None,
+         size: dict[str, int] = None,
+         resample: PILImageResampling = None,
+         do_center_crop: bool = None,
+         crop_size: int = None,
+         **kwargs,
+     ) -> PIL.Image.Image:
+         do_resize = do_resize if do_resize is not None else self.do_resize
+         size = size if size is not None else self.size
+         resample = resample if resample is not None else self.resample
+         do_center_crop = (
+             do_center_crop if do_center_crop is not None else self.do_center_crop
+         )
+         crop_size = crop_size if crop_size is not None else self.crop_size
+
+         images = make_list_of_images(images)
+
+         if do_resize:
+             # TODO input_data_format is ignored
+             _size = get_size_dict(
+                 size,
+                 param_name="size",
+                 default_to_square=getattr(self, "use_square_size", False),
+             )
+             if set(_size) == {"shortest_edge"}:
+                 # Corresponds to `image_transform.transforms[0]`
+                 resize = Resize(
+                     size=_size["shortest_edge"],
+                     interpolation=PILImageResampling_to_InterpolationMode(resample),
+                 )
+                 images = [resize(image) for image in images]
+                 do_resize = False
+
+         if do_center_crop:
+             # TODO input_data_format is ignored
+             _crop_size = get_size_dict(
+                 crop_size, param_name="crop_size", default_to_square=True
+             )
+             # Corresponds to `image_transform.transforms[1]`
+             center_crop = CenterCrop(
+                 size=tuple(map(_crop_size.get, ["height", "width"]))
+             )
+             images = [center_crop(image) for image in images]
+             do_center_crop = False
+
+         return super().preprocess(
+             images=images,
+             do_resize=do_resize,
+             size=size,
+             resample=resample,
+             do_center_crop=do_center_crop,
+             crop_size=crop_size,
+             **kwargs,
+         )
+
+
+ AutoImageProcessor.register("CustomCLIPImageProcessor", CustomCLIPImageProcessor)
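
The off-by-one behaviour described in the module docstring can be reproduced with a short check along the following lines. This is only a sketch, not part of the uploaded files: it assumes the stock CLIP settings (shortest edge 224, bicubic resampling, 224x224 center crop) and network access to the COCO image, and the tolerance may need adjusting for other setups.

import requests
import torch
from PIL import Image
from torchvision.transforms import CenterCrop, InterpolationMode, Resize, ToTensor
from transformers import CLIPImageProcessor

# 500x334 image from the docstring, for which the two crops end up one pixel apart
url = "http://images.cocodataset.org/val2014/COCO_val2014_000000324158.jpg"
image = Image.open(requests.get(url, stream=True).raw).convert("RGB")

# torchvision pipeline: resize the shortest edge to 224, then center-crop 224x224
torchvision_result = ToTensor()(
    CenterCrop(224)(Resize(224, interpolation=InterpolationMode.BICUBIC)(image))
)

# transformers pipeline: stock CLIP preprocessing, normalization disabled
# so both results stay in [0, 1] and are directly comparable
processor = CLIPImageProcessor(do_normalize=False)
transformers_result = torch.tensor(processor(image).pixel_values[0])

# Identical up to a one-pixel shift along the width, as the docstring states
print(torch.allclose(torchvision_result[..., :-1], transformers_result[..., 1:], atol=1e-6))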
preprocessor_config.json ADDED
@@ -0,0 +1,31 @@
+ {
+   "auto_map": {
+     "AutoImageProcessor": "image_processing_custom_clip.CustomCLIPImageProcessor"
+   },
+   "crop_size": {
+     "height": 224,
+     "width": 224
+   },
+   "do_center_crop": true,
+   "do_convert_rgb": true,
+   "do_normalize": true,
+   "do_rescale": true,
+   "do_resize": true,
+   "image_mean": [
+     0.48145466,
+     0.4578275,
+     0.40821073
+   ],
+   "image_processor_type": "CustomCLIPImageProcessor",
+   "image_std": [
+     0.26862954,
+     0.26130258,
+     0.27577711
+   ],
+   "processor_class": "VisionTextDualEncoderProcessor",
+   "resample": 3,
+   "rescale_factor": 0.00392156862745098,
+   "size": {
+     "shortest_edge": 224
+   }
+ }
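
The "auto_map" entry above is what lets the Auto classes resolve "image_processing_custom_clip.CustomCLIPImageProcessor" without the module being imported by hand. A minimal loading sketch, assuming these files live in a Hub repo whose id is written here as the placeholder "<repo-id>":

from transformers import AutoImageProcessor

# trust_remote_code=True is required so that image_processing_custom_clip.py,
# referenced by "auto_map", is downloaded and executed
processor = AutoImageProcessor.from_pretrained("<repo-id>", trust_remote_code=True)
print(type(processor).__name__)  # expected: CustomCLIPImageProcessor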
special_tokens_map.json ADDED
@@ -0,0 +1,27 @@
+ {
+   "bos_token": "<s>",
+   "cls_token": "[CLS]",
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": "[MASK]",
+   "pad_token": {
+     "content": "[PAD]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": "[SEP]",
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
spiece.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b5cbdfa8aa7c54c8c5af85b78c309c54a5f2749a20468bf6f60eee007fe6fec1
+ size 805634
tokenization_custom_clip.py ADDED
@@ -0,0 +1,74 @@
+ # coding=utf-8
+
+ # Modified from rinna
+ # https://github.com/rinnakk/japanese-clip/blob/master/src/japanese_clip/tokenizer.py
+
+ # ################################## COPIED ##################################
+ # Copyright 2022 rinna Co., Ltd.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # ################################## COPIED ##################################
+
+ from typing import Union
+
+ import torch
+ from transformers import AutoTokenizer, T5Tokenizer
+
+
+ class CustomCLIPTokenizer(T5Tokenizer):
+     model_input_names = ["input_ids", "attention_mask", "position_ids"]
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+         self.do_lower_case = True  # due to some bug of tokenizer config loading
+
+     def __call__(
+         self,
+         texts: Union[str, list[str]],
+         tokenizer: T5Tokenizer = None,
+         max_seq_len: int = 77,
+         device: Union[str, torch.device] = (
+             "cuda" if torch.cuda.is_available() else "cpu"
+         ),
+         **kwargs,
+     ):
+         if isinstance(texts, str):
+             texts = [texts]
+         if tokenizer is None:
+             tokenizer = self
+             tokenizer_call = super().__call__
+         else:
+             tokenizer_call = tokenizer
+         inputs = tokenizer_call(
+             texts,
+             max_length=max_seq_len - 1,
+             padding="max_length",
+             truncation=True,
+             add_special_tokens=False,
+         )
+         # add cls token at first place
+         input_ids = [[tokenizer.cls_token_id] + ids for ids in inputs["input_ids"]]
+         attention_mask = [[1] + am for am in inputs["attention_mask"]]
+         position_ids = [list(range(0, len(input_ids[0])))] * len(texts)
+
+         input_ids = torch.tensor(input_ids, dtype=torch.long)
+         attention_mask = torch.tensor(attention_mask, dtype=torch.long)
+         position_ids = torch.tensor(position_ids, dtype=torch.long)
+         return {
+             "input_ids": input_ids.to(device),
+             "attention_mask": attention_mask.to(device),
+             "position_ids": position_ids.to(device),
+         }
+
+
+ AutoTokenizer.register("CustomCLIPTokenizer", CustomCLIPTokenizer)
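
The custom __call__ pads or truncates to max_seq_len - 1 without special tokens, prepends the [CLS] id, and returns input_ids, attention_mask and position_ids as tensors on the chosen device; the "auto_map" entry in tokenizer_config.json below wires this class to AutoTokenizer. A usage sketch, again with "<repo-id>" as a placeholder repo id:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("<repo-id>", trust_remote_code=True)

# Two texts, padded/truncated to 76 sentencepiece tokens plus a leading [CLS]
batch = tokenizer(["犬と猫", "a dog and a cat"], max_seq_len=77, device="cpu")
print(batch["input_ids"].shape)       # torch.Size([2, 77])
print(batch["attention_mask"].shape)  # torch.Size([2, 77])
print(batch["position_ids"][0, :5])   # tensor([0, 1, 2, 3, 4])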
tokenizer_config.json ADDED
@@ -0,0 +1,83 @@
+ {
+   "add_prefix_space": true,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "4": {
+       "content": "[CLS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "5": {
+       "content": "[SEP]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "6": {
+       "content": "[MASK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "additional_special_tokens": [],
+   "auto_map": {
+     "AutoTokenizer": [
+       "tokenization_custom_clip.CustomCLIPTokenizer",
+       null
+     ]
+   },
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "[CLS]",
+   "do_lower_case": true,
+   "eos_token": "</s>",
+   "extra_ids": 0,
+   "legacy": true,
+   "mask_token": "[MASK]",
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "[PAD]",
+   "processor_class": "VisionTextDualEncoderProcessor",
+   "sep_token": "[SEP]",
+   "sp_model_kwargs": {},
+   "tokenizer_class": "CustomCLIPTokenizer",
+   "unk_token": "<unk>"
+ }
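
Both configs set "processor_class" to VisionTextDualEncoderProcessor, so the image processor and tokenizer can also be bundled into a single processor object. A sketch, with the same "<repo-id>" placeholder:

from transformers import AutoImageProcessor, AutoTokenizer, VisionTextDualEncoderProcessor

image_processor = AutoImageProcessor.from_pretrained("<repo-id>", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("<repo-id>", trust_remote_code=True)
processor = VisionTextDualEncoderProcessor(image_processor=image_processor, tokenizer=tokenizer)

Text passed through the combined processor is tokenized by the custom __call__ above, so it follows that method's max_seq_len and device arguments rather than the standard tokenizer keyword arguments.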