File size: 7,800 Bytes
b1b2476 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 |
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Image processor class for Idefics."""
from typing import Callable, Dict, List, Optional, Union
from PIL import Image
from ...image_processing_utils import BaseImageProcessor, BatchFeature
from ...image_transforms import resize, to_channel_dimension_format
from ...image_utils import (
ChannelDimension,
ImageInput,
PILImageResampling,
make_list_of_images,
to_numpy_array,
valid_images,
)
from ...utils import TensorType, is_torch_available
IDEFICS_STANDARD_MEAN = [0.48145466, 0.4578275, 0.40821073]
IDEFICS_STANDARD_STD = [0.26862954, 0.26130258, 0.27577711]
def convert_to_rgb(image):
# `image.convert("RGB")` would only work for .jpg images, as it creates a wrong background
# for transparent images. The call to `alpha_composite` handles this case
if image.mode == "RGB":
return image
image_rgba = image.convert("RGBA")
background = Image.new("RGBA", image_rgba.size, (255, 255, 255))
alpha_composite = Image.alpha_composite(background, image_rgba)
alpha_composite = alpha_composite.convert("RGB")
return alpha_composite
class IdeficsImageProcessor(BaseImageProcessor):
r"""
Constructs a Idefics image processor.
Args:
image_size (`int`, *optional*, defaults to 224):
Resize to image size
image_mean (`float` or `List[float]`, *optional*, defaults to `IDEFICS_STANDARD_MEAN`):
Mean to use if normalizing the image. This is a float or list of floats the length of the number of
channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can be
overridden by the `image_mean` parameter in the `preprocess` method.
image_std (`float` or `List[float]`, *optional*, defaults to `IDEFICS_STANDARD_STD`):
Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
Can be overridden by the `image_std` parameter in the `preprocess` method.
image_num_channels (`int`, *optional*, defaults to 3):
Number of image channels.
"""
model_input_names = ["pixel_values"]
def __init__(
self,
image_size: int = 224,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
image_num_channels: Optional[int] = 3,
**kwargs,
) -> None:
super().__init__(**kwargs)
self.image_size = image_size
self.image_num_channels = image_num_channels
self.image_mean = image_mean
self.image_std = image_std
def preprocess(
self,
images: ImageInput,
image_num_channels: Optional[int] = 3,
image_size: Optional[Dict[str, int]] = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
transform: Callable = None,
**kwargs,
) -> TensorType.PYTORCH:
"""
Preprocess a batch of images.
Args:
images (`ImageInput`):
A list of images to preprocess.
image_size (`int`, *optional*, defaults to `self.image_size`):
Resize to image size
image_num_channels (`int`, *optional*, defaults to `self.image_num_channels`):
Number of image channels.
image_mean (`float` or `List[float]`, *optional*, defaults to `IDEFICS_STANDARD_MEAN`):
Mean to use if normalizing the image. This is a float or list of floats the length of the number of
channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can
be overridden by the `image_mean` parameter in the `preprocess` method.
image_std (`float` or `List[float]`, *optional*, defaults to `IDEFICS_STANDARD_STD`):
Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess`
method. Can be overridden by the `image_std` parameter in the `preprocess` method.
transform (`Callable`, *optional*, defaults to `None`):
A custom transform function that accepts a single image can be passed for training. For example,
`torchvision.Compose` can be used to compose multiple transforms. If `None` - an inference mode is
assumed - and then a preset of inference-specific transforms will be applied to the images
Returns:
a PyTorch tensor of the processed images
"""
image_size = image_size if image_size is not None else self.image_size
image_num_channels = image_num_channels if image_num_channels is not None else self.image_num_channels
image_mean = image_mean if image_mean is not None else self.image_mean
image_std = image_std if image_std is not None else self.image_std
size = (image_size, image_size)
if isinstance(images, list) and len(images) == 0:
return []
images = make_list_of_images(images)
if not valid_images(images):
raise ValueError(
"Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
"torch.Tensor, tf.Tensor or jax.ndarray."
)
# For training a user needs to pass their own set of transforms as a Callable.
# For reference this is what was used in the original IDEFICS training:
# transform = transforms.Compose([
# convert_to_rgb,
# transforms.RandomResizedCrop((size, size), scale=(0.9, 1.0), interpolation=transforms.InterpolationMode.BICUBIC),
# transforms.ToTensor(),
# transforms.Normalize(mean=image_mean, std=image_std),
# ])
if transform is not None:
if not is_torch_available():
raise ImportError("To pass in `transform` torch must be installed")
import torch
images = [transform(x) for x in images]
return torch.stack(images)
# for inference we do the exact transforms that were used to train IDEFICS
images = [convert_to_rgb(x) for x in images]
# further transforms expect numpy arrays
images = [to_numpy_array(x) for x in images]
images = [resize(x, size, resample=PILImageResampling.BICUBIC) for x in images]
images = [self.rescale(image=image, scale=1 / 255) for image in images]
images = [self.normalize(x, mean=image_mean, std=image_std) for x in images]
images = [to_channel_dimension_format(x, ChannelDimension.FIRST) for x in images]
# TODO: this converts to torch tensors - switch to convert_to_tensors once it becomes available
images = BatchFeature(data={"pixel_values": images}, tensor_type=TensorType.PYTORCH)["pixel_values"]
return images |