babe24 / transformers_4_35_0 /image_transforms.py

9231ab9 10 months ago

33.9 kB

	# coding=utf-8
	# Copyright 2022 The HuggingFace Inc. team.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	import warnings
	from typing import Iterable, List, Optional, Tuple, Union

	import numpy as np

	from .image_utils import (
	ChannelDimension,
	ImageInput,
	get_channel_dimension_axis,
	get_image_size,
	infer_channel_dimension_format,
	)
	from .utils import ExplicitEnum, TensorType, is_jax_tensor, is_tf_tensor, is_torch_tensor
	from .utils.import_utils import (
	is_flax_available,
	is_tf_available,
	is_torch_available,
	is_vision_available,
	requires_backends,
	)


	if is_vision_available():
	import PIL

	from .image_utils import PILImageResampling

	if is_torch_available():
	import torch

	if is_tf_available():
	import tensorflow as tf

	if is_flax_available():
	import jax.numpy as jnp


	def to_channel_dimension_format(
	    image: np.ndarray,
	    channel_dim: Union[ChannelDimension, str],
	    input_channel_dim: Optional[Union[ChannelDimension, str]] = None,
	) -> np.ndarray:
	    """
	    Rearranges the axes of `image` so that its channel axis matches the format requested by `channel_dim`.

	    Args:
	        image (`numpy.ndarray`):
	            The image whose channel axis should be moved.
	        channel_dim (`ChannelDimension` or `str`):
	            The channel dimension format the returned image must have.
	        input_channel_dim (`ChannelDimension` or `str`, optional):
	            The channel dimension format `image` currently has. Inferred from `image` when not provided.

	    Returns:
	        `np.ndarray`: `image` itself when it already is in the requested format, otherwise the result of
	        `image.transpose(...)`.

	    Raises:
	        ValueError: If `image` is not a numpy array, or if the requested format is neither channels-first nor
	            channels-last.
	    """
	    if not isinstance(image, np.ndarray):
	        raise ValueError(f"Input image must be of type np.ndarray, got {type(image)}")

	    source_format = infer_channel_dimension_format(image) if input_channel_dim is None else input_channel_dim
	    requested_format = ChannelDimension(channel_dim)

	    # Already in the requested layout: hand the very same array back.
	    if source_format == requested_format:
	        return image

	    # NOTE: both permutations below assume a 3-axis image that is currently in the opposite layout.
	    if requested_format == ChannelDimension.FIRST:
	        return image.transpose((2, 0, 1))
	    if requested_format == ChannelDimension.LAST:
	        return image.transpose((1, 2, 0))
	    raise ValueError("Unsupported channel dimension format: {}".format(channel_dim))


	def rescale(
	    image: np.ndarray,
	    scale: float,
	    data_format: Optional[ChannelDimension] = None,
	    dtype: np.dtype = np.float32,
	    input_data_format: Optional[Union[str, ChannelDimension]] = None,
	) -> np.ndarray:
	    """
	    Multiplies every value of `image` by `scale` and casts the result to `dtype`.

	    Args:
	        image (`np.ndarray`):
	            The image to rescale.
	        scale (`float`):
	            The factor each pixel value is multiplied by.
	        data_format (`ChannelDimension`, optional):
	            The channel dimension format of the output image. When not provided the layout of the input is kept.
	        dtype (`np.dtype`, optional, defaults to `np.float32`):
	            The dtype of the output image. Used for backwards compatibility with feature extractors.
	        input_data_format (`ChannelDimension`, optional):
	            The channel dimension format of the input image. Only used when `data_format` is given; inferred from
	            the image when not provided.

	    Returns:
	        `np.ndarray`: The rescaled image.

	    Raises:
	        ValueError: If `image` is not a numpy array.
	    """
	    if not isinstance(image, np.ndarray):
	        raise ValueError(f"Input image must be of type np.ndarray, got {type(image)}")

	    scaled = image * scale

	    # Without an explicit output layout there is nothing to transpose.
	    if data_format is None:
	        return scaled.astype(dtype)

	    return to_channel_dimension_format(scaled, data_format, input_data_format).astype(dtype)


	def _rescale_for_pil_conversion(image):
	    """
	    Decides whether `image` has to be multiplied by 255 before it can be stored as a `uint8` PIL image.

	    Decision rules, in order:
	    - `uint8` images are never rescaled.
	    - Images whose values are all (numerically close to) whole numbers are not rescaled, provided every value lies
	      in [0, 255].
	    - Any other image is rescaled, provided every value lies in [0, 1].

	    Returns:
	        `bool`: `True` when the image must be rescaled, `False` otherwise.

	    Raises:
	        ValueError: If the values fall outside the range allowed for the detected case.
	    """
	    if image.dtype == np.uint8:
	        return False

	    if np.allclose(image, image.astype(int)):
	        # Whole-number pixel values: they must already fit into a uint8.
	        if not (np.all(0 <= image) and np.all(image <= 255)):
	            raise ValueError(
	                "The image to be converted to a PIL image contains values outside the range [0, 255], "
	                f"got [{image.min()}, {image.max()}] which cannot be converted to uint8."
	            )
	        return False

	    # Fractional pixel values: only the unit interval can be mapped onto [0, 255].
	    if not (np.all(0 <= image) and np.all(image <= 1)):
	        raise ValueError(
	            "The image to be converted to a PIL image contains values outside the range [0, 1], "
	            f"got [{image.min()}, {image.max()}] which cannot be converted to uint8."
	        )
	    return True


	def to_pil_image(
	    image: Union[np.ndarray, "PIL.Image.Image", "torch.Tensor", "tf.Tensor", "jnp.ndarray"],
	    do_rescale: Optional[bool] = None,
	    input_data_format: Optional[Union[str, ChannelDimension]] = None,
	) -> "PIL.Image.Image":
	    """
	    Converts `image` to a PIL Image, moving the channel axis to the end and rescaling to [0, 255] when needed.

	    Args:
	        image (`PIL.Image.Image` or `numpy.ndarray` or `torch.Tensor` or `tf.Tensor` or `jnp.ndarray`):
	            The image to convert to the `PIL.Image` format. A PIL image is returned unchanged.
	        do_rescale (`bool`, optional):
	            Whether or not to multiply the pixel values by 255 before casting to `uint8`. When unset, the decision
	            is taken by `_rescale_for_pil_conversion`.
	        input_data_format (`ChannelDimension`, optional):
	            The channel dimension format of the input image. If unset, will use the inferred format from the input.

	    Returns:
	        `PIL.Image.Image`: The converted image.

	    Raises:
	        ValueError: If the type of `image` is not supported.
	    """
	    requires_backends(to_pil_image, ["vision"])

	    if isinstance(image, PIL.Image.Image):
	        return image

	    # Bring every supported tensor type down to a numpy array first.
	    if is_torch_tensor(image) or is_tf_tensor(image):
	        array = image.numpy()
	    elif is_jax_tensor(image):
	        array = np.array(image)
	    elif isinstance(image, np.ndarray):
	        array = image
	    else:
	        raise ValueError("Input image type not supported: {}".format(type(image)))

	    # If the channel axis has been moved to the first dim, put it back at the end.
	    array = to_channel_dimension_format(array, ChannelDimension.LAST, input_data_format)

	    # A trailing single-channel axis has to be squeezed away, otherwise PIL can't handle the array.
	    if array.shape[-1] == 1:
	        array = np.squeeze(array, axis=-1)

	    # The array is cast to uint8 below, so rescale it to [0, 255] first if needed.
	    if do_rescale is None:
	        do_rescale = _rescale_for_pil_conversion(array)
	    if do_rescale:
	        array = rescale(array, 255)

	    return PIL.Image.fromarray(array.astype(np.uint8))


	# Logic adapted from torchvision resizing logic: https://github.com/pytorch/vision/blob/511924c1ced4ce0461197e5caa64ce5b9e558aab/torchvision/transforms/functional.py#L366
	def get_resize_output_image_size(
	    input_image: np.ndarray,
	    size: Union[int, Tuple[int, int], List[int], Tuple[int]],
	    default_to_square: bool = True,
	    max_size: Optional[int] = None,
	    input_data_format: Optional[Union[str, ChannelDimension]] = None,
	) -> tuple:
	    """
	    Computes the (height, width) an image should be resized to, given the image and the requested `size`.

	    Args:
	        input_image (`np.ndarray`):
	            The image to resize.
	        size (`int` or `Tuple[int, int]` or `List[int]` or `Tuple[int]`):
	            The requested size. A two-element sequence `(h, w)` is returned as is. A one-element sequence is
	            treated exactly like an int.

	            If `size` is an int and `default_to_square` is `True`, the output is `(size, size)`. If `size` is an
	            int and `default_to_square` is `False`, the smaller edge of the image is matched to `size` and the
	            other edge is scaled to keep the aspect ratio, i.e. if height > width the output is
	            `(size * height / width, size)`.
	        default_to_square (`bool`, optional, defaults to `True`):
	            How to convert `size` when it is a single int. If set to `True`, `size` is converted to a square
	            (`size`, `size`). If set to `False`, will replicate
	            [`torchvision.transforms.Resize`](https://pytorch.org/vision/stable/transforms.html#torchvision.transforms.Resize)
	            with support for resizing only the smallest edge and providing an optional `max_size`.
	        max_size (`int`, optional):
	            Upper bound for the longer edge of the resized image: if the longer edge would exceed `max_size`, both
	            edges are scaled down so that the longer edge equals `max_size`. As a result the smaller edge may end
	            up shorter than `size`. Only used if `default_to_square` is `False`.
	        input_data_format (`ChannelDimension`, optional):
	            The channel dimension format of the input image. If unset, will use the inferred format from the input.

	    Returns:
	        `tuple`: The target (height, width) dimension of the output image after resizing.

	    Raises:
	        ValueError: If `size` is a sequence with neither 1 nor 2 elements, or if `max_size` is not strictly greater
	            than `size`.
	    """
	    if isinstance(size, (tuple, list)):
	        num_values = len(size)
	        if num_values == 2:
	            return tuple(size)
	        if num_values != 1:
	            raise ValueError("size must have 1 or 2 elements if it is a list or tuple")
	        # A single-element sequence follows the same logic as a plain int.
	        size = size[0]

	    if default_to_square:
	        return (size, size)

	    height, width = get_image_size(input_image, input_data_format)
	    width_is_short_edge = width <= height
	    short_edge, long_edge = (width, height) if width_is_short_edge else (height, width)

	    # Match the short edge to `size` and scale the long edge by the same ratio.
	    target_short = size
	    target_long = int(size * long_edge / short_edge)

	    if max_size is not None:
	        if max_size <= size:
	            raise ValueError(
	                f"max_size = {max_size} must be strictly greater than the requested "
	                f"size for the smaller edge size = {size}"
	            )
	        if target_long > max_size:
	            # Shrink both edges so that the long one lands exactly on `max_size`.
	            target_short = int(max_size * target_short / target_long)
	            target_long = max_size

	    # Map (short, long) back onto (height, width).
	    if width_is_short_edge:
	        return (target_long, target_short)
	    return (target_short, target_long)


	def resize(
	    image,
	    size: Tuple[int, int],
	    resample: "PILImageResampling" = None,
	    reducing_gap: Optional[int] = None,
	    data_format: Optional[ChannelDimension] = None,
	    return_numpy: bool = True,
	    input_data_format: Optional[Union[str, ChannelDimension]] = None,
	) -> np.ndarray:
	    """
	    Resizes `image` to the `(height, width)` given by `size`, delegating the actual resampling to PIL.

	    Args:
	        image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
	            The image to resize.
	        size (`Tuple[int, int]`):
	            The `(height, width)` of the output image.
	        resample (`int`, optional, defaults to `PILImageResampling.BILINEAR`):
	            The filter to use for resampling.
	        reducing_gap (`int`, optional):
	            Apply optimization by resizing the image in two steps. The bigger `reducing_gap`, the closer the result
	            to the fair resampling. See corresponding Pillow documentation for more details.
	        data_format (`ChannelDimension`, optional):
	            The channel dimension format of the output image. If unset, the format of the input image is used.
	        return_numpy (`bool`, optional, defaults to `True`):
	            Whether or not to return the resized image as a numpy array. If `False`, a `PIL.Image.Image` object is
	            returned.
	        input_data_format (`ChannelDimension`, optional):
	            The channel dimension format of the input image. If unset, will use the inferred format from the input.

	    Returns:
	        `np.ndarray`: The resized image (or a `PIL.Image.Image` when `return_numpy` is `False`).

	    Raises:
	        ValueError: If `size` does not have exactly 2 elements.
	    """
	    requires_backends(resize, ["vision"])

	    if resample is None:
	        resample = PILImageResampling.BILINEAR

	    if len(size) != 2:
	        raise ValueError("size must have 2 elements")

	    # Unless told otherwise the output keeps the layout of the input. PIL always yields channels-last, so the
	    # input layout has to be determined before the conversion.
	    if input_data_format is None:
	        input_data_format = infer_channel_dimension_format(image)
	    if data_format is None:
	        data_format = input_data_format

	    # For backwards compatibility with the previous image feature extractors the resizing is done with pillow
	    # and the result is converted back to numpy afterwards.
	    was_rescaled = False
	    if not isinstance(image, PIL.Image.Image):
	        was_rescaled = _rescale_for_pil_conversion(image)
	        image = to_pil_image(image, do_rescale=was_rescaled, input_data_format=input_data_format)

	    target_height, target_width = size
	    # PIL expects sizes as (width, height).
	    resized_image = image.resize((target_width, target_height), resample=resample, reducing_gap=reducing_gap)

	    if not return_numpy:
	        return resized_image

	    resized_array = np.array(resized_image)
	    # A single-channel axis is dropped when converting to PIL, so add it back if necessary.
	    if resized_array.ndim == 2:
	        resized_array = np.expand_dims(resized_array, axis=-1)
	    # Coming from PIL, the array is always channels-last at this point.
	    resized_array = to_channel_dimension_format(resized_array, data_format, input_channel_dim=ChannelDimension.LAST)
	    # Undo the [0, 255] rescaling that was applied for the PIL conversion.
	    if was_rescaled:
	        resized_array = rescale(resized_array, 1 / 255)
	    return resized_array


	def normalize(
	    image: np.ndarray,
	    mean: Union[float, Iterable[float]],
	    std: Union[float, Iterable[float]],
	    data_format: Optional[ChannelDimension] = None,
	    input_data_format: Optional[Union[str, ChannelDimension]] = None,
	) -> np.ndarray:
	    """
	    Normalizes `image` using the mean and standard deviation specified by `mean` and `std`.

	    image = (image - mean) / std

	    Args:
	        image (`np.ndarray`):
	            The image to normalize. Images with a non-floating dtype are converted to `np.float32` first;
	            floating images keep their dtype.
	        mean (`float` or `Iterable[float]`):
	            The mean to use for normalization. Either a single value applied to every channel or one value per
	            channel.
	        std (`float` or `Iterable[float]`):
	            The standard deviation to use for normalization. Either a single value applied to every channel or one
	            value per channel.
	        data_format (`ChannelDimension`, optional):
	            The channel dimension format of the output image. If unset, the format of the input image is kept.
	        input_data_format (`ChannelDimension`, optional):
	            The channel dimension format of the input image. If unset, will use the inferred format from the input.

	    Returns:
	        `np.ndarray`: The normalized image.

	    Raises:
	        ValueError: If `image` is not a numpy array, or if `mean` / `std` is an iterable whose length differs from
	            the number of channels.
	    """
	    if not isinstance(image, np.ndarray):
	        raise ValueError("image must be a numpy array")

	    if input_data_format is None:
	        input_data_format = infer_channel_dimension_format(image)
	    channel_axis = get_channel_dimension_axis(image, input_data_format=input_data_format)
	    num_channels = image.shape[channel_axis]

	    # `mean` and `std` are cast to the image dtype below. For an integer image that would truncate fractional
	    # statistics (e.g. std=0.229 -> 0, i.e. a division by zero) and the subtraction could wrap around for
	    # unsigned types, so promote such images to float32 first. Floating images keep their dtype so that e.g.
	    # float16 inputs are not upcast.
	    if not np.issubdtype(image.dtype, np.floating):
	        image = image.astype(np.float32)

	    if isinstance(mean, Iterable):
	        if len(mean) != num_channels:
	            raise ValueError(f"mean must have {num_channels} elements if it is an iterable, got {len(mean)}")
	    else:
	        mean = [mean] * num_channels
	    mean = np.array(mean, dtype=image.dtype)

	    if isinstance(std, Iterable):
	        if len(std) != num_channels:
	            raise ValueError(f"std must have {num_channels} elements if it is an iterable, got {len(std)}")
	    else:
	        std = [std] * num_channels
	    std = np.array(std, dtype=image.dtype)

	    if input_data_format == ChannelDimension.LAST:
	        image = (image - mean) / std
	    else:
	        # Transposing puts the channel axis last so that the per-channel vectors broadcast, then restores it.
	        image = ((image.T - mean) / std).T

	    image = to_channel_dimension_format(image, data_format, input_data_format) if data_format is not None else image
	    return image


	def center_crop(
	    image: np.ndarray,
	    size: Tuple[int, int],
	    data_format: Optional[Union[str, ChannelDimension]] = None,
	    input_data_format: Optional[Union[str, ChannelDimension]] = None,
	    return_numpy: Optional[bool] = None,
	) -> np.ndarray:
	    """
	    Crops the `image` to the specified `size` using a center crop. Note that if the image is too small to be cropped to
	    the size given, it will be padded with zeros (so the returned result will always be of size `size`).

	    Args:
	        image (`np.ndarray`):
	            The image to crop.
	        size (`Tuple[int, int]`):
	            The target (height, width) of the cropped image.
	        data_format (`str` or `ChannelDimension`, optional):
	            The channel dimension format for the output image. Can be one of:
	                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
	                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
	            If unset, will use the format of the input image.
	        input_data_format (`str` or `ChannelDimension`, optional):
	            The channel dimension format for the input image. Can be one of:
	                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
	                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
	            If unset, will use the inferred format of the input image.
	        return_numpy (`bool`, optional):
	            Deprecated; passing any value emits a `FutureWarning`. Used for backwards compatibility with the
	            previous ImageFeatureExtractionMixin method.
	                - Unset or `True`: will return a numpy array.
	                - `False`: will return a `PIL.Image.Image` object.

	    Returns:
	        `np.ndarray`: The cropped image (or a `PIL.Image.Image` when `return_numpy` is `False`).

	    Raises:
	        ValueError: If `image` is not a numpy array or `size` does not hold exactly two elements.
	    """
	    requires_backends(center_crop, ["vision"])

	    if return_numpy is not None:
	        warnings.warn("return_numpy is deprecated and will be removed in v.4.33", FutureWarning)

	    return_numpy = True if return_numpy is None else return_numpy

	    if not isinstance(image, np.ndarray):
	        raise ValueError(f"Input image must be of type np.ndarray, got {type(image)}")

	    if not isinstance(size, Iterable) or len(size) != 2:
	        raise ValueError("size must have 2 elements representing the height and width of the output image")

	    if input_data_format is None:
	        input_data_format = infer_channel_dimension_format(image)
	    output_data_format = data_format if data_format is not None else input_data_format

	    # We perform the crop in (C, H, W) format and then convert to the output format
	    image = to_channel_dimension_format(image, ChannelDimension.FIRST, input_data_format)

	    orig_height, orig_width = get_image_size(image, ChannelDimension.FIRST)
	    crop_height, crop_width = size
	    crop_height, crop_width = int(crop_height), int(crop_width)

	    # In case size is odd, (image_shape[0] + size[0]) // 2 won't give the proper result.
	    top = (orig_height - crop_height) // 2
	    bottom = top + crop_height
	    # In case size is odd, (image_shape[1] + size[1]) // 2 won't give the proper result.
	    left = (orig_width - crop_width) // 2
	    right = left + crop_width

	    # Check if cropped area is within image boundaries
	    if top >= 0 and bottom <= orig_height and left >= 0 and right <= orig_width:
	        image = image[..., top:bottom, left:right]
	        image = to_channel_dimension_format(image, output_data_format, ChannelDimension.FIRST)
	        # Honour `return_numpy=False` here as well, exactly like the padding path below does.
	        if not return_numpy:
	            image = to_pil_image(image)
	        return image

	    # Otherwise, the image is too small along at least one axis and has to be padded.
	    new_height = max(crop_height, orig_height)
	    new_width = max(crop_width, orig_width)
	    new_shape = image.shape[:-2] + (new_height, new_width)
	    new_image = np.zeros_like(image, shape=new_shape)

	    # Paste the original image into the middle of the zero-filled canvas
	    top_pad = (new_height - orig_height) // 2
	    bottom_pad = top_pad + orig_height
	    left_pad = (new_width - orig_width) // 2
	    right_pad = left_pad + orig_width
	    new_image[..., top_pad:bottom_pad, left_pad:right_pad] = image

	    # Express the crop window in the coordinates of the padded canvas
	    top += top_pad
	    bottom += top_pad
	    left += left_pad
	    right += left_pad

	    new_image = new_image[..., max(0, top) : min(new_height, bottom), max(0, left) : min(new_width, right)]
	    new_image = to_channel_dimension_format(new_image, output_data_format, ChannelDimension.FIRST)

	    if not return_numpy:
	        new_image = to_pil_image(new_image)

	    return new_image


	def _center_to_corners_format_torch(bboxes_center: "torch.Tensor") -> "torch.Tensor":
	    """
	    Torch implementation of `center_to_corners_format`: turns boxes stored along the last axis as
	    (center_x, center_y, width, height) into (top_left_x, top_left_y, bottom_right_x, bottom_right_y).
	    """
	    cx, cy, box_w, box_h = bboxes_center.unbind(-1)
	    half_w = 0.5 * box_w
	    half_h = 0.5 * box_h
	    # top left x, top left y, bottom right x, bottom right y
	    corner_coords = [cx - half_w, cy - half_h, cx + half_w, cy + half_h]
	    return torch.stack(corner_coords, dim=-1)


	def _center_to_corners_format_numpy(bboxes_center: np.ndarray) -> np.ndarray:
	    """
	    Numpy implementation of `center_to_corners_format`.

	    Args:
	        bboxes_center (`np.ndarray`):
	            Boxes of shape `(..., 4)` holding (center_x, center_y, width, height) along the last axis.

	    Returns:
	        `np.ndarray`: Array of the same shape holding (top_left_x, top_left_y, bottom_right_x, bottom_right_y)
	        along the last axis.
	    """
	    # Split along the LAST axis, like the torch and tf helpers do. Using `.T` here would reverse every axis and
	    # therefore permute the leading dimensions of inputs with more than two dimensions, e.g. (batch, num_boxes, 4).
	    center_x, center_y, width, height = np.moveaxis(bboxes_center, -1, 0)
	    bboxes_corners = np.stack(
	        # top left x, top left y, bottom right x, bottom right y
	        [center_x - 0.5 * width, center_y - 0.5 * height, center_x + 0.5 * width, center_y + 0.5 * height],
	        axis=-1,
	    )
	    return bboxes_corners


	def _center_to_corners_format_tf(bboxes_center: "tf.Tensor") -> "tf.Tensor":
	    """
	    TensorFlow implementation of `center_to_corners_format`: turns boxes stored along the last axis as
	    (center_x, center_y, width, height) into (top_left_x, top_left_y, bottom_right_x, bottom_right_y).
	    """
	    cx, cy, box_w, box_h = tf.unstack(bboxes_center, axis=-1)
	    half_w = 0.5 * box_w
	    half_h = 0.5 * box_h
	    # top left x, top left y, bottom right x, bottom right y
	    corner_coords = [cx - half_w, cy - half_h, cx + half_w, cy + half_h]
	    return tf.stack(corner_coords, axis=-1)


	# 2 functions below inspired by https://github.com/facebookresearch/detr/blob/master/util/box_ops.py
	def center_to_corners_format(bboxes_center: TensorType) -> TensorType:
	    """
	    Converts bounding boxes from center format to corners format.

	    center format: contains the coordinates of the center of the box and its width and height
	        (center_x, center_y, width, height)
	    corners format: contains the coordinates of the top-left and bottom-right corners of the box
	        (top_left_x, top_left_y, bottom_right_x, bottom_right_y)

	    Raises:
	        ValueError: If the input is neither a torch tensor, a numpy array nor a TensorFlow tensor.
	    """
	    # This function is used during the model forward pass, so the computation stays in the framework of the
	    # input instead of converting to numpy.
	    if is_torch_tensor(bboxes_center):
	        converter = _center_to_corners_format_torch
	    elif isinstance(bboxes_center, np.ndarray):
	        converter = _center_to_corners_format_numpy
	    elif is_tf_tensor(bboxes_center):
	        converter = _center_to_corners_format_tf
	    else:
	        raise ValueError(f"Unsupported input type {type(bboxes_center)}")
	    return converter(bboxes_center)


	def _corners_to_center_format_torch(bboxes_corners: "torch.Tensor") -> "torch.Tensor":
	    """
	    Torch implementation of `corners_to_center_format`: turns boxes stored along the last axis as
	    (top_left_x, top_left_y, bottom_right_x, bottom_right_y) into (center_x, center_y, width, height).
	    """
	    x_min, y_min, x_max, y_max = bboxes_corners.unbind(-1)
	    center_x = (x_min + x_max) / 2
	    center_y = (y_min + y_max) / 2
	    box_width = x_max - x_min
	    box_height = y_max - y_min
	    return torch.stack([center_x, center_y, box_width, box_height], dim=-1)


	def _corners_to_center_format_numpy(bboxes_corners: np.ndarray) -> np.ndarray:
	    """
	    Numpy implementation of `corners_to_center_format`.

	    Args:
	        bboxes_corners (`np.ndarray`):
	            Boxes of shape `(..., 4)` holding (top_left_x, top_left_y, bottom_right_x, bottom_right_y) along the
	            last axis.

	    Returns:
	        `np.ndarray`: Array of the same shape holding (center_x, center_y, width, height) along the last axis.
	    """
	    # Split along the LAST axis, like the torch and tf helpers do. Using `.T` here would reverse every axis and
	    # therefore permute the leading dimensions of inputs with more than two dimensions, e.g. (batch, num_boxes, 4).
	    top_left_x, top_left_y, bottom_right_x, bottom_right_y = np.moveaxis(bboxes_corners, -1, 0)
	    bboxes_center = np.stack(
	        [
	            (top_left_x + bottom_right_x) / 2,  # center x
	            (top_left_y + bottom_right_y) / 2,  # center y
	            (bottom_right_x - top_left_x),  # width
	            (bottom_right_y - top_left_y),  # height
	        ],
	        axis=-1,
	    )
	    return bboxes_center


	def _corners_to_center_format_tf(bboxes_corners: "tf.Tensor") -> "tf.Tensor":
	    """
	    TensorFlow implementation of `corners_to_center_format`: turns boxes stored along the last axis as
	    (top_left_x, top_left_y, bottom_right_x, bottom_right_y) into (center_x, center_y, width, height).
	    """
	    x_min, y_min, x_max, y_max = tf.unstack(bboxes_corners, axis=-1)
	    center_x = (x_min + x_max) / 2
	    center_y = (y_min + y_max) / 2
	    box_width = x_max - x_min
	    box_height = y_max - y_min
	    return tf.stack([center_x, center_y, box_width, box_height], axis=-1)


	def corners_to_center_format(bboxes_corners: TensorType) -> TensorType:
	    """
	    Converts bounding boxes from corners format to center format.

	    corners format: contains the coordinates of the top-left and bottom-right corners of the box
	        (top_left_x, top_left_y, bottom_right_x, bottom_right_y)
	    center format: contains the coordinates of the center of the box and its width and height
	        (center_x, center_y, width, height)

	    Raises:
	        ValueError: If the input is neither a torch tensor, a numpy array nor a TensorFlow tensor.
	    """
	    # The inverse function accepts several input types, so this one dispatches on the input type as well.
	    if is_torch_tensor(bboxes_corners):
	        converter = _corners_to_center_format_torch
	    elif isinstance(bboxes_corners, np.ndarray):
	        converter = _corners_to_center_format_numpy
	    elif is_tf_tensor(bboxes_corners):
	        converter = _corners_to_center_format_tf
	    else:
	        raise ValueError(f"Unsupported input type {type(bboxes_corners)}")
	    return converter(bboxes_corners)


	# 2 functions below copied from https://github.com/cocodataset/panopticapi/blob/master/panopticapi/utils.py
	# Copyright (c) 2018, Alexander Kirillov
	# All rights reserved.
	def rgb_to_id(color):
	    """
	    Packs an RGB color into a single integer ID computed as R + 256 * G + 256 * 256 * B.

	    A 3-dimensional numpy array is treated as a color map whose last axis holds the channels and yields an
	    array of IDs; any other input is indexed as a single `(R, G, B)` triple and yields a Python `int`.
	    """
	    is_color_map = isinstance(color, np.ndarray) and len(color.shape) == 3
	    if not is_color_map:
	        return int(color[0] + 256 * color[1] + 256 * 256 * color[2])

	    # uint8 cannot hold the packed value, so widen it before doing the arithmetic.
	    if color.dtype == np.uint8:
	        color = color.astype(np.int32)
	    red, green, blue = color[:, :, 0], color[:, :, 1], color[:, :, 2]
	    return red + 256 * green + 256 * 256 * blue


	def id_to_rgb(id_map):
	    """
	    Unpacks an ID into its RGB color by repeatedly taking the value modulo 256 and floor-dividing by 256.

	    A numpy array yields a `uint8` array with an extra trailing axis of size 3 (the input array is left
	    untouched); any other input yields a list with the three channel values.
	    """
	    if isinstance(id_map, np.ndarray):
	        # Work on a copy so that the in-place floor division does not modify the caller's array.
	        remaining = id_map.copy()
	        rgb_map = np.zeros((*id_map.shape, 3), dtype=np.uint8)
	        for channel in range(3):
	            rgb_map[..., channel] = remaining % 256
	            remaining //= 256
	        return rgb_map

	    channels = []
	    remaining = id_map
	    for _ in range(3):
	        channels.append(remaining % 256)
	        remaining //= 256
	    return channels


	class PaddingMode(ExplicitEnum):
	    """
	    Enum class for the different padding modes to use when padding images.

	    The members are consumed by `pad` in this module, which translates each of them into the corresponding
	    `np.pad` mode.
	    """

	    # Pad with a constant value (`np.pad` mode "constant").
	    CONSTANT = "constant"
	    # Pad with the reflection of the data mirrored on the first and last values (`np.pad` mode "reflect").
	    REFLECT = "reflect"
	    # Pad by repeating the edge values (`pad` maps this member to `np.pad` mode "edge").
	    REPLICATE = "replicate"
	    # Pad with the reflection of the data mirrored along the edge of the array (`np.pad` mode "symmetric").
	    SYMMETRIC = "symmetric"


	def pad(
	    image: np.ndarray,
	    padding: Union[int, Tuple[int, int], Iterable[Tuple[int, int]]],
	    mode: PaddingMode = PaddingMode.CONSTANT,
	    constant_values: Union[float, Iterable[float]] = 0.0,
	    data_format: Optional[Union[str, ChannelDimension]] = None,
	    input_data_format: Optional[Union[str, ChannelDimension]] = None,
	) -> np.ndarray:
	    """
	    Pads the `image` with the specified (height, width) `padding` and `mode`. The channel axis, and the batch axis
	    of a 4-dimensional input, are never padded.

	    Args:
	        image (`np.ndarray`):
	            The image to pad.
	        padding (`int` or `Tuple[int, int]` or `Iterable[Tuple[int, int]]`):
	            Padding to apply to the edges of the height, width axes. Can be one of these formats:
	            - `((before_height, after_height), (before_width, after_width))` unique pad widths for each axis.
	            - `((before, after),)` or `(before, after)` yields same before and after pad for height and width.
	            - `(pad,)` or int is a shortcut for before = after = pad width for all axes.
	        mode (`PaddingMode`):
	            The padding mode to use. Can be one of:
	                - `"constant"`: pads with a constant value.
	                - `"reflect"`: pads with the reflection of the vector mirrored on the first and last values of the
	                  vector along each axis.
	                - `"replicate"`: pads with the replication of the last value on the edge of the array along each axis.
	                - `"symmetric"`: pads with the reflection of the vector mirrored along the edge of the array.
	        constant_values (`float` or `Iterable[float]`, optional):
	            The value to use for the padding if `mode` is `"constant"`. Accepts the same formats as `padding`.
	        data_format (`str` or `ChannelDimension`, optional):
	            The channel dimension format for the output image. Can be one of:
	                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
	                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
	            If unset, will use same as the input image.
	        input_data_format (`str` or `ChannelDimension`, optional):
	            The channel dimension format for the input image. Can be one of:
	                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
	                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
	            If unset, will use the inferred format of the input image.

	    Returns:
	        `np.ndarray`: The padded image.

	    Raises:
	        ValueError: If `padding` / `constant_values` has an unsupported format or `mode` is not a valid padding
	            mode.
	    """
	    if input_data_format is None:
	        input_data_format = infer_channel_dimension_format(image)

	    def _expand_for_data_format(values):
	        """
	        Convert values to the per-axis `((before, after), ...)` structure expected by np.pad for this image.
	        """
	        if isinstance(values, (int, float)):
	            values = ((values, values), (values, values))
	        elif isinstance(values, tuple) and len(values) == 1:
	            # `(pad,)` -> same value everywhere; `((before, after),)` -> same pair for height and width.
	            inner = values[0]
	            pair = inner if isinstance(inner, tuple) else (inner, inner)
	            values = (pair, pair)
	        elif isinstance(values, tuple) and len(values) == 2 and isinstance(values[0], int):
	            values = (values, values)
	        elif isinstance(values, tuple) and len(values) == 2 and isinstance(values[0], tuple):
	            pass
	        else:
	            raise ValueError(f"Unsupported format: {values}")

	        # Add a (0, 0) entry for the channel axis. The (height, width) pairs must be unpacked next to it:
	        # np.pad expects one flat sequence with exactly one (before, after) pair per axis.
	        values = ((0, 0), *values) if input_data_format == ChannelDimension.FIRST else (*values, (0, 0))

	        # A batch axis, if present, also needs a full (before, after) pair rather than a bare 0.
	        values = ((0, 0), *values) if image.ndim == 4 else values
	        return values

	    padding = _expand_for_data_format(padding)

	    if mode == PaddingMode.CONSTANT:
	        constant_values = _expand_for_data_format(constant_values)
	        image = np.pad(image, padding, mode="constant", constant_values=constant_values)
	    elif mode == PaddingMode.REFLECT:
	        image = np.pad(image, padding, mode="reflect")
	    elif mode == PaddingMode.REPLICATE:
	        image = np.pad(image, padding, mode="edge")
	    elif mode == PaddingMode.SYMMETRIC:
	        image = np.pad(image, padding, mode="symmetric")
	    else:
	        raise ValueError(f"Invalid padding mode: {mode}")

	    image = to_channel_dimension_format(image, data_format, input_data_format) if data_format is not None else image
	    return image


	# TODO (Amy): Accept 1/3/4 channel numpy array as input and return np.array as default
	def convert_to_rgb(image: ImageInput) -> ImageInput:
	    """
	    Converts a `PIL.Image.Image` to RGB mode. Any input of another type is returned as is.

	    Args:
	        image (Image):
	            The image to convert.
	    """
	    requires_backends(convert_to_rgb, ["vision"])

	    if isinstance(image, PIL.Image.Image):
	        return image.convert("RGB")
	    return image


	def flip_channel_order(
	    image: np.ndarray,
	    data_format: Optional[ChannelDimension] = None,
	    input_data_format: Optional[Union[str, ChannelDimension]] = None,
	) -> np.ndarray:
	    """
	    Reverses the order of the channels of the image.

	    If the image is in RGB format, it will be converted to BGR and vice versa.

	    Args:
	        image (`np.ndarray`):
	            The image to flip.
	        data_format (`ChannelDimension`, optional):
	            The channel dimension format for the output image. Can be one of:
	                - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
	                - `ChannelDimension.LAST`: image in (height, width, num_channels) format.
	            If unset, will use same as the input image.
	        input_data_format (`ChannelDimension`, optional):
	            The channel dimension format for the input image. Can be one of:
	                - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
	                - `ChannelDimension.LAST`: image in (height, width, num_channels) format.
	            If unset, will use the inferred format of the input image.

	    Returns:
	        `np.ndarray`: The image with its channels in reversed order.

	    Raises:
	        ValueError: If the input format is neither channels-first nor channels-last.
	    """
	    if input_data_format is None:
	        input_data_format = infer_channel_dimension_format(image)

	    if input_data_format == ChannelDimension.LAST:
	        # Channels live on the last axis: reverse that one.
	        flipped = image[..., ::-1]
	    elif input_data_format == ChannelDimension.FIRST:
	        # Channels live on the first axis: reverse that one.
	        flipped = image[::-1, ...]
	    else:
	        raise ValueError(f"Unsupported channel dimension: {input_data_format}")

	    if data_format is None:
	        return flipped
	    return to_channel_dimension_format(flipped, data_format, input_channel_dim=input_data_format)