import os
from typing import Dict, Iterable, List, Optional, Tuple, Union

import numpy as np
import torch
from PIL import Image
from torchvision.transforms import (
    Compose,
    Normalize,
    RandomResizedCrop,
    Resize,
    ToPILImage,
    ToTensor,
)

from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
# `pad` added: ImageProcessor.pad() delegates simple (int / per-axis) padding
# specs to transformers.image_transforms.pad.
from transformers.image_transforms import PaddingMode, pad, to_channel_dimension_format
from transformers.image_utils import (
    ChannelDimension,
    get_image_size,
    infer_channel_dimension_format,
    make_list_of_images,
    to_numpy_array,
)
from transformers.utils import (
    TensorType,
    is_torch_device,
    is_torch_dtype,
    requires_backends,
)

try:
    from torchvision.transforms import InterpolationMode

    BICUBIC = InterpolationMode.BICUBIC
except ImportError:
    BICUBIC = Image.BICUBIC

# Set PROCESSOR_FOR_VLLM=1 to make anyres preprocessing emit padded batches for vLLM.
processor_for_vllm = int(os.getenv("PROCESSOR_FOR_VLLM", 0))
| |
|
def select_best_resolution(original_size, possible_resolutions):
    """
    Selects the best resolution from a list of possible resolutions based on the original size.

    The winner maximizes the effective resolution (pixels of the original that
    survive a fit-inside downscale, capped at the original pixel count) and,
    on ties, minimizes wasted canvas area.

    Args:
        original_size (tuple): The original size of the image in the format (width, height).
        possible_resolutions (list): A list of possible resolutions in the format
            [(width1, height1), (width2, height2), ...].

    Returns:
        tuple: The best fit resolution in the format (width, height).
    """
    orig_w, orig_h = original_size
    best = None
    best_effective = 0
    best_wasted = float("inf")

    for cand_w, cand_h in possible_resolutions:
        # Scale factor to fit the original entirely inside the candidate.
        ratio = min(cand_w / orig_w, cand_h / orig_h)
        fit_w = int(orig_w * ratio)
        fit_h = int(orig_h * ratio)

        effective = min(fit_w * fit_h, orig_w * orig_h)
        wasted = cand_w * cand_h - effective

        is_better = effective > best_effective or (
            effective == best_effective and wasted < best_wasted
        )
        if is_better:
            best_effective = effective
            best_wasted = wasted
            best = (cand_w, cand_h)

    return best
| |
|
def divide_to_patches(image, patch_size):
    """
    Divides an image into patches of a specified size.

    Patches are produced row-major (left-to-right, top-to-bottom); edge
    patches may extend past the image bounds, matching PIL crop semantics.

    Args:
        image (PIL.Image.Image): The input image.
        patch_size (int): The size of each patch.

    Returns:
        list: A list of PIL.Image.Image objects representing the patches.
    """
    width, height = image.size
    boxes = [
        (left, top, left + patch_size, top + patch_size)
        for top in range(0, height, patch_size)
        for left in range(0, width, patch_size)
    ]
    return [image.crop(box) for box in boxes]
| |
|
def image_size_to_num_patches(image_size, grid_pinpoints, patch_size):
    """Return the number of model inputs produced for an image in anyres mode.

    That is: the number of patch_size-strided tiles covering the best-fit
    resolution, plus 1 for the base (globally resized) image.

    Args:
        image_size: (width, height) of the image as list/tuple, or a
            torch.Tensor / np.ndarray convertible via ``.tolist()``.
        grid_pinpoints (list): candidate resolutions for ``select_best_resolution``.
        patch_size (int): tile edge length.

    Returns:
        int: tile count + 1.

    Raises:
        TypeError: if ``grid_pinpoints`` is not a list, or ``image_size`` has
            an unsupported type.
    """
    if not isinstance(grid_pinpoints, list):
        raise TypeError("grid_pinpoints should be a list of tuples or lists")

    if not isinstance(image_size, (list, tuple)):
        if not isinstance(image_size, (torch.Tensor, np.ndarray)):
            raise TypeError(f"image_size invalid type {type(image_size)} with value {image_size}")
        image_size = image_size.tolist()

    best_resolution = select_best_resolution(image_size, grid_pinpoints)
    width, height = best_resolution
    # Closed-form replacement for the original nested counting loops:
    # range(0, n, patch_size) iterates ceil(n / patch_size) times.
    patches_per_row = -(-width // patch_size)
    patches_per_col = -(-height // patch_size)
    # +1 for the base image that accompanies the grid patches.
    return patches_per_col * patches_per_row + 1
| |
|
def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
    """
    Calculate the shape of the image patch grid after the preprocessing for images of any resolution.

    Args:
        image_size (`tuple`):
            The size of the input image in the format (width, height).
        grid_pinpoints (`List`):
            A list containing possible resolutions; entries are unpacked as
            (width, height) by ``select_best_resolution``.
        patch_size (`int`):
            The size of each image patch.

    Returns:
        tuple: The shape of the image patch grid in the format (width, height).
    """
    if not isinstance(grid_pinpoints, list):
        raise TypeError("grid_pinpoints should be a list of tuples or lists")

    # Normalize tensor / ndarray sizes to a plain Python list.
    if not isinstance(image_size, (list, tuple)):
        if not isinstance(image_size, (torch.Tensor, np.ndarray)):
            raise TypeError(
                f"image_size invalid type: {type(image_size)} not valid, should be either list, tuple, np.ndarray or tensor"
            )
        image_size = image_size.tolist()

    best_width, best_height = select_best_resolution(image_size, grid_pinpoints)
    grid_w = best_width // patch_size
    grid_h = best_height // patch_size
    return grid_w, grid_h
| |
|
| |
|
| | |
class KeeyRatioResize(object):
    """Callable transform wrapping :func:`keepratio_resize`.

    Stores a fixed target size and letterbox-resizes any image passed to it.
    (Class name typo is preserved for backward compatibility.)
    """

    def __init__(self, size):
        # Target size, forwarded unchanged to keepratio_resize.
        self.size = size

    def __call__(self, image):
        return keepratio_resize(image, self.size)
| |
|
def keepratio_resize(image, size, return_scale=False):
    """Letterbox *image* into *size*: scale to fit while preserving the
    aspect ratio, then center the result on a black RGB canvas.

    Args:
        image (PIL.Image.Image): source image.
        size (tuple): target (width, height).
        return_scale (bool): if True, also return the applied scale factor.

    Returns:
        PIL.Image.Image, or (PIL.Image.Image, float) when return_scale is True.
    """
    src_w, src_h = image.size
    dst_w, dst_h = size
    canvas = Image.new('RGB', (dst_w, dst_h), (0, 0, 0))

    if src_w / src_h > dst_w / dst_h:
        # Source is wider than the target: use the full width, pad top/bottom.
        fit_h = int(dst_w * src_h / src_w)
        scaled = image.resize((dst_w, fit_h), Image.BICUBIC)
        canvas.paste(scaled, (0, (dst_h - fit_h) // 2))
        scale = dst_w / src_w
    else:
        # Source is taller (or matches): use the full height, pad left/right.
        fit_w = int(dst_h * src_w / src_h)
        scaled = image.resize((fit_w, dst_h), Image.BICUBIC)
        canvas.paste(scaled, ((dst_w - fit_w) // 2, 0))
        scale = dst_h / src_h

    if return_scale:
        return canvas, scale
    return canvas
| |
|
| | def _convert_image_to_rgb(image): |
| | return image.convert("RGB") |
| |
|
def _transform(img_h, img_w, image_mean=(0.48145466, 0.4578275, 0.40821073), image_std=(0.26862954, 0.26130258, 0.27577711)):
    """Build the RGB-convert -> to-tensor -> normalize pipeline.

    NOTE(review): ``img_h`` / ``img_w`` are currently unused (the pipeline
    contains no resize/crop step) -- kept only for caller compatibility.
    """
    steps = [
        _convert_image_to_rgb,
        ToTensor(),
        Normalize(image_mean, image_std),
    ]
    return Compose(steps)
| |
|
| |
|
def get_hw_multiple_of(image_size, multiple, max_size=None):
    """Round (w, h) up to the nearest multiples of *multiple*, optionally
    shrinking first (aspect-preserving) so the result fits within *max_size*.

    Args:
        image_size (tuple): (width, height) of the source image.
        multiple (int): both output dimensions become multiples of this.
        max_size (tuple, optional): (max_w, max_h) upper bound; each bound
            must itself be a multiple of *multiple*.

    Returns:
        tuple: (new_w, new_h), each a multiple of *multiple* and, when
        max_size is given, no larger than the corresponding bound.
    """
    def _round_up(value):
        # Smallest multiple of `multiple` that is >= value.
        return value if value % multiple == 0 else value + (multiple - value % multiple)

    w, h = image_size
    new_w = _round_up(w)
    new_h = _round_up(h)
    if max_size is not None:
        assert isinstance(max_size, (list, tuple)) and len(max_size) == 2
        max_w, max_h = max_size
        assert max_w % multiple == 0 and max_h % multiple == 0
        if new_w > max_w or new_h > max_h:
            # Shrink to fit inside (max_w, max_h) while preserving the aspect
            # ratio. BUGFIX: the previous code clamped new_w first and then
            # used the clamped value when computing new_h, which broke the
            # ratio (e.g. (1000, 500) with max (336, 672) yielded (336, 504)
            # instead of (336, 168)).
            scale = min(max_w / new_w, max_h / new_h)
            new_w = int(new_w * scale)
            new_h = int(new_h * scale)
        # Re-align to the grid; since max_w/max_h are multiples of `multiple`
        # and the scaled values are <= the bounds, rounding up cannot exceed them.
        new_w = _round_up(new_w)
        new_h = _round_up(new_h)
        assert new_w % multiple == 0 and new_h % multiple == 0
        assert new_w <= max_w and new_h <= max_h
    return new_w, new_h
| |
|
def resize_multiple_of(image, multiple, max_size=None):
    """
    Resize the image so both sides are multiples of a number (bicubic).

    Args:
        image (PIL.Image.Image): The input image.
        multiple (int): The number to which the image should be resized.
        max_size (tuple, optional): forwarded to ``get_hw_multiple_of`` as
            the (max_w, max_h) bound on the output size.

    Returns:
        PIL.Image.Image: The resized image.
    """
    target = get_hw_multiple_of(image.size, multiple, max_size)
    return image.resize(target, Image.BICUBIC)
| |
|
| |
|
| |
|
class CustomBatchFeature(BatchFeature):
    """BatchFeature variant that treats ``"pixel_values"`` as a *list* of
    per-image arrays/tensors (entries may have different shapes), both when
    converting to tensors and when moving to a device/dtype.
    """

    def convert_to_tensors(self, tensor_type: Optional[Union[str, TensorType]] = None):
        """
        Convert the inner content to tensors.

        Args:
            tensor_type (`str` or [`~utils.TensorType`], *optional*):
                The type of tensors to use. If `str`, should be one of the values of the enum [`~utils.TensorType`]. If
                `None`, no modification is done.
        """
        if tensor_type is None:
            return self

        is_tensor, as_tensor = self._get_is_as_tensor_fns(tensor_type)

        for key, value in self.items():
            if key == "pixel_values":
                # Convert each image individually; entries may differ in
                # shape, so they cannot be stacked into one tensor here.
                for i, image in enumerate(value):
                    if not is_tensor(image):
                        tensor = as_tensor(image)
                        self[key][i] = tensor
                continue
            try:
                if not is_tensor(value):
                    tensor = as_tensor(value)

                    self[key] = tensor
            # BUGFIX: was a bare `except:` (also caught KeyboardInterrupt /
            # SystemExit); narrowed and the cause is now chained.
            except Exception as e:
                if key == "overflowing_values":
                    raise ValueError("Unable to create tensor returning overflowing values of different lengths. ") from e
                raise ValueError(
                    "Unable to create tensor, you should probably activate padding "
                    "with 'padding=True' to have batched tensors with the same length."
                ) from e

        return self

    def to(self, *args, **kwargs) -> "BatchFeature":
        """
        Send all values to device by calling `v.to(*args, **kwargs)` (PyTorch only). This should support casting in
        different `dtypes` and sending the `BatchFeature` to a different `device`.

        Args:
            args (`Tuple`):
                Will be passed to the `to(...)` function of the tensors.
            kwargs (`Dict`, *optional*):
                Will be passed to the `to(...)` function of the tensors.

        Returns:
            [`BatchFeature`]: The same instance after modification.
        """
        requires_backends(self, ["torch"])
        import torch

        new_data = {}
        device = kwargs.get("device")
        # Determine whether the first positional argument is a dtype or a device.
        if device is None and len(args) > 0:
            arg = args[0]
            if is_torch_dtype(arg):
                # The first argument is a dtype; handled by v.to(*args) below.
                pass
            elif isinstance(arg, str) or is_torch_device(arg) or isinstance(arg, int):
                device = arg
            else:
                # Anything else is unsupported.
                raise ValueError(f"Attempting to cast a BatchFeature to type {str(arg)}. This is not supported.")
        for k, v in self.items():
            if k == "pixel_values":
                # Move/cast each per-image tensor separately.
                new_data[k] = [t.to(*args, **kwargs) for t in v]
                continue
            if torch.is_floating_point(v):
                # Cast and/or move floating-point tensors.
                new_data[k] = v.to(*args, **kwargs)
            elif device is not None:
                # Non-float tensors are only moved, never cast.
                new_data[k] = v.to(device=device)
            else:
                new_data[k] = v
        self.data = new_data
        return self
| |
|
| |
|
def as_tensor(value):
    """Convert *value* to a torch tensor.

    Lists/tuples whose elements are ndarrays (one or two levels deep) are
    first collapsed into a single ndarray; ndarrays are converted zero-copy
    via ``torch.from_numpy``; everything else goes through ``torch.tensor``.
    """
    if isinstance(value, (list, tuple)) and value:
        head = value[0]
        nested_ndarray = (
            isinstance(head, (list, tuple))
            and len(head) > 0
            and isinstance(head[0], np.ndarray)
        )
        if isinstance(head, np.ndarray) or nested_ndarray:
            value = np.array(value)
    if isinstance(value, np.ndarray):
        return torch.from_numpy(value)
    return torch.tensor(value)
| |
|
class ImageProcessor(BaseImageProcessor):
    """Multi-mode image preprocessor.

    ``process_image_mode`` selects the pipeline used by :meth:`preprocess`:

    - ``'resize'``: plain bicubic resize to ``self.size``.
    - ``'keepratio_resize'``: aspect-preserving letterbox resize to ``self.size``.
    - ``'dynamic_res'``: resize so both sides are multiples of ``patch_size``
      (bounded by ``self.size``), keeping one tensor per image.
    - ``'anyres'``: LLaVA-style base-image + grid-patch processing driven by
      ``image_grid_pinpoints`` (a padded-batch variant is used when the
      ``PROCESSOR_FOR_VLLM`` env flag is 1).
    """

    model_input_names = ["pixel_values"]

    def __init__(
        self,
        size: Optional[Union[int, Tuple[int, int], Dict[str, int]]] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        process_image_mode: Optional[str] = 'resize',
        patch_size: Optional[int] = 14,
        image_grid_pinpoints: List = None,
        **kwargs,
    ) -> None:
        """
        Args:
            size: target size; the code passes it to ``PIL.Image.resize`` and
                indexes ``size[0]``/``size[1]``, so a (width, height) sequence
                is expected -- confirm against checkpoint config.
            image_mean: per-channel normalization mean forwarded to ``_transform``.
            image_std: per-channel normalization std forwarded to ``_transform``.
            process_image_mode: one of 'resize', 'anyres', 'keepratio_resize',
                'dynamic_res'.
            patch_size: patch edge length used by 'dynamic_res' resizing.
            image_grid_pinpoints: candidate resolutions for 'anyres'; defaults
                to the 336/672/1008 grid below.
        """
        super().__init__(**kwargs)
        self.size = size
        self.image_mean = image_mean
        self.image_std = image_std
        self.process_image_mode = process_image_mode
        # Default anyres candidate resolutions.
        image_grid_pinpoints = (
            image_grid_pinpoints
            if image_grid_pinpoints is not None
            else [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]]
        )
        self.image_grid_pinpoints = image_grid_pinpoints
        self.patch_size = patch_size

    def preprocess(self,
                   images,
                   return_tensors: Optional[Union[str, TensorType]] = None,
                   data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
                   input_data_format: Optional[Union[str, ChannelDimension]] = None,
                   **kwargs,
                   ):
        """Dispatch to the mode-specific preprocessing routine (see class docstring)."""
        if self.process_image_mode == 'resize':
            return self.resize_preprocess(images, return_tensors, data_format, input_data_format, **kwargs)
        elif self.process_image_mode == 'anyres':
            # PROCESSOR_FOR_VLLM=1 selects the variant that zero-pads every
            # sample to the same number of patches.
            if processor_for_vllm == 1:
                return self.anyres_for_vllm_preprocess(images, return_tensors, data_format, input_data_format, **kwargs)
            return self.anyres_preprocess(images, return_tensors, data_format, input_data_format, **kwargs)
        elif self.process_image_mode == 'keepratio_resize':
            return self.keepratio_resize_preprocess(images, return_tensors, data_format, input_data_format, **kwargs)
        elif self.process_image_mode == 'dynamic_res':
            return self.dynamic_res_preprocess(images, return_tensors, data_format, input_data_format, **kwargs)
        else:
            raise ValueError(f"Invalid process_image_mode: {self.process_image_mode}")

    def resize_preprocess(self, images, return_tensors: Optional[Union[str, TensorType]] = None, data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, input_data_format: Optional[Union[str, ChannelDimension]] = None, **kwargs):
        """Bicubic-resize every image to ``self.size``, normalize, and return
        the result as a :class:`CustomBatchFeature`."""
        images = make_list_of_images(images)
        all_images = []
        for image in images:
            # PIL resize takes (width, height).
            resized_image = image.resize(self.size, Image.BICUBIC)
            # NOTE(review): _transform ignores its two leading h/w arguments;
            # they are passed for interface compatibility only.
            transform_img = _transform(self.size[1], self.size[0], self.image_mean, self.image_std)(resized_image)
            all_images.append(to_numpy_array(transform_img))

        images = [
            to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
            for image in all_images
        ]

        data = {"pixel_values": images}
        return CustomBatchFeature(data=data, tensor_type=return_tensors)

    def keepratio_resize_preprocess(self, images, return_tensors: Optional[Union[str, TensorType]] = None, data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, input_data_format: Optional[Union[str, ChannelDimension]] = None, **kwargs):
        """Letterbox-resize every image to ``self.size`` (aspect preserved,
        black padding), normalize, and return a :class:`CustomBatchFeature`."""
        images = make_list_of_images(images)
        all_images = []
        for image in images:
            resized_image = keepratio_resize(image, self.size)
            transform_img = _transform(self.size[1], self.size[0], self.image_mean, self.image_std)(resized_image)
            all_images.append(to_numpy_array(transform_img))

        images = [
            to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
            for image in all_images
        ]

        data = {"pixel_values": images}
        return CustomBatchFeature(data=data, tensor_type=return_tensors)

    def dynamic_res_preprocess(self, images, return_tensors: Optional[Union[str, TensorType]] = None, data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, input_data_format: Optional[Union[str, ChannelDimension]] = None, **kwargs):
        """Resize each image so its sides are multiples of ``self.patch_size``
        (bounded by ``self.size``). Images keep individual resolutions, so
        ``pixel_values`` is a list of per-image tensors."""
        images = make_list_of_images(images)
        all_images = []
        image_sizes = []
        for image in images:
            ori_w, ori_h = image.size
            # image_sizes records (height, width) of the ORIGINAL image.
            image_sizes.append([ori_h, ori_w])
            resized_image = resize_multiple_of(image, self.patch_size, max_size=self.size)
            resized_w, resized_h = resized_image.size
            transform_img = _transform(resized_h, resized_w, self.image_mean, self.image_std)(resized_image)
            all_images.append(to_numpy_array(transform_img))

        images = [
            as_tensor(to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format))
            for image in all_images
        ]

        data = {"pixel_values": images, "image_sizes": image_sizes}

        return CustomBatchFeature(data=data, tensor_type=return_tensors)

    def get_image_patches(
        self,
        data: Image.Image,
        image_grid_pinpoints,
    ):
        """Return ``[base image] + grid patches`` for anyres processing.

        The image is letterboxed to its best-matching pinpoint resolution and
        cut into ``self.size[0]``-sized tiles; the base image is the original
        resized to ``self.size``.
        """
        if not isinstance(image_grid_pinpoints, list):
            raise TypeError("grid_pinpoints must be a list of possible resolutions.")

        best_resolution = select_best_resolution(data.size, image_grid_pinpoints)

        # Letterbox to the chosen resolution; the scale factor is unused here.
        resized_data, scale = keepratio_resize(data, best_resolution, return_scale=True)
        # Tile size taken from self.size[0]; assumes a square patch grid --
        # TODO confirm self.size is square in anyres configs.
        resized_data = divide_to_patches(resized_data, self.size[0])
        ori_data = data.resize(self.size, Image.BICUBIC)
        data = [ori_data] + resized_data
        return data

    def pad(
        self,
        image: np.ndarray,
        padding: Union[int, Tuple[int, int], Iterable[Tuple[int, int]]],
        mode: PaddingMode = PaddingMode.CONSTANT,
        constant_values: Union[float, Iterable[float]] = 0.0,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> np.ndarray:
        """
        Pads the `image` with the specified `padding` and `mode`. Padding can be in the (`height`, `width`)
        dimension of in the (`num_patches`) dimension. In the second case an iterable if tuples is expected
        as input.

        Args:
            image (`np.ndarray`):
                The image to pad.
            padding (`int` or `Tuple[int, int]` or `Iterable[Tuple[int, int]]`):
                Padding to apply to the edges of the height, width axes. Can be one of three formats:
                - `((before_height, after_height), (before_width, after_width))` unique pad widths for each axis.
                - `((before, after),)` yields same before and after pad for height and width.
                - `(pad,)` or int is a shortcut for before = after = pad width for all axes.
            mode (`PaddingMode`):
                The padding mode to use. Can be one of:
                    - `"constant"`: pads with a constant value.
                    - `"reflect"`: pads with the reflection of the vector mirrored on the first and last values of the
                      vector along each axis.
                    - `"replicate"`: pads with the replication of the last value on the edge of the array along each axis.
                    - `"symmetric"`: pads with the reflection of the vector mirrored along the edge of the array.
            constant_values (`float` or `Iterable[float]`, *optional*):
                The value to use for the padding if `mode` is `"constant"`.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the output image. Can be one of:
                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                If unset, will use same as the input image.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the input image. Can be one of:
                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                If unset, will use the inferred format of the input image.

        Returns:
            `np.ndarray`: The padded image.

        """

        # Simple (int / non-4-axis) padding specs are delegated to the
        # module-level `pad` helper (transformers.image_transforms.pad).
        if isinstance(padding, int) or len(padding) != 4:
            return pad(image, padding, mode, constant_values, data_format, input_data_format)

        # 4-axis padding (e.g. the num_patches dimension) is handled directly
        # with np.pad, since the generic helper only covers height/width.
        if input_data_format is None:
            input_data_format = infer_channel_dimension_format(image)
        if mode == PaddingMode.CONSTANT:
            image = np.pad(image, padding, mode="constant", constant_values=constant_values)
        elif mode == PaddingMode.REFLECT:
            image = np.pad(image, padding, mode="reflect")
        elif mode == PaddingMode.REPLICATE:
            image = np.pad(image, padding, mode="edge")
        elif mode == PaddingMode.SYMMETRIC:
            image = np.pad(image, padding, mode="symmetric")
        else:
            raise ValueError(f"Invalid padding mode: {mode}")
        image = (
            to_channel_dimension_format(image, data_format, input_data_format) if data_format is not None else image
        )
        return image

    def _pad_for_batching(
        self,
        pixel_values: List[np.ndarray],
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ):
        """
        Pads images on the `num_of_patches` dimension with zeros to form a batch of same number of patches.

        Args:
            pixel_values (`List[np.ndarray]`):
                An array of pixel values of each images of shape (`batch_size`, `num_patches`, `image_in_3D`)
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the output image. Can be one of:
                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                If unset, will use same as the input image.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the input image. Can be one of:
                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                If unset, will use the inferred format of the input image.

        Returns:
            List[`np.ndarray`]: The padded images.
        """
        max_patch = max(len(x) for x in pixel_values)
        # Zero-pad only the leading (num_patches) axis of each sample.
        pixel_values = [
            self.pad(
                image,
                padding=((0, max_patch - image.shape[0]), (0, 0), (0, 0), (0, 0)),
                data_format=data_format,
                input_data_format=input_data_format,
            )
            for image in pixel_values
        ]

        return pixel_values

    def anyres_for_vllm_preprocess(self, images, return_tensors: Optional[Union[str, TensorType]] = None, data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, input_data_format: Optional[Union[str, ChannelDimension]] = None, do_pad: Optional[bool] = None, **kwargs):
        """Anyres preprocessing for vLLM: each sample's patches are stacked
        into one array and all samples are zero-padded to the same patch
        count, so ``pixel_values`` can form a regular batch."""
        images = make_list_of_images(images)
        new_images = []
        image_sizes = []

        for image in images:
            ori_w, ori_h = image.size
            # (height, width) of the ORIGINAL image.
            image_sizes.append([ori_h, ori_w])
            image_patches = self.get_image_patches(
                image,
                self.image_grid_pinpoints
            )
            all_images = []
            for image in image_patches:
                # NOTE(review): _transform ignores its two leading arguments.
                transform_img = _transform(self.size[0], self.size[1], self.image_mean, self.image_std)(image)
                img_array = to_numpy_array(transform_img)
                img_array = to_channel_dimension_format(img_array, data_format, input_channel_dim=input_data_format)
                all_images.append(img_array)
            # Stack this sample's patches: (num_patches, C, H, W).
            pixel_values = np.array(all_images)
            new_images.append(pixel_values)

        # Zero-pad along the patch dimension so every sample matches.
        new_images = self._pad_for_batching(new_images)

        data = {"pixel_values": new_images, "image_sizes": image_sizes}
        return BatchFeature(data=data, tensor_type=return_tensors)

    def anyres_preprocess(self, images, return_tensors: Optional[Union[str, TensorType]] = None, data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, input_data_format: Optional[Union[str, ChannelDimension]] = None, do_pad: Optional[bool] = None, **kwargs):
        """Anyres preprocessing: ``pixel_values`` is a flat list of per-patch
        arrays across all input images (no per-sample padding/stacking)."""
        images = make_list_of_images(images)
        new_images = []
        image_sizes = []

        for image in images:
            ori_w, ori_h = image.size
            # (height, width) of the ORIGINAL image.
            image_sizes.append([ori_h, ori_w])
            image_patches = self.get_image_patches(
                image,
                self.image_grid_pinpoints
            )
            for image in image_patches:
                # NOTE(review): _transform ignores its two leading arguments.
                transform_img = _transform(self.size[0], self.size[1], self.image_mean, self.image_std)(image)
                img_array = to_numpy_array(transform_img)
                img_array = to_channel_dimension_format(img_array, data_format, input_channel_dim=input_data_format)
                new_images.append(img_array)

        data = {"pixel_values": new_images, "image_sizes": image_sizes}
        return CustomBatchFeature(data=data, tensor_type=return_tensors)
| |
|
| |
|
| |
|
| | |