| | import torch |
| | import os |
| | import asyncio |
| | import requests |
| | from io import BytesIO |
| | from PIL import Image |
| | from urllib.parse import urlparse |
| | import numpy as np |
| |
|
| |
|
def split_image_ur(
    img,
    max_slice_num,
    image_size,
    vit_image_size,
    force_min_size=False,
    *,
    patch_size=14,
):
    """Slice an image into an overview plus grid tiles and report their patch-grid shapes.

    Args:
        img: Image to slice. It is forwarded to
            ``do_slice_by_minicpmv_strategy_ur``, which reads ``.size`` and
            calls ``.resize`` on it.
        max_slice_num: Upper bound on the number of grid tiles.
        image_size: Forwarded to the slicer as ``scale_resolution``.
        vit_image_size: Forwarded to the slicer unchanged.
        force_min_size: When true, the image is first passed through
            ``resize_by_patch_size_ur`` with ``min_size=image_size`` and
            ``max_size=image_size * max_slice_num``.
        patch_size: Patch edge in pixels, used both for slicing and for the
            reported shapes. Defaults to 14, the value that used to be
            hard-coded here.

    Returns:
        A pair ``(sliced_images, sliced_shapes)``. ``sliced_images`` holds the
        overview image first, then every tile of the nested list returned by
        the slicer, flattened row by row. ``sliced_shapes`` holds, for each of
        those images, ``np.array((width // patch_size, height // patch_size))``.
    """
    if force_min_size:
        img = resize_by_patch_size_ur(
            img,
            min_size=image_size,
            max_size=image_size * max_slice_num,
            patch_size=patch_size,
        )

    source_image, sub_images, _ = do_slice_by_minicpmv_strategy_ur(
        img,
        max_slice_nums=max_slice_num,
        scale_resolution=image_size,
        patch_size=patch_size,
        vit_image_size=vit_image_size,
    )

    # Overview first, then the tiles row by row. Iterating the rows directly
    # also copes with an empty tile list (the unsplit case).
    sliced_images = [source_image]
    for row in sub_images:
        sliced_images.extend(row)

    sliced_shapes = [
        np.array((tile.size[0] // patch_size, tile.size[1] // patch_size))
        for tile in sliced_images
    ]
    return sliced_images, sliced_shapes
| |
|
| |
|
| | import math |
| | from PIL import Image |
| | import torch |
| | import torchvision.transforms.functional as F |
| | from torchvision.transforms import InterpolationMode |
| |
|
| | |
def do_slice_by_minicpmv_strategy_ur(
    image,
    max_slice_nums=9,
    scale_resolution=1120,
    patch_size=14,
    vit_image_size=448,
    never_split=False,
):
    """Build an overview image and, when the image is large enough, a grid of tiles.

    The number of tiles is derived from the image area relative to
    ``scale_resolution ** 2`` (capped at ``max_slice_nums``). Among all
    factorisations of the neighbouring tile counts, the grid whose
    columns/rows ratio is closest in log space to the image's width/height
    ratio is chosen.

    Args:
        image: PIL-style image (``.size`` and ``.resize`` are used).
        max_slice_nums: Maximum number of tiles.
        scale_resolution: Side length of the resized overview and of the
            area each tile covers in the refined image.
        patch_size: Forwarded to ``get_refine_size``.
        vit_image_size: Side length the unsplit overview is padded to and
            the size of each tile produced by ``split_to_patches``.
        never_split: Force the unsplit path regardless of image area.

    Returns:
        ``(source_image, patches, best_grid)``. On the unsplit path
        ``patches`` is ``[]`` and ``best_grid`` is ``None``; the overview is
        padded with black to ``vit_image_size``. On the split path the
        overview is the plain ``scale_resolution`` square (not padded),
        ``patches`` is the nested list from ``split_to_patches`` and
        ``best_grid`` is ``[columns, rows]``.
    """
    # ImageOps is not imported at module level, so bring it in locally.
    from PIL import ImageOps

    original_size = image.size
    original_width, original_height = original_size
    log_ratio = math.log(original_width / original_height)
    area_ratio = original_width * original_height / (scale_resolution * scale_resolution)
    multiple = min(math.ceil(area_ratio), max_slice_nums)

    # resize() already returns a new image, so no explicit copy is needed.
    source_image = image.resize((scale_resolution, scale_resolution), Image.BICUBIC)

    if multiple <= 1 or never_split:
        # Pad to vit_image_size. When the difference is odd, the extra pixel
        # goes to the right/bottom so the result is exactly vit_image_size
        # wide and high instead of one pixel short.
        pad_total = vit_image_size - scale_resolution
        pad_lead = int(pad_total / 2)
        pad_trail = pad_total - pad_lead
        source_image = ImageOps.expand(
            source_image,
            border=(pad_lead, pad_lead, pad_trail, pad_trail),
            fill=(0, 0, 0),
        )
        return source_image, [], None

    # Tile counts to consider: one below, exactly, and one above the target,
    # excluding the trivial single tile and anything above the cap.
    slice_counts = [
        count
        for count in (multiple - 1, multiple, multiple + 1)
        if count != 1 and count <= max_slice_nums
    ]

    # Every [columns, rows] factorisation of each candidate count.
    candidate_grids = [
        [columns, count // columns]
        for count in slice_counts
        for columns in range(1, count + 1)
        if count % columns == 0
    ]

    # min() keeps the first of equally good grids, matching a strict "<" scan.
    best_grid = min(
        candidate_grids,
        key=lambda grid: abs(log_ratio - math.log(grid[0] / grid[1])),
        default=[1, 1],
    )

    refine_size = get_refine_size(
        original_size, best_grid, scale_resolution, patch_size, allow_upscale=True
    )
    refine_image = image.resize(refine_size, Image.BICUBIC)
    patches = split_to_patches(refine_image, best_grid, scale_resolution, vit_image_size)

    return source_image, patches, best_grid
| |
|
| |
|
def ensure_divide(length, patch_size):
    """Snap ``length`` to the nearest multiple of ``patch_size``, with a floor of one patch.

    Uses Python's built-in ``round``, so exact halves go to the even multiple.
    """
    snapped = round(length / patch_size) * patch_size
    return snapped if snapped >= patch_size else patch_size
| |
|
| |
|
def find_best_resize(original_size, scale_resolution, patch_size, allow_upscale=False):
    """Pick a patch-aligned size whose area fits within ``scale_resolution ** 2``.

    Args:
        original_size: ``(width, height)`` of the image.
        scale_resolution: Side of the square whose area bounds the result.
        patch_size: Both returned dimensions are multiples of this value.
        allow_upscale: When true, the image is rescaled to the target area
            even if it is already smaller.

    Returns:
        ``(best_width, best_height)``. Each dimension is at least
        ``patch_size``. The area bound is honoured unless even a single
        ``patch_size`` square exceeds it.
    """
    width, height = original_size
    max_area = scale_resolution * scale_resolution

    if width * height > max_area or allow_upscale:
        # Rescale to roughly max_area while keeping the aspect ratio.
        aspect = width / height
        height = int(scale_resolution / math.sqrt(aspect))
        width = int(height * aspect)

    best_width = ensure_divide(width, patch_size)
    best_height = ensure_divide(height, patch_size)

    # Rounding up to a patch multiple can push the area over the limit.
    # Shrink the width first, but never below one patch: for very tall images
    # shrinking the width alone would collapse it to zero, so the height is
    # reduced once the width has reached its minimum.
    while best_width > patch_size and best_width * best_height > max_area:
        best_width -= patch_size
    while best_height > patch_size and best_width * best_height > max_area:
        best_height -= patch_size

    return (best_width, best_height)
| |
|
| |
|
def get_refine_size(original_size, grid, scale_resolution, patch_size, allow_upscale=False):
    """Return the size of the image that will be cut into ``grid`` tiles.

    The result is ``(scale_resolution * columns, scale_resolution * rows)``
    where ``grid`` is ``(columns, rows)``. ``patch_size`` and
    ``allow_upscale`` are accepted but do not influence the result, and
    ``original_size`` is only unpacked.
    """
    # Unpacked purely so that a malformed size is still rejected here.
    _width, _height = original_size
    columns, rows = grid
    return (scale_resolution * columns, scale_resolution * rows)
| |
|
| |
|
def split_to_patches(image, grid, scale_resolution, vit_image_size):
    """Cut a padded image into square tiles of side ``vit_image_size``.

    The image is padded with black by ``vit_image_size - scale_resolution``
    pixels in total per axis, then windows of ``vit_image_size`` pixels are
    cropped at a stride of ``scale_resolution``.

    Args:
        image: PIL-style image to split (``.size`` must be available after
            padding, and ``.crop`` is called on the padded image).
        grid: Not used by the computation; the tile layout follows from the
            image size and ``scale_resolution``. Kept so existing callers
            keep working.
        scale_resolution: Stride between consecutive tiles.
        vit_image_size: Side length of each cropped tile.

    Returns:
        A list of rows, each a list of cropped tiles ordered left to right;
        rows are ordered top to bottom.
    """
    # ImageOps is not imported at module level, so bring it in locally.
    from PIL import ImageOps

    # Split the padding so that both sides add up to exactly
    # vit_image_size - scale_resolution. With an odd difference a symmetric
    # int(diff / 2) border leaves the canvas one pixel short, which makes the
    # last row and column of tiles disappear from the ranges below.
    pad_total = vit_image_size - scale_resolution
    pad_lead = int(pad_total / 2)
    pad_trail = pad_total - pad_lead
    padded_img = ImageOps.expand(
        image,
        border=(pad_lead, pad_lead, pad_trail, pad_trail),
        fill=(0, 0, 0),
    )
    padded_width, padded_height = padded_img.size

    row_starts = range(0, padded_height - vit_image_size + 1, scale_resolution)
    col_starts = range(0, padded_width - vit_image_size + 1, scale_resolution)

    return [
        [
            padded_img.crop((left, top, left + vit_image_size, top + vit_image_size))
            for left in col_starts
        ]
        for top in row_starts
    ]
| |
|
def resize_by_patch_size_ur(img, min_size=1152, max_size=2240, patch_size=14):
    """Resize ``img`` so its short side reaches ``min_size`` and its long side stays within ``max_size``.

    If the short side is below ``min_size`` the image is scaled up until it
    reaches it; should that push the long side past ``max_size``, the result
    is scaled back down so the long side equals ``max_size`` (the short side
    may then end up below ``min_size``). Otherwise the image is only scaled
    down, and only when its long side exceeds ``max_size``.

    Args:
        img: Image to resize. For a ``torch.Tensor`` the first two dimensions
            are read as ``(height, width)``; for anything else ``.size`` is
            read as ``(width, height)``.
        min_size: Target lower bound for the short side.
        max_size: Upper bound for the long side.
        patch_size: Accepted but not used by the computation.

    Returns:
        The result of ``img.resize((new_width, new_height), Image.BICUBIC)``.
    """
    # NOTE(review): the final call uses the PIL-style resize(size, resample)
    # signature; confirm that tensor inputs are actually supported here.
    if isinstance(img, torch.Tensor):
        height, width = img.shape[:2]
    else:
        width, height = img.size

    short_side = min(height, width)
    if short_side < min_size:
        # Upscale so the short side reaches min_size.
        scale_factor = min_size / short_side
        new_height = max(min_size, round(height * scale_factor))
        new_width = max(min_size, round(width * scale_factor))

        # The upscale may overshoot max_size on the long side; pull it back.
        long_side = max(new_height, new_width)
        if long_side > max_size:
            scale_factor = max_size / long_side
            new_height = min(max_size, round(new_height * scale_factor))
            new_width = min(max_size, round(new_width * scale_factor))
    else:
        # Large enough already: shrink only if the long side is too big.
        scale_factor = min(max_size / max(height, width), 1)
        new_height = round(height * scale_factor)
        new_width = round(width * scale_factor)

    return img.resize((new_width, new_height), Image.BICUBIC)