rhymes-ai
/

Aria

@@ -18,6 +18,7 @@
 # under the License.
 import inspect
 from typing import List, Optional, Union
 from transformers import AutoTokenizer, BatchFeature
@@ -61,7 +62,7 @@ class AriaProcessor(ProcessorMixin):
         super().__init__(chat_template=chat_template)
         if image_processor is None:
-            self.image_processor = AriaVisionProcessor(image_max_size=patch_size)
         else:
             self.image_processor = image_processor
@@ -87,6 +88,7 @@ class AriaProcessor(ProcessorMixin):
         truncation: Union[bool, str, TruncationStrategy] = None,
         max_length: Optional[int] = None,
         max_image_size: Optional[int] = 980,
         return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
     ) -> BatchFeature:
         """
@@ -114,6 +116,8 @@ class AriaProcessor(ProcessorMixin):
                 Maximum length of the returned list and optionally padding length (see above).
             max_image_size (`int`, *optional*):
                 Maximum size of the image to be processed.
             truncation (`bool`, *optional*):
                 Activates truncation to cut input sequences longer than `max_length` to `max_length`.
             return_tensors (`str` or [`~utils.TensorType`], *optional*):
@@ -134,24 +138,35 @@ class AriaProcessor(ProcessorMixin):
             - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
             - **pixel_mask** -- Pixel mask to be fed to a model. Returned when `images` is not `None`.
         """
         if images is not None:
             image_inputs = self.image_processor(
                 images,
                 return_tensors=return_tensors,
                 max_image_size=max_image_size,
             )
         else:
             image_inputs = {}
-        if isinstance(text, str):
-            text = [text]
-        elif not isinstance(text, list) and not isinstance(text[0], str):
-            raise ValueError(
-                "Invalid input text. Please provide a string, or a list of strings"
-            )
-        prompt_strings = text
         text_inputs = self.tokenizer(
             prompt_strings,
             return_tensors=return_tensors,

 # under the License.
 import inspect
+import re
 from typing import List, Optional, Union
 from transformers import AutoTokenizer, BatchFeature
         super().__init__(chat_template=chat_template)
         if image_processor is None:
+            self.image_processor = AriaVisionProcessor(max_image_size=patch_size)
         else:
             self.image_processor = image_processor
         truncation: Union[bool, str, TruncationStrategy] = None,
         max_length: Optional[int] = None,
         max_image_size: Optional[int] = 980,
+        split_image: Optional[bool] = False,
         return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
     ) -> BatchFeature:
         """
                 Maximum length of the returned list and optionally padding length (see above).
             max_image_size (`int`, *optional*):
                 Maximum size of the image to be processed.
+            split_image (`bool`, *optional*):
+                Whether to split the image into patches before processing.
             truncation (`bool`, *optional*):
                 Activates truncation to cut input sequences longer than `max_length` to `max_length`.
             return_tensors (`str` or [`~utils.TensorType`], *optional*):
             - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
             - **pixel_mask** -- Pixel mask to be fed to a model. Returned when `images` is not `None`.
         """
+        if isinstance(text, str):
+            text = [text]
+        elif not isinstance(text, list) and not isinstance(text[0], str):
+            raise ValueError(
+                "Invalid input text. Please provide a string, or a list of strings"
+            )
         if images is not None:
             image_inputs = self.image_processor(
                 images,
                 return_tensors=return_tensors,
                 max_image_size=max_image_size,
+                split_image=split_image,
             )
+            # expand the image_token according to the num_crops of image
+            prompt_strings = []
+            crop_iter = iter(image_inputs.pop("num_crops"))
+            for prompt in text:
+                prompt_strings.append(
+                    re.sub(
+                        re.escape(self.image_token),
+                        lambda _: next(crop_iter) * self.image_token,
+                        prompt,
+                    )
+                )
         else:
             image_inputs = {}
         text_inputs = self.tokenizer(
             prompt_strings,
             return_tensors=return_tensors,

vision_processor.py CHANGED Viewed

@@ -19,12 +19,93 @@
 from typing import List, Optional, Union
 import torch
 from PIL import Image, ImageOps
 from torchvision import transforms
 from transformers import BaseImageProcessor, BatchFeature, TensorType
 def keep_ratio_resize_and_pixel_mask(
     img: Image.Image, max_size, min_size=336, padding_value=0
 ):
@@ -127,6 +208,17 @@ class AriaVisionProcessor(BaseImageProcessor):
         max_image_size: Optional[int] = 980,
         min_image_size: Optional[int] = 336,
         return_tensors: Optional[Union[str, TensorType]] = "pt",
     ):
         """
         Process a list of images.
@@ -135,6 +227,8 @@ class AriaVisionProcessor(BaseImageProcessor):
             images (list): List of PIL.Image objects.
             max_image_size (int, optional): Override the default max image size. Defaults to None.
             return_tensors (str or TensorType, optional): The type of tensor to return. Defaults to "pt".
         Returns:
             BatchFeature: A BatchFeature object containing:
                 - 'pixel_values': Tensor of processed image pixel values.
@@ -142,6 +236,7 @@ class AriaVisionProcessor(BaseImageProcessor):
                     - True (1) values indicate pixels that belong to the original resized image.
                     - False (0) values indicate pixels that are part of the padding.
                   The mask helps distinguish between actual image content and padded areas in subsequent processing steps.
         """
         max_size = self.max_image_size if max_image_size is None else max_image_size
         min_size = self.min_image_size if min_image_size is None else min_image_size
@@ -154,19 +249,24 @@ class AriaVisionProcessor(BaseImageProcessor):
         pixel_values = []
         pixel_masks = []
         for image in images:
-            img_padded, pixel_mask = keep_ratio_resize_and_pixel_mask(
-                image, max_size, min_size
-            )
-            img_padded = self.transform(img_padded)
-            pixel_values.append(img_padded)
-            pixel_masks.append(pixel_mask)
         return BatchFeature(
             data={
                 "pixel_values": torch.stack(pixel_values),
                 "pixel_mask": torch.stack(pixel_masks),
             },
             tensor_type=return_tensors,
         )
@@ -177,10 +277,23 @@ class AriaVisionProcessor(BaseImageProcessor):
         max_image_size=None,
         min_image_size=None,
         return_tensors: Optional[Union[str, TensorType]] = None,
     ):
         return self.__call__(
             images,
             max_image_size=max_image_size,
             min_image_size=min_image_size,
             return_tensors=return_tensors,
         )

 from typing import List, Optional, Union
+import numpy as np
 import torch
 from PIL import Image, ImageOps
 from torchvision import transforms
 from transformers import BaseImageProcessor, BatchFeature, TensorType
+def _select_best_resolution(
+    img_width: int, img_height: int, target_ratios: List[List[int]], patch_size: int
+):
+    """
+    Selects the best resolution from a list of possible resolutions based on the original size.
+    Args:
+        img_width: the original widths of images.
+        img_height: the original heights of images.
+        target_ratios (2d numpy array): dimension size (M,2)
+        patch_size (int): image patch size
+    Returns:
+        tuple: The best fit resolution in the format (width, height).
+    """
+    aspect_ratio = img_width / img_height
+    best_ratio_diff = float("inf")
+    best_ratio_w, best_ratio_h = 1, 1
+    area = np.int32(img_height) * np.int32(img_height)
+    for ratio in target_ratios:
+        target_aspect_ratio = ratio[0] / ratio[1]
+        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
+        if ratio_diff < best_ratio_diff:
+            best_ratio_diff = ratio_diff
+            best_ratio_w, best_ratio_h = ratio[0], ratio[1]
+        elif (
+            ratio_diff == best_ratio_diff
+            and area > 0.5 * patch_size * patch_size * ratio[0] * ratio[1]
+        ):
+            best_ratio_w, best_ratio_h = ratio[0], ratio[1]
+    return best_ratio_w, best_ratio_h
def _split_image(
    image: Image.Image,
    split_image: bool,
    split_ratio: List[List[int]],
    patch_size: int,
) -> List[Image.Image]:
    """
    Optionally cut an image into a grid of square patches.

    Args:
        image (PIL.Image): Input image.
        split_image (bool): When false, the image is returned untouched.
        split_ratio (2d numpy array): candidate grids, dimension size (M,2)
        patch_size (int): side length of each square patch, in pixels.
    Returns:
        List[PIL.Image]: `[image]` when splitting is disabled. Otherwise the
        image is resized to the selected grid and cropped into patches in
        row-major order; when there is more than one patch, the original
        image is placed in front of them.
    """
    if not split_image:
        return [image]

    ratio_width, ratio_height = _select_best_resolution(
        image.width, image.height, split_ratio, patch_size
    )
    # Stretch the image so it is exactly ratio_width x ratio_height patches.
    resized_img = image.resize((patch_size * ratio_width, patch_size * ratio_height))

    def _crop_tile(index):
        # Row-major position of the tile inside the grid.
        row, col = divmod(index, ratio_width)
        left, upper = col * patch_size, row * patch_size
        return resized_img.crop((left, upper, left + patch_size, upper + patch_size))

    tiles = [_crop_tile(index) for index in range(ratio_width * ratio_height)]

    # A single tile stands in for the whole image; with several tiles the
    # untouched original is placed first.
    if len(tiles) != 1:
        tiles.insert(0, image)
    return tiles
 def keep_ratio_resize_and_pixel_mask(
     img: Image.Image, max_size, min_size=336, padding_value=0
 ):
         max_image_size: Optional[int] = 980,
         min_image_size: Optional[int] = 336,
         return_tensors: Optional[Union[str, TensorType]] = "pt",
+        split_image: Optional[bool] = False,
+        split_ratio: Optional[List[List[int]]] = [
+            [1, 1],
+            [1, 2],
+            [1, 3],
+            [1, 4],
+            [2, 2],
+            [2, 1],
+            [3, 1],
+            [4, 1],
+        ],
     ):
         """
         Process a list of images.
             images (list): List of PIL.Image objects.
             max_image_size (int, optional): Override the default max image size. Defaults to None.
             return_tensors (str or TensorType, optional): The type of tensor to return. Defaults to "pt".
+            split_image (bool, optional): Whether to split the image. Defaults to False.
+            split_ratio (list, optional): The ratio for splitting the image. Defaults to a list of common split ratios.
         Returns:
             BatchFeature: A BatchFeature object containing:
                 - 'pixel_values': Tensor of processed image pixel values.
                     - True (1) values indicate pixels that belong to the original resized image.
                     - False (0) values indicate pixels that are part of the padding.
                   The mask helps distinguish between actual image content and padded areas in subsequent processing steps.
+                - 'num_crops': Tensor of the number of crops for each image.
         """
         max_size = self.max_image_size if max_image_size is None else max_image_size
         min_size = self.min_image_size if min_image_size is None else min_image_size
         pixel_values = []
         pixel_masks = []
+        num_crops = []
         for image in images:
+            crop_images = _split_image(image, split_image, split_ratio, max_size)
+            num_crops.append(torch.tensor(len(crop_images)))
+            for crop_image in crop_images:
+                img_padded, pixel_mask = keep_ratio_resize_and_pixel_mask(
+                    crop_image, max_size, min_size
+                )
+                img_padded = self.transform(img_padded)
+                pixel_values.append(img_padded)
+                pixel_masks.append(pixel_mask)
         return BatchFeature(
             data={
                 "pixel_values": torch.stack(pixel_values),
                 "pixel_mask": torch.stack(pixel_masks),
+                "num_crops": torch.stack(num_crops),
             },
             tensor_type=return_tensors,
         )
         max_image_size=None,
         min_image_size=None,
         return_tensors: Optional[Union[str, TensorType]] = None,
+        split_image: Optional[bool] = False,
+        split_ratio: Optional[List[List[int]]] = [
+            [1, 1],
+            [1, 2],
+            [1, 3],
+            [1, 4],
+            [2, 2],
+            [2, 1],
+            [3, 1],
+            [4, 1],
+        ],
     ):
         return self.__call__(
             images,
             max_image_size=max_image_size,
             min_image_size=min_image_size,
             return_tensors=return_tensors,
+            split_image=split_image,
+            split_ratio=split_ratio,
         )