import math
from typing import List

import cv2
import numpy as np
import pypdfium2
import torch
from PIL import Image, ImageDraw, ImageOps

from surya.settings import settings


def convert_if_not_rgb(images: List[Image.Image]) -> List[Image.Image]:
    """Return a new list in which every image is in RGB mode.

    Images whose mode is already "RGB" are passed through as the same
    objects; every other image is replaced by the result of
    ``image.convert("RGB")``.
    """
    return [
        image if image.mode == "RGB" else image.convert("RGB")
        for image in images
    ]


def get_total_splits(image_size, processor):
    """Return how many vertical chunks ``split_image`` yields for this size.

    Args:
        image_size: Image size whose second element is the height.
        processor: Object whose ``size["height"]`` is the chunk height.

    Returns:
        1 when the height does not exceed
        ``settings.DETECTOR_IMAGE_CHUNK_HEIGHT``; otherwise the height
        divided by the processor height, rounded up.
    """
    img_height = list(image_size)[1]
    processor_height = processor.size["height"]
    if img_height <= settings.DETECTOR_IMAGE_CHUNK_HEIGHT:
        return 1
    return math.ceil(img_height / processor_height)


def split_image(img, processor):
    """Split a tall image into vertical chunks of the processor height.

    The input image is never modified or returned: the result contains
    either crops of it or a copy of it.

    Args:
        img: PIL image to split.
        processor: Object whose ``size["height"]`` is the chunk height.

    Returns:
        A ``(splits, split_heights)`` tuple. ``splits`` holds the chunk
        images and ``split_heights`` holds, for each chunk, the number of
        rows taken from the original image (i.e. before any padding).
    """
    img_width, img_height = img.size
    processor_height = processor.size["height"]

    # Images at or below the configured threshold are passed on whole.
    if img_height <= settings.DETECTOR_IMAGE_CHUNK_HEIGHT:
        return [img.copy()], [img_height]

    num_splits = math.ceil(img_height / processor_height)
    splits = []
    split_heights = []
    for i in range(num_splits):
        top = i * processor_height
        bottom = min(top + processor_height, img_height)
        height = bottom - top
        cropped = img.crop((0, top, img_width, bottom))
        if height < processor_height:
            # Only the last chunk can be short. centering=(0, 0) keeps the
            # content in the top-left corner so the padding goes below it.
            # NOTE(review): for RGB images Pillow may interpret the integer
            # fill 255 as (255, 0, 0) rather than white - confirm the
            # intended pad colour before changing it.
            cropped = ImageOps.pad(
                cropped,
                (img_width, processor_height),
                color=255,
                centering=(0, 0),
            )
        splits.append(cropped)
        split_heights.append(height)
    return splits, split_heights


def prepare_image_detection(img, processor):
    """Resize an image to the processor's input size and return a tensor.

    Note that ``img`` is resized in place by ``thumbnail``; pass a copy or
    crop (such as the images returned by ``split_image``) if the original
    must be preserved.

    Args:
        img: PIL image to prepare.
        processor: Callable with a ``size`` mapping holding "width" and
            "height"; calling it on a uint8 array must return a mapping
            whose "pixel_values" entry is a sequence of NumPy arrays.

    Returns:
        A torch tensor built from the first "pixel_values" entry.
    """
    new_size = (processor.size["width"], processor.size["height"])

    # This double resize is actually necessary for downstream accuracy.
    img.thumbnail(new_size, Image.Resampling.LANCZOS)
    # Stretch the smaller dimension to fit the new size.
    img = img.resize(new_size, Image.Resampling.LANCZOS)

    pixels = np.asarray(img, dtype=np.uint8)
    pixel_values = processor(pixels)["pixel_values"][0]
    return torch.from_numpy(pixel_values)


def open_pdf(pdf_filepath):
    """Open the PDF at ``pdf_filepath`` as a ``pypdfium2.PdfDocument``."""
    return pypdfium2.PdfDocument(pdf_filepath)


def get_page_images(doc, indices: List, dpi=settings.IMAGE_DPI):
    """Render the selected pages of a PDF document to RGB PIL images.

    Args:
        doc: Open pypdfium2 document.
        indices: Page indices to render.
        dpi: Render resolution; converted to a scale factor relative to
            72 units per inch.

    Returns:
        A list with one RGB image per rendered page.
    """
    renderer = doc.render(
        pypdfium2.PdfBitmap.to_pil,
        page_indices=indices,
        scale=dpi / 72,
    )
    return [image.convert("RGB") for image in renderer]


def slice_bboxes_from_image(image: Image.Image, bboxes):
    """Crop one sub-image per bounding box.

    Args:
        image: PIL image to crop from.
        bboxes: Iterable of boxes whose first four entries are the left,
            upper, right and lower crop bounds.

    Returns:
        A list of cropped images, one per box. Zero-width crops are
        reported with a printed warning but are still included.
    """
    lines = []
    for bbox in bboxes:
        line = image.crop((bbox[0], bbox[1], bbox[2], bbox[3]))
        if line.size[0] == 0:
            print(f"Warning: found an empty line with bbox {bbox}")
        lines.append(line)
    return lines


def slice_polys_from_image(image: Image.Image, polys):
    """Crop one padded sub-image per polygon.

    The image is converted to a uint8 array once, and each polygon is
    handed to ``slice_and_pad_poly``.
    """
    image_array = np.array(image, dtype=np.uint8)
    return [slice_and_pad_poly(image_array, poly) for poly in polys]


def slice_and_pad_poly(image_array: np.ndarray, coordinates):
    """Crop a polygon's bounding box and pad everything outside the polygon.

    Args:
        image_array: Image array indexed as ``[row, column]`` with an
            optional trailing channel axis.
        coordinates: Iterable of polygon corners; the first two entries of
            each corner are its x and y position.

    Returns:
        A PIL image of the polygon's bounding box in which every pixel
        outside the polygon is set to ``settings.RECOGNITION_PAD_VALUE``.
    """
    corners = [(corner[0], corner[1]) for corner in coordinates]
    xs = [x for x, _ in corners]
    ys = [y for _, y in corners]

    # Slice bounds must be non-negative integers: a negative bound would
    # make NumPy count from the far edge and return the wrong region, and
    # a float bound would raise. Bounds past the far edge are already
    # clipped by NumPy slicing.
    left = max(math.floor(min(xs)), 0)
    top = max(math.floor(min(ys)), 0)
    right = max(math.ceil(max(xs)), 0)
    bottom = max(math.ceil(max(ys)), 0)

    # Copy so that padding does not write into the caller's array.
    cropped_polygon = image_array[top:bottom, left:right].copy()

    # Express the polygon relative to the crop's top-left corner.
    shifted = [(x - left, y - top) for x, y in corners]

    # Draw the polygon onto a mask; pixels left at 0 are outside it.
    mask = np.zeros(cropped_polygon.shape[:2], dtype=np.uint8)
    cv2.fillPoly(mask, [np.int32(shifted)], 1)

    # A 2-D boolean mask selects whole pixels, so this works for any
    # number of channels.
    cropped_polygon[mask == 0] = settings.RECOGNITION_PAD_VALUE
    return Image.fromarray(cropped_polygon)