# pylint: disable=invalid-name,too-many-branches,too-many-statements,too-many-arguments
import os
import io
import typing
import hashlib
import urllib.request
import urllib.parse

import cv2
import imgaug
import numpy as np
import validators
import matplotlib.pyplot as plt
from shapely import geometry
from scipy import spatial


def read(filepath_or_buffer: typing.Union[str, io.BytesIO]):
    """Read a file into an image object

    Args:
        filepath_or_buffer: The path to the file, a URL, or any object
            with a `read` method (such as `io.BytesIO`). A numpy array
            is returned as-is without any conversion.

    Returns:
        The decoded image after a BGR -> RGB channel swap, or the input
        itself if it already was a numpy array.

    Raises:
        TypeError: If the input is not an array, a string or a readable object.
        AssertionError: If a non-URL string does not point to an existing file.
    """
    if isinstance(filepath_or_buffer, np.ndarray):
        return filepath_or_buffer
    if hasattr(filepath_or_buffer, 'read'):
        image = np.asarray(bytearray(filepath_or_buffer.read()), dtype=np.uint8)
        image = cv2.imdecode(image, cv2.IMREAD_UNCHANGED)
    elif isinstance(filepath_or_buffer, str):
        if validators.url(filepath_or_buffer):
            # Close the HTTP response once its bytes have been decoded.
            with urllib.request.urlopen(filepath_or_buffer) as response:
                return read(response)
        assert os.path.isfile(filepath_or_buffer), \
            'Could not find image at path: ' + filepath_or_buffer
        image = cv2.imread(filepath_or_buffer)
    else:
        # Previously this fell through to an unbound local variable.
        raise TypeError('Unsupported input type: ' + type(filepath_or_buffer).__name__)
    return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)


def get_rotated_width_height(box):
    """
    Returns the width and height of a rotated rectangle

    The width is the mean length of the two horizontal edges (0-1 and 2-3)
    and the height is the mean length of the two vertical edges (0-3 and 1-2).
    Both values are truncated to integers.

    Args:
        box: A list of four points starting in the top left
            corner and moving clockwise.

    Returns:
        A (width, height) tuple of integers.
    """
    w = (spatial.distance.cdist(box[0][np.newaxis], box[1][np.newaxis], "euclidean") +
         spatial.distance.cdist(box[2][np.newaxis], box[3][np.newaxis], "euclidean")) / 2
    h = (spatial.distance.cdist(box[0][np.newaxis], box[3][np.newaxis], "euclidean") +
         spatial.distance.cdist(box[1][np.newaxis], box[2][np.newaxis], "euclidean")) / 2
    return int(w[0][0]), int(h[0][0])


# pylint:disable=too-many-locals
def warpBox(image,
            box,
            target_height=None,
            target_width=None,
            margin=0,
            cval=None,
            return_transform=False,
            skip_rotate=False):
    """Warp a boxed region in an image given by a set of four points into
    a rectangle with a specified width and height. Useful for taking crops
    of distorted or rotated text.

    Args:
        image: The image from which to take the box
        box: A list of four points starting in the top left
            corner and moving clockwise.
        target_height: The height of the output rectangle
        target_width: The width of the output rectangle
        margin: The inset, in output pixels, applied to each corner of the
            destination rectangle.
        cval: The value used to fill the output canvas outside of the warped
            crop. Defaults to black.
        return_transform: Whether to return the transformation
            matrix with the image.
        skip_rotate: If set, `box` is used as provided instead of first being
            normalized with `get_rotated_box`.

    Returns:
        The warped crop placed in the top-left corner of a uint8 canvas of the
        target size, plus the perspective matrix if `return_transform` is set.
    """
    if cval is None:
        cval = (0, 0, 0) if len(image.shape) == 3 else 0
    if not skip_rotate:
        box, _ = get_rotated_box(box)
    w, h = get_rotated_width_height(box)
    assert (
        (target_width is None and target_height is None)
        or (target_width is not None and target_height is not None)), \
            'Either both or neither of target width and height must be provided.'
    if target_width is None and target_height is None:
        target_width = w
        target_height = h
    # Preserve the aspect ratio of the box: use the tighter of the two scales.
    scale = min(target_width / w, target_height / h)
    M = cv2.getPerspectiveTransform(src=box,
                                    dst=np.array([[margin, margin], [scale * w - margin, margin],
                                                  [scale * w - margin, scale * h - margin],
                                                  [margin, scale * h - margin]]).astype('float32'))
    crop = cv2.warpPerspective(image, M, dsize=(int(scale * w), int(scale * h)))
    target_shape = (target_height, target_width, 3) if len(image.shape) == 3 else (target_height,
                                                                                   target_width)
    full = (np.zeros(target_shape) + cval).astype('uint8')
    full[:crop.shape[0], :crop.shape[1]] = crop
    if return_transform:
        return full, M
    return full


def flatten(list_of_lists):
    """Flatten a list of lists by one level.

    Args:
        list_of_lists: An iterable of iterables.

    Returns:
        A single list holding the items of every sublist, in order.
    """
    return [item for sublist in list_of_lists for item in sublist]


def combine_line(line):
    """Combine a set of boxes in a line into a single bounding box.

    Args:
        line: A list of (box, character) entries

    Returns:
        A (box, text) tuple
    """
    text = ''.join([character if character is not None else '' for _, character in line])
    # Walk along the top edges left-to-right, then back along the bottom edges,
    # to collect the outline points of the whole line.
    box = np.concatenate([coords[:2] for coords, _ in line] +
                         [np.array([coords[3], coords[2]])
                          for coords, _ in reversed(line)]).astype('float32')
    first_point = box[0]
    rectangle = cv2.minAreaRect(box)
    box = cv2.boxPoints(rectangle)

    # Put the points in clockwise order, starting at the vertex closest to
    # the first point of the first box.
    box = np.array(np.roll(box, -np.linalg.norm(box - first_point, axis=1).argmin(), 0))
    return box, text


def drawAnnotations(image, predictions, ax=None):
    """Draw text annotations onto image.

    Words whose box starts in the left half of the image are labelled in the
    left margin, all others in the right margin, each with an arrow pointing
    at the first vertex of its box.

    Args:
        image: The image on which to draw
        predictions: The predictions as provided by `pipeline.recognize`.
        ax: A matplotlib axis on which to draw. A new one is created if omitted.

    Returns:
        The matplotlib axis that was drawn on.
    """
    if ax is None:
        _, ax = plt.subplots()
    ax.imshow(drawBoxes(image=image, boxes=predictions, boxes_format='predictions'))
    predictions = sorted(predictions, key=lambda p: p[1][:, 1].min())
    left = []
    right = []
    for word, box in predictions:
        if box[:, 0].min() < image.shape[1] / 2:
            left.append((word, box))
        else:
            right.append((word, box))
    ax.set_yticks([])
    ax.set_xticks([])
    for side, group in zip(['left', 'right'], [left, right]):
        for index, (text, box) in enumerate(group):
            y = 1 - (index / len(group))
            # Convert pixel coordinates into axes fractions (y axis flipped).
            xy = box[0] / np.array([image.shape[1], image.shape[0]])
            xy[1] = 1 - xy[1]
            # The label is passed positionally because matplotlib renamed the
            # keyword from `s` to `text`; positional use works with both.
            ax.annotate(text,
                        xy=xy,
                        xytext=(-0.05 if side == 'left' else 1.05, y),
                        xycoords='axes fraction',
                        arrowprops={
                            'arrowstyle': '->',
                            'color': 'r'
                        },
                        color='r',
                        fontsize=14,
                        horizontalalignment='right' if side == 'left' else 'left')
    return ax


def drawBoxes(image, boxes, color=(255, 0, 0), thickness=5, boxes_format='boxes'):
    """Draw boxes onto an image.

    Args:
        image: The image on which to draw the boxes.
        boxes: The boxes to draw.
        color: The color for each box.
        thickness: The thickness for each box.
        boxes_format: The format used for providing the boxes. Options are
            "boxes" which indicates an array with shape(N, 4, 2) where N is the
            number of boxes and each box is a list of four points) as provided
            by `keras_ocr.detection.Detector.detect`, "lines" (a list of
            lines where each line itself is a list of (box, character) tuples) as
            provided by `keras_ocr.data_generation.get_image_generator`,
            or "predictions" where boxes is by itself a list of (word, box) tuples
            as provided by `keras_ocr.pipeline.Pipeline.recognize` or
            `keras_ocr.recognition.Recognizer.recognize_from_boxes`.

    Returns:
        A copy of the image with the boxes drawn, or the input image itself
        when there are no boxes.
    """
    if len(boxes) == 0:
        return image
    canvas = image.copy()
    if boxes_format == 'lines':
        boxes = [box for line in boxes for box, _ in line]
    if boxes_format == 'predictions':
        boxes = [box for _, box in boxes]
    for box in boxes:
        cv2.polylines(img=canvas,
                      pts=box[np.newaxis].astype('int32'),
                      color=color,
                      thickness=thickness,
                      isClosed=True)
    return canvas


def adjust_boxes(boxes, boxes_format='boxes', scale=1):
    """Adjust boxes using a given scale.

    Args:
        boxes: The boxes to adjust
        boxes_format: The format for the boxes. See the `drawBoxes` function
            for an explanation on the options.
        scale: The scale to apply

    Returns:
        The scaled boxes in the same format. When `scale` is 1 the input is
        returned untouched.

    Raises:
        NotImplementedError: If `boxes_format` is not a supported format.
    """
    if scale == 1:
        return boxes
    if boxes_format == 'boxes':
        return np.array(boxes) * scale
    if boxes_format == 'lines':
        return [[(np.array(box) * scale, character) for box, character in line] for line in boxes]
    if boxes_format == 'predictions':
        return [(word, np.array(box) * scale) for word, box in boxes]
    raise NotImplementedError(f'Unsupported boxes format: {boxes_format}')


def augment(boxes,
            augmenter: imgaug.augmenters.meta.Augmenter,
            image=None,
            boxes_format='boxes',
            image_shape=None,
            area_threshold=0.5,
            min_area=None):
    """Augment an image and associated boxes together.

    Args:
        boxes: The boxes that will be augmented together with the image
        augmenter: The imgaug augmenter to apply. A deterministic copy is used
            so that the image and every box receive the same transformation.
        image: The image which we wish to apply the augmentation.
        boxes_format: The format for the boxes. See the `drawBoxes` function
            for an explanation on the options.
        image_shape: The shape of the input image if no image will be provided.
        area_threshold: Fraction of bounding box that we require to be
            in augmented image to include it.
        min_area: The minimum area for a character to be included.

    Returns:
        An (image_augmented, boxes_augmented) tuple. `image_augmented` is None
        when no image was provided. Boxes are clipped to the augmented image
        and dropped when too little of them remains.

    Raises:
        ValueError: If neither `image` nor `image_shape` is provided.
        NotImplementedError: If `boxes_format` is not a supported format.
    """
    if image is None and image_shape is None:
        raise ValueError('One of "image" or "image_shape" must be provided.')
    augmenter = augmenter.to_deterministic()

    if image is not None:
        image_augmented = augmenter(image=image)
        image_shape = image.shape[:2]
        image_augmented_shape = image_augmented.shape[:2]
    else:
        image_augmented = None
        # Without an image, infer the augmented extent by transforming the
        # (width, height) corner point of the original shape.
        width_augmented, height_augmented = augmenter.augment_keypoints(
            imgaug.KeypointsOnImage.from_xy_array(xy=[[image_shape[1], image_shape[0]]],
                                                  shape=image_shape)).to_xy_array()[0]
        image_augmented_shape = (height_augmented, width_augmented)

    def box_inside_image(box):
        """Clip a box to the augmented image and report whether to keep it."""
        area_before = cv2.contourArea(np.int32(box)[:, np.newaxis, :])
        if area_before == 0:
            return False, box
        clipped = box.copy()
        clipped[:, 0] = clipped[:, 0].clip(0, image_augmented_shape[1])
        clipped[:, 1] = clipped[:, 1].clip(0, image_augmented_shape[0])
        area_after = cv2.contourArea(np.int32(clipped)[:, np.newaxis, :])
        keep = ((area_after / area_before) >= area_threshold) and (min_area is None
                                                                   or area_after > min_area)
        return keep, clipped

    def augment_box(box):
        """Apply the augmenter to the vertices of a single box."""
        return augmenter.augment_keypoints(
            imgaug.KeypointsOnImage.from_xy_array(box, shape=image_shape)).to_xy_array()

    if boxes_format == 'boxes':
        boxes_augmented = [
            box for inside, box in [box_inside_image(box) for box in map(augment_box, boxes)]
            if inside
        ]
    elif boxes_format == 'lines':
        boxes_augmented = [[(augment_box(box), character) for box, character in line]
                           for line in boxes]
        boxes_augmented = [[(box, character)
                            for (inside, box), character in [(box_inside_image(box), character)
                                                             for box, character in line]
                            if inside]
                           for line in boxes_augmented]
        # Sometimes all the characters in a line are removed.
        boxes_augmented = [line for line in boxes_augmented if line]
    elif boxes_format == 'predictions':
        boxes_augmented = [(word, augment_box(box)) for word, box in boxes]
        boxes_augmented = [(word, box)
                           for word, (inside, box) in [(word, box_inside_image(box))
                                                       for word, box in boxes_augmented]
                           if inside]
    else:
        raise NotImplementedError(f'Unsupported boxes format: {boxes_format}')
    return image_augmented, boxes_augmented


def pad(image, width: int, height: int, cval: int = 255):
    """Pad an image to a desired size. Raises an exception if image
    is larger than desired size.

    Args:
        image: The input image
        width: The output width
        height: The output height
        cval: The value to use for filling the image.

    Returns:
        A new array of the requested size with the input image in its
        top-left corner and `cval` everywhere else.
    """
    if len(image.shape) == 3:
        output_shape = (height, width, image.shape[-1])
    else:
        output_shape = (height, width)
    # Compare against the *input* size; comparing against output_shape (which
    # is built from height/width) could never fail.
    assert height >= image.shape[0], 'Input height must be less than output height.'
    assert width >= image.shape[1], 'Input width must be less than output width.'
    padded = np.zeros(output_shape, dtype=image.dtype) + cval
    padded[:image.shape[0], :image.shape[1]] = image
    return padded


def resize_image(image, max_scale, max_size):
    """Obtain the optimal resized image subject to a maximum scale
    and maximum size.

    Args:
        image: The input image
        max_scale: The maximum scale to apply
        max_size: The maximum size to return

    Returns:
        A (resized_image, scale) tuple.
    """
    # Only the spatial dimensions count; the channel axis must not be
    # mistaken for the longest side of a very small image.
    longest_side = max(image.shape[:2])
    if longest_side * max_scale > max_size:
        # We are constrained by the maximum size
        scale = max_size / longest_side
    else:
        # We are constrained by scale
        scale = max_scale
    return cv2.resize(image,
                      dsize=(int(image.shape[1] * scale), int(image.shape[0] * scale))), scale


# pylint: disable=too-many-arguments
def fit(image, width: int, height: int, cval: int = 255, mode='letterbox', return_scale=False):
    """Obtain a new image, fit to the specified size.

    Args:
        image: The input image
        width: The new width
        height: The new height
        cval: The constant value to use to fill the remaining areas of
            the image
        mode: Either "letterbox" (scale the image to fit inside the target and
            fill the remainder with `cval`) or "crop" (scale the image to cover
            the target and cut off the excess).
        return_scale: Whether to return the scale used for the image

    Returns:
        The new image, or an (image, scale) tuple if `return_scale` is set.

    Raises:
        NotImplementedError: If a resize is needed and `mode` is unsupported.
    """
    fitted = None
    x_scale = width / image.shape[1]
    y_scale = height / image.shape[0]
    if x_scale == 1 and y_scale == 1:
        # Already the right size: return the input unchanged.
        fitted = image
        scale = 1
    elif (x_scale <= y_scale and mode == 'letterbox') or (x_scale >= y_scale and mode == 'crop'):
        scale = width / image.shape[1]
        resize_width = width
        resize_height = (width / image.shape[1]) * image.shape[0]
    else:
        scale = height / image.shape[0]
        resize_height = height
        resize_width = scale * image.shape[1]
    if fitted is None:
        resize_width, resize_height = map(int, [resize_width, resize_height])
        if mode == 'letterbox':
            image = cv2.resize(image, dsize=(resize_width, resize_height))
            # Match the channel layout of the resized image so that grayscale
            # and multi-channel inputs work, not only 3-channel ones.
            fitted = np.zeros((height, width) + image.shape[2:], dtype='uint8') + cval
            fitted[:image.shape[0], :image.shape[1]] = image[:height, :width]
        elif mode == 'crop':
            image = cv2.resize(image, dsize=(resize_width, resize_height))
            fitted = image[:height, :width]
        else:
            raise NotImplementedError(f'Unsupported mode: {mode}')
    if not return_scale:
        return fitted
    return fitted, scale


def read_and_fit(filepath_or_array: typing.Union[str, np.ndarray],
                 width: int,
                 height: int,
                 cval: int = 255,
                 mode='letterbox'):
    """Read an image from disk and fit to the specified size.

    Args:
        filepath_or_array: The path to the image or numpy array of shape HxWx3
        width: The new width
        height: The new height
        cval: The constant value to use to fill the remaining areas of
            the image
        mode: The mode to pass to "fit" (crop or letterbox)

    Returns:
        The new image
    """
    image = read(filepath_or_array) if isinstance(filepath_or_array, str) else filepath_or_array
    image = fit(image=image, width=width, height=height, cval=cval, mode=mode)
    return image


def sha256sum(filename):
    """Compute the sha256 hash for a file.

    The file is read in 128 KiB chunks into a reusable buffer so that large
    files do not need to fit in memory.

    Args:
        filename: The path of the file to hash.

    Returns:
        The hexadecimal digest string.
    """
    h = hashlib.sha256()
    b = bytearray(128 * 1024)
    mv = memoryview(b)
    with open(filename, 'rb', buffering=0) as f:
        # readinto returns 0 at end of file, which stops the iterator.
        for n in iter(lambda: f.readinto(mv), 0):
            h.update(mv[:n])
    return h.hexdigest()


def get_default_cache_dir():
    """Return the cache directory: the `KERAS_OCR_CACHE_DIR` environment
    variable if it is set, otherwise `~/.keras-ocr`."""
    return os.environ.get('KERAS_OCR_CACHE_DIR', os.path.expanduser(os.path.join('~',
                                                                                 '.keras-ocr')))


def download_and_verify(url, sha256=None, cache_dir=None, verbose=True, filename=None):
    """Download a file to a cache directory and verify it with a sha256
    hash.

    Args:
        url: The file to download
        sha256: The sha256 hash to check. If the file already exists and the hash
            matches, we don't download it again.
        cache_dir: The directory in which to cache the file. The default is
            `~/.keras-ocr`.
        verbose: Whether to log progress
        filename: The filename to use for the file. By default, the filename is
            derived from the URL.

    Returns:
        The path of the cached file.

    Raises:
        AssertionError: If a hash was given and the file on disk does not
            match it after the download step.
    """
    if cache_dir is None:
        cache_dir = get_default_cache_dir()
    if filename is None:
        filename = os.path.basename(urllib.parse.urlparse(url).path)
    filepath = os.path.join(cache_dir, filename)
    os.makedirs(os.path.split(filepath)[0], exist_ok=True)
    if verbose:
        print('Looking for ' + filepath)
    # Download when the file is missing or its hash does not match.
    if not os.path.isfile(filepath) or (sha256 and sha256sum(filepath) != sha256):
        if verbose:
            print('Downloading ' + filepath)
        urllib.request.urlretrieve(url, filepath)
    assert sha256 is None or sha256 == sha256sum(filepath), 'Error occurred verifying sha256.'
    return filepath


# pylint: disable=bad-continuation
def get_rotated_box(points) -> typing.Tuple[np.ndarray, float]:
    """Obtain the parameters of a rotated box.

    Args:
        points: The points to enclose, as an array-like of (x, y) pairs.

    Returns:
        The vertices of the rotated box in top-left, top-right,
        bottom-right, bottom-left order (as a float32 array of shape (4, 2))
        along with the angle of rotation about the bottom left corner.
    """
    try:
        mp = geometry.MultiPoint(points=points)
        pts = np.array(list(zip(*mp.minimum_rotated_rectangle.exterior.xy)))[:-1]  # noqa: E501
    except AttributeError:
        # There weren't enough points for the minimum rotated rectangle function
        # (asarray so that list input can still be fancy-indexed below).
        pts = np.asarray(points)
    # The code below is taken from
    # https://github.com/jrosebr1/imutils/blob/master/imutils/perspective.py

    # sort the points based on their x-coordinates
    xSorted = pts[np.argsort(pts[:, 0]), :]

    # grab the left-most and right-most points from the sorted
    # x-coordinate points
    leftMost = xSorted[:2, :]
    rightMost = xSorted[2:, :]

    # now, sort the left-most coordinates according to their
    # y-coordinates so we can grab the top-left and bottom-left
    # points, respectively
    leftMost = leftMost[np.argsort(leftMost[:, 1]), :]
    (tl, bl) = leftMost

    # now that we have the top-left coordinate, use it as an
    # anchor to calculate the Euclidean distance between the
    # top-left and right-most points; by the Pythagorean
    # theorem, the point with the largest distance will be
    # our bottom-right point
    D = spatial.distance.cdist(tl[np.newaxis], rightMost, "euclidean")[0]
    (br, tr) = rightMost[np.argsort(D)[::-1], :]

    # return the coordinates in top-left, top-right,
    # bottom-right, and bottom-left order
    pts = np.array([tl, tr, br, bl], dtype="float32")

    # NOTE(review): if tl and bl share a y-coordinate this divides by zero;
    # behavior for such degenerate boxes is left as-is.
    rotation = np.arctan((tl[0] - bl[0]) / (tl[1] - bl[1]))
    return pts, rotation


def fix_line(line):
    """Given a list of (box, character) tuples, return a revised
    line with a consistent ordering of left-to-right or top-to-bottom,
    with each box provided with (top-left, top-right, bottom-right,
    bottom-left) ordering.

    Args:
        line: A list of (box, character) tuples.

    Returns:
        A tuple that is the fixed line as well as a string indicating
        whether the line is horizontal or vertical.
    """
    line = [(get_rotated_box(box)[0], character) for box, character in line]
    centers = np.array([box.mean(axis=0) for box, _ in line])
    sortedx = centers[:, 0].argsort()
    sortedy = centers[:, 1].argsort()
    # The line runs along whichever axis the box centers span the most.
    if np.diff(centers[sortedy][:, 1]).sum() > np.diff(centers[sortedx][:, 0]).sum():
        return [line[idx] for idx in sortedy], 'vertical'
    return [line[idx] for idx in sortedx], 'horizontal'