# Python source code for image loading and preprocessing

import numpy as np
import os
import cv2
import random
from PIL import Image
from bbox.bbox_transform import clip_boxes
from dataset.imdb import flip_boxes
from utils.image_processing import color_transform

# Human-readable category names. The position in the list is the class index
# and index 0 is reserved for the background class.
classes = [
    '__background__',
    'airplane', 'antelope', 'bear', 'bicycle', 'bird',
    'bus', 'car', 'cattle', 'dog', 'domestic_cat',
    'elephant', 'fox', 'giant_panda', 'hamster', 'horse',
    'lion', 'lizard', 'monkey', 'motorcycle', 'rabbit',
    'red_panda', 'sheep', 'snake', 'squirrel', 'tiger',
    'train', 'turtle', 'watercraft', 'whale', 'zebra',
]

# Identifiers matched against the `name` field of the annotation XML (see
# load_vid_annotation); the position in the list is again the class index.
classes_map = [
    '__background__',
    'n02691156', 'n02419796', 'n02131653', 'n02834778', 'n01503061',
    'n02924116', 'n02958343', 'n02402425', 'n02084071', 'n02121808',
    'n02503517', 'n02118333', 'n02510455', 'n02342885', 'n02374451',
    'n02129165', 'n01674464', 'n02484322', 'n03790512', 'n02324045',
    'n02509815', 'n02411705', 'n01726692', 'n02355227', 'n02129604',
    'n04468005', 'n01662784', 'n04530566', 'n02062744', 'n02391049',
]

# Number of classes including the background entry.
num_classes = len(classes)

# Not referenced anywhere in the visible part of this module.
_roidb_file = []

# Cache of opened tar archives, filled by imread_from_tar; each entry is a
# dict with the keys 'path' and 'tarfile'.
_im_tfile = []


def imread_from_tar(filepath, frame, flag):
    """
    read a single frame out of a tar archive and decode it with opencv
    opened archives are remembered in the module-level list _im_tfile and
    reused on later calls; they are not closed here
    :param filepath: path of the tar archive
    :param frame: frame number, the member read is './%010d.jpg' % frame
    :param flag: flag forwarded to cv2.imdecode
    :return: decoded image, or None if the archive has no such member
    """
    global _im_tfile
    import tarfile

    # look for an archive that was already opened for this path
    for entry in _im_tfile:
        if entry['path'] == filepath:
            archive = entry['tarfile']
            break
    else:
        # first request for this path: open it and remember the handle
        archive = tarfile.open(filepath)
        _im_tfile.append({'path': filepath, 'tarfile': archive})

    member = './%010d.jpg' % frame
    if member not in archive.getnames():
        return None

    raw = archive.extractfile(member).read()
    return cv2.imdecode(np.asarray(bytearray(raw), dtype=np.uint8), flag)


# TODO: These two functions should be merged with the individual data loaders
def get_image(roidb, config):
    """
    preprocess image and return processed roidb
    :param roidb: a list of roidb records; each record is read for the keys
                  'pattern', 'image', 'flipped' and 'boxes'
    :param config: config object; SCALES, network.IMAGE_STRIDE and
                   network.PIXEL_MEANS are read
    :return: (processed_ims, processed_roidb)
             processed_ims is a list of image tensors produced by transform()
             processed_roidb holds copies of the records with 'boxes' scaled,
             rounded and clipped, plus the new item 'im_info'
             ([tensor height, tensor width, scale])
    0 --- x (width, second dim of im)
    |
    y (height, first dim of im)
    """
    read_flag = cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION
    processed_ims = []
    processed_roidb = []
    for roi_rec in roidb:
        if 'tar' in roi_rec['pattern']:
            # frames live inside a tar archive; 'image' is then the frame number
            # NOTE: the message uses '{}' because str.format ignores '%s'
            assert os.path.exists(roi_rec['pattern']), '{} does not exist'.format(roi_rec['pattern'])
            im = imread_from_tar(roi_rec['pattern'], roi_rec['image'], read_flag)
        else:
            assert os.path.exists(roi_rec['image']), '{} does not exist'.format(roi_rec['image'])
            im = cv2.imread(roi_rec['image'], read_flag)
        if roi_rec['flipped']:
            # mirror horizontally (reverse the width axis)
            im = im[:, ::-1, :]
        new_rec = roi_rec.copy()
        # pick one of the configured (target_size, max_size) pairs at random
        scale_ind = random.randrange(len(config.SCALES))
        target_size = config.SCALES[scale_ind][0]
        max_size = config.SCALES[scale_ind][1]
        im, im_scale = resize(im, target_size, max_size, stride=config.network.IMAGE_STRIDE)
        im_tensor = transform(im, config.network.PIXEL_MEANS)
        processed_ims.append(im_tensor)
        im_info = [im_tensor.shape[2], im_tensor.shape[3], im_scale]
        # boxes follow the image resize and are clipped to the tensor size
        new_rec['boxes'] = clip_boxes(np.round(roi_rec['boxes'].copy() * im_scale), im_info[:2])
        new_rec['im_info'] = im_info
        processed_roidb.append(new_rec)
    return processed_ims, processed_roidb


def get_pair_image(roidb, config):
    """
    preprocess a training image together with one reference image and
    return processed roidb
    :param roidb: a list of roidb records; each record is read for the keys
                  'image', 'flipped' and 'boxes', and optionally 'pattern',
                  'frame_seg_id' and 'frame_seg_len'
    :param config: config object; SCALES, TRAIN.MIN_OFFSET, TRAIN.MAX_OFFSET,
                   network.IMAGE_STRIDE and network.PIXEL_MEANS are read
    :return: (processed_ims, processed_ref_ims, processed_eq_flags, processed_roidb)
             eq flag is 1 when the reference frame is the training frame, else 0
             roidb add new item['im_info']
    0 --- x (width, second dim of im)
    |
    y (height, first dim of im)
    """
    read_flag = cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION
    processed_ims = []
    processed_ref_ims = []
    processed_eq_flags = []
    processed_roidb = []
    for roi_rec in roidb:
        eq_flag = 0  # 0 for unequal, 1 for equal
        # NOTE: the message uses '{}' because str.format ignores '%s'
        assert os.path.exists(roi_rec['image']), '{} does not exist'.format(roi_rec['image'])
        im = cv2.imread(roi_rec['image'], read_flag)

        # `in` instead of dict.has_key, which only exists in python 2
        if 'pattern' in roi_rec:
            # reference frame: random offset from the current frame, clamped
            # to [0, frame_seg_len - 1]
            ref_id = min(
                max(roi_rec['frame_seg_id'] + np.random.randint(config.TRAIN.MIN_OFFSET, config.TRAIN.MAX_OFFSET + 1),
                    0), roi_rec['frame_seg_len'] - 1)
            ref_image = roi_rec['pattern'] % ref_id
            assert os.path.exists(ref_image), '{} does not exist'.format(ref_image)
            ref_im = cv2.imread(ref_image, read_flag)
            if ref_id == roi_rec['frame_seg_id']:
                eq_flag = 1
        else:
            # no frame pattern: the image is its own reference
            ref_im = im.copy()
            eq_flag = 1

        if roi_rec['flipped']:
            # mirror both images horizontally
            im = im[:, ::-1, :]
            ref_im = ref_im[:, ::-1, :]

        new_rec = roi_rec.copy()
        # pick one of the configured (target_size, max_size) pairs at random
        scale_ind = random.randrange(len(config.SCALES))
        target_size = config.SCALES[scale_ind][0]
        max_size = config.SCALES[scale_ind][1]

        im, im_scale = resize(im, target_size, max_size, stride=config.network.IMAGE_STRIDE)
        # im_scale is overwritten with the scale of the reference image here
        ref_im, im_scale = resize(ref_im, target_size, max_size, stride=config.network.IMAGE_STRIDE)
        im_tensor = transform(im, config.network.PIXEL_MEANS)
        ref_im_tensor = transform(ref_im, config.network.PIXEL_MEANS)
        processed_ims.append(im_tensor)
        processed_ref_ims.append(ref_im_tensor)
        processed_eq_flags.append(eq_flag)
        im_info = [im_tensor.shape[2], im_tensor.shape[3], im_scale]
        new_rec['boxes'] = roi_rec['boxes'].copy() * im_scale
        new_rec['im_info'] = im_info
        processed_roidb.append(new_rec)
    return processed_ims, processed_ref_ims, processed_eq_flags, processed_roidb


def load_vid_annotation(image_path, data_path):
    """
    load image size and bounding boxes info from the XML file that belongs to an image
    the XML path is derived from image_path by replacing 'JPEG' with 'xml'
    and 'Data' with 'Annotations'
    :param image_path: path of the image
    :param data_path: not used by this function, kept so callers do not change
    :return: record['height', 'width', 'boxes', 'gt_classes', 'gt_overlaps',
                    'max_classes', 'max_overlaps', 'flipped']
    """

    import xml.etree.ElementTree as ET
    roi_rec = dict()
    filename = image_path.replace('JPEG', 'xml')
    filename = filename.replace('Data', 'Annotations')
    tree = ET.parse(filename)
    size = tree.find('size')
    roi_rec['height'] = float(size.find('height').text)
    roi_rec['width'] = float(size.find('width').text)

    objs = tree.findall('object')
    num_objs = len(objs)

    boxes = np.zeros((num_objs, 4), dtype=np.uint16)
    gt_classes = np.zeros((num_objs), dtype=np.int32)
    overlaps = np.zeros((num_objs, num_classes), dtype=np.float32)
    # builtin bool: the np.bool alias is gone from recent numpy releases
    valid_objs = np.zeros((num_objs), dtype=bool)

    class_to_index = dict(zip(classes_map, range(num_classes)))
    # Load object bounding boxes into a data frame.
    for ix, obj in enumerate(objs):
        # normalise the name once so the membership test and the lookup agree
        cls_name = (obj.find('name').text or '').lower().strip()
        if cls_name not in class_to_index:
            # objects of unknown classes are dropped
            continue
        bbox = obj.find('bndbox')
        # clamp the box to the image bounds
        x1 = np.maximum(float(bbox.find('xmin').text), 0)
        y1 = np.maximum(float(bbox.find('ymin').text), 0)
        x2 = np.minimum(float(bbox.find('xmax').text), roi_rec['width'] - 1)
        y2 = np.minimum(float(bbox.find('ymax').text), roi_rec['height'] - 1)
        valid_objs[ix] = True
        cls = class_to_index[cls_name]
        boxes[ix, :] = [x1, y1, x2, y2]
        gt_classes[ix] = cls
        overlaps[ix, cls] = 1.0

    # keep only the rows of objects with a known class
    boxes = boxes[valid_objs, :]
    gt_classes = gt_classes[valid_objs]
    overlaps = overlaps[valid_objs, :]

    assert (boxes[:, 2] >= boxes[:, 0]).all()

    roi_rec.update({'boxes': boxes,
                    'gt_classes': gt_classes,
                    'gt_overlaps': overlaps,
                    'max_classes': overlaps.argmax(axis=1),
                    'max_overlaps': overlaps.max(axis=1),
                    'flipped': False})
    return roi_rec


def img_aug(name, im, config, new_rec, origin_size, max_size, shape_diff=False):
    """
    augment one conditional frame and bring it to the size of the training image
    when new_rec[name] holds at least one box, SSDAugmentation is applied
    first; otherwise the image is only resized
    :param name: key of the boxes in new_rec; the part before the first '_'
                 is also the prefix of the matching '<prefix>_gt_classes' key
    :param im: input image
    :param config: config object; network.PIXEL_MEANS, network.IMAGE_STRIDE,
                   TRAIN.loss_frames and the TRAIN.ssd_* options are read
    :param new_rec: record holding the boxes; may be updated in place
    :param origin_size: (height, width) the output should have
    :param max_size: forwarded to resize_to when shape_diff is False
    :param shape_diff: if True, resize with resize_to_2 and paste the result
                       onto a canvas of origin_size filled with PIXEL_MEANS
    :return: (image, boxes); boxes is None when new_rec[name] was empty
    """
    from augmentations import SSDAugmentation

    gt_boxes = new_rec[name]
    if gt_boxes.shape[0] != 0:
        label = np.ones((gt_boxes.shape[0],), dtype=bool)
        augmenter = SSDAugmentation(mean=config.network.PIXEL_MEANS,
                                    expand_scale=config.TRAIN.ssd_expand_scale,
                                    crop_pert=config.TRAIN.ssd_crop_pert,
                                    color=config.TRAIN.ssd_color,
                                    no_iou_limit=config.TRAIN.ssd_no_iou_limit,
                                    )
        # mask is used further down to index the gt classes
        # (presumably it marks the boxes that survived -- confirm in augmentations)
        aug_im, bbs_aug, mask = augmenter(im.copy(), gt_boxes.copy(), label)
        out_im = aug_im.astype(np.uint8)
    else:
        # nothing to augment, only the resize below is applied
        bbs_aug = None
        out_im = im

    if shape_diff:
        out_im, scale = resize_to_2(out_im, origin_size[0], origin_size[1], stride=config.network.IMAGE_STRIDE)
        # paste the resized image into the top-left corner of a mean-filled canvas
        canvas = np.zeros((origin_size[0], origin_size[1], 3),
                          dtype=out_im.dtype)
        canvas[:, :, :] = config.network.PIXEL_MEANS
        canvas[0:out_im.shape[0], 0:out_im.shape[1], :] = out_im
        out_im = canvas
        im_scale = [scale, scale]
    else:
        out_im, im_scale = resize_to(out_im, origin_size, max_size,
                                     stride=config.network.IMAGE_STRIDE)

    if bbs_aug is not None:
        # the first four columns are scaled as (x, y, x, y)
        bbs_aug[:, :4] *= [im_scale[0], im_scale[1], im_scale[0], im_scale[1]]

    # need to have at least one box if used for loss
    if config.TRAIN.loss_frames != 1 and bbs_aug is not None:
        if bbs_aug.shape[0] != 0:
            new_rec[name] = bbs_aug.copy()
        else:
            new_rec[name] = np.zeros((1, 4))
        cls_key = name.split('_')[0] + '_gt_classes'
        new_rec[cls_key] = new_rec[cls_key][mask]

    return out_im, bbs_aug


def get_triple_image(roidb, config):
    """
    preprocess a training image together with two conditional frames
    ("bef" and "aft") and return processed roidb
    :param roidb: a list of roidb records; each record is read for the keys
                  'image', 'flipped' and 'boxes', optionally 'pattern',
                  'frame_seg_id' and 'frame_seg_len', and 'gt_classes' /
                  'width' when data augmentation is enabled
    :param config: config object; SCALES, network.IMAGE_STRIDE,
                   network.PIXEL_MEANS, dataset.dataset_path and several
                   TRAIN options are read
    :return: (processed_ims, processed_bef_ims, processed_aft_ims, processed_roidb)
             roidb add new item['im_info'], and with TRAIN.data_aug also
             'bef_boxes', 'aft_boxes', 'bef_gt_classes' and 'aft_gt_classes'
    0 --- x (width, second dim of im)
    |
    y (height, first dim of im)
    """
    read_flag = cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION
    processed_ims = []
    processed_bef_ims = []
    processed_aft_ims = []
    processed_roidb = []
    for roi_rec in roidb:
        # NOTE: the messages use '{}' because str.format ignores '%s'
        assert os.path.exists(roi_rec['image']), '{} does not exist'.format(roi_rec['image'])
        im = cv2.imread(roi_rec['image'], read_flag)

        shape_diff = False
        # `in` instead of dict.has_key, which only exists in python 2
        if 'pattern' in roi_rec:
            # get two different frames from the interval [frame_id + MIN_OFFSET, frame_id + MAX_OFFSET]
            offsets = np.random.choice(config.TRAIN.MAX_OFFSET - config.TRAIN.MIN_OFFSET + 1, 2, replace=False) \
                      + config.TRAIN.MIN_OFFSET

            # clamp both frame ids to [0, frame_seg_len - 1]
            bef_id = min(max(roi_rec['frame_seg_id'] + offsets[0], 0), roi_rec['frame_seg_len'] - 1)
            aft_id = min(max(roi_rec['frame_seg_id'] + offsets[1], 0), roi_rec['frame_seg_len'] - 1)
            bef_image = roi_rec['pattern'] % bef_id
            aft_image = roi_rec['pattern'] % aft_id

            assert os.path.exists(bef_image), '{} does not exist'.format(bef_image)
            assert os.path.exists(aft_image), '{} does not exist'.format(aft_image)
            bef_im = cv2.imread(bef_image, read_flag)
            aft_im = cv2.imread(aft_image, read_flag)

            if config.TRAIN.data_aug:
                data_path = config.dataset.dataset_path
                bef_rec = load_vid_annotation(bef_image, data_path)
                aft_rec = load_vid_annotation(aft_image, data_path)

            condition_has_flipped = False
        else:
            # for DET, conditional frames' gt boxes are the same as the training frame,
            # thus also flipped gt boxes if flip
            # only gt boxes has been flipped, not image
            condition_has_flipped = True
            bef_im = im.copy()
            aft_im = im.copy()

            if config.TRAIN.data_aug:
                bef_rec = roi_rec.copy()
                aft_rec = roi_rec.copy()

        # gt bbox is flipped outside this function when the flip flag is True
        if roi_rec['flipped']:
            # have to flip the training image since the gt boxes is already flipped
            im = im[:, ::-1, :]
            # do random flip on the conditional frames
            # be careful with the gt boxes if conditional frames are also included in loss
            # or using ssd-like augmentation, need the gt boxes correct for cropping the image
            if config.TRAIN.random_flip_condition_frames:
                flip_flags = np.random.choice(2, config.TRAIN.train_frames - 1, replace=True)
                if flip_flags[0]:
                    bef_im = bef_im[:, ::-1, :]
                if flip_flags[1]:
                    aft_im = aft_im[:, ::-1, :]
            else:
                flip_flags = np.ones(config.TRAIN.train_frames - 1)
                bef_im = bef_im[:, ::-1, :]
                aft_im = aft_im[:, ::-1, :]

            if condition_has_flipped:
                # boxes were copied from the already flipped record, so the
                # boxes that still need flipping are those of unflipped images
                flip_flags = 1 - flip_flags

        else:
            flip_flags = np.zeros(config.TRAIN.train_frames - 1)

        new_rec = roi_rec.copy()
        # pick one of the configured (target_size, max_size) pairs at random
        scale_ind = random.randrange(len(config.SCALES))
        target_size = config.SCALES[scale_ind][0]
        max_size = config.SCALES[scale_ind][1]

        im, im_scale = resize(im, target_size, max_size, stride=config.network.IMAGE_STRIDE)

        # data augmentation color change
        if config.TRAIN.data_aug:
            color_factor = config.TRAIN.COLOR_FACTOR
            im = color_transform(im, color_factor)
            bef_im = color_transform(bef_im, color_factor)
            aft_im = color_transform(aft_im, color_factor)

        # bbox transformation corresponding to image resize operation
        new_rec['boxes'] = roi_rec['boxes'].copy() * im_scale
        if config.TRAIN.data_aug and not config.TRAIN.noncur_aug:
            from augmentations import SSDAugmentation
            label = np.ones((new_rec['boxes'].shape[0],), dtype=bool)
            im_aug, bbs_aug, mask = SSDAugmentation(mean=config.network.PIXEL_MEANS,
                                                    expand_scale=config.TRAIN.ssd_expand_scale,
                                                    crop_pert=config.TRAIN.ssd_crop_pert,
                                                    color=config.TRAIN.ssd_color
                                                    )(im.copy(), new_rec['boxes'].copy(), label)
            im_aug = im_aug.astype(np.uint8)
            im_aug, _im_scale = resize(im_aug, target_size, max_size, stride=config.network.IMAGE_STRIDE)
            bbs_aug = bbs_aug * _im_scale

            # the augmented result is only used when the mask is not all zero
            if np.sum(mask) != 0:
                im = im_aug
                if bbs_aug.shape[0] != 0:
                    new_rec['boxes'] = bbs_aug.copy()
                else:
                    new_rec['boxes'] = np.zeros((1, 4))
                new_rec['gt_classes'] = new_rec['gt_classes'][mask]

        train_img_size = im.shape[:2]

        im_tensor = transform(im, config.network.PIXEL_MEANS)
        processed_ims.append(im_tensor)
        im_info = [im_tensor.shape[2], im_tensor.shape[3], im_scale]

        if config.TRAIN.data_aug:
            # correct label for conditional frames if their losses is used.
            if flip_flags[0]:
                old_boxes = bef_rec['boxes'].copy()
                flipped_boxes = flip_boxes(old_boxes, bef_rec['width'])
                bef_rec['boxes'] = flipped_boxes
            if flip_flags[1]:
                old_boxes = aft_rec['boxes'].copy()
                flipped_boxes = flip_boxes(old_boxes, aft_rec['width'])
                aft_rec['boxes'] = flipped_boxes

            new_rec['bef_gt_classes'] = bef_rec['gt_classes'].copy()
            new_rec['aft_gt_classes'] = aft_rec['gt_classes'].copy()

        bef_im, bef_im_scale = resize(bef_im, target_size, max_size, stride=config.network.IMAGE_STRIDE)
        aft_im, aft_im_scale = resize(aft_im, target_size, max_size, stride=config.network.IMAGE_STRIDE)

        if config.TRAIN.data_aug:
            new_rec['bef_boxes'] = bef_rec['boxes'].copy() * bef_im_scale
            new_rec['aft_boxes'] = aft_rec['boxes'].copy() * aft_im_scale

            # conditional frames are brought to the size of the training image
            origin_size = train_img_size
            bef_im, bef_bbs_aug = img_aug('bef_boxes', bef_im, config, new_rec, origin_size, max_size, shape_diff)
            aft_im, aft_bbs_aug = img_aug('aft_boxes', aft_im, config, new_rec, origin_size, max_size, shape_diff)

        bef_im_tensor = transform(bef_im, config.network.PIXEL_MEANS)
        aft_im_tensor = transform(aft_im, config.network.PIXEL_MEANS)
        processed_bef_ims.append(bef_im_tensor)
        processed_aft_ims.append(aft_im_tensor)

        new_rec['im_info'] = im_info
        processed_roidb.append(new_rec)

    return processed_ims, processed_bef_ims, processed_aft_ims, processed_roidb


def resize(im, target_size, max_size, stride=0, interpolation=cv2.INTER_LINEAR):
    """
    resize an image with a single scale factor and return that factor
    the factor maps the shorter side to target_size, unless that would make
    the longer side exceed max_size, in which case the longer side is mapped
    to max_size
    :param im: BGR image input by opencv
    :param target_size: one dimensional size (the short side)
    :param max_size: one dimensional max size (the long side)
    :param stride: if non-zero, zero-pad bottom/right so that height and width
                   are multiples of stride
    :param interpolation: interpolation method handed to cv2.resize
    :return: (image, scale); with a non-zero stride the image is a new
             zero-padded array created by np.zeros
    """
    short_side = min(im.shape[0], im.shape[1])
    long_side = max(im.shape[0], im.shape[1])

    im_scale = float(target_size) / float(short_side)
    # prevent bigger axis from being more than max_size
    if np.round(im_scale * long_side) > max_size:
        im_scale = float(max_size) / float(long_side)
    im = cv2.resize(im, None, None, fx=im_scale, fy=im_scale, interpolation=interpolation)

    if stride == 0:
        return im, im_scale

    # pad to the next multiple of stride, image stays in the top-left corner
    padded_h = int(np.ceil(im.shape[0] / float(stride)) * stride)
    padded_w = int(np.ceil(im.shape[1] / float(stride)) * stride)
    padded_im = np.zeros((padded_h, padded_w, im.shape[2]))
    padded_im[:im.shape[0], :im.shape[1], :] = im
    return padded_im, im_scale


def resize_to(im, target_size, max_size, stride=0, interpolation=cv2.INTER_LINEAR):
    """
    resize an image to an exact (height, width) and return per-axis scales
    the aspect ratio is not preserved
    :param im: BGR image input by opencv
    :param target_size: (height, width) of the resized image
    :param max_size: not used by this function
    :param stride: if non-zero, zero-pad bottom/right so that height and width
                   are multiples of stride
    :param interpolation: interpolation method handed to cv2.resize
    :return: (image, im_scale); im_scale is a float32 array where
             im_scale[0] is the width scale and im_scale[1] the height scale
    """
    out_h, out_w = target_size[0], target_size[1]
    width_scale = float(out_w) / float(im.shape[1])
    height_scale = float(out_h) / float(im.shape[0])
    im_scale = np.array([width_scale, height_scale], dtype=np.float32)

    # cv2.resize expects the destination size as (width, height)
    im = cv2.resize(im, (out_w, out_h), interpolation=interpolation)

    if stride == 0:
        return im, im_scale

    # pad to the next multiple of stride, image stays in the top-left corner
    padded_h = int(np.ceil(im.shape[0] / float(stride)) * stride)
    padded_w = int(np.ceil(im.shape[1] / float(stride)) * stride)
    padded_im = np.zeros((padded_h, padded_w, im.shape[2]))
    padded_im[:im.shape[0], :im.shape[1], :] = im
    return padded_im, im_scale


def transform(im, pixel_means):
    """
    transform into mxnet tensor
    subtract the pixel means and reverse the channel order
    :param im: [height, width, channel] in BGR
    :param pixel_means: [B, G, R pixel means]
    :return: [batch = 1, channel, height, width], created by np.zeros
    """
    height, width = im.shape[0], im.shape[1]
    im_tensor = np.zeros((1, 3, height, width))
    # output channel k is taken from input channel 2 - k
    for out_c, in_c in enumerate((2, 1, 0)):
        im_tensor[0, out_c] = im[:, :, in_c] - pixel_means[in_c]
    return im_tensor


def transform_seg_gt(gt):
    """
    transform segmentation gt image into mxnet tensor
    :param gt: [height, width] label image
    :return: [batch = 1, channel = 1, height, width], created by np.zeros
    """
    height, width = gt.shape[0], gt.shape[1]
    gt_tensor = np.zeros((1, 1, height, width))
    # copy the labels into the only batch / channel slot
    gt_tensor[0, 0] = gt
    return gt_tensor


def transform_inverse(im_tensor, pixel_means):
    """
    transform from mxnet im_tensor to ordinary RGB image
    im_tensor is limited to one image
    :param im_tensor: [batch = 1, channel = 3, height, width]
    :param pixel_means: [B, G, R pixel means], indexed with a list so it has
                        to support that (e.g. a numpy array)
    :return: im [height, width, channel(RGB)] as uint8
    """
    assert im_tensor.shape[0] == 1
    # work on a copy so the in-place add below does not touch the input,
    # move channels last and drop the batch axis
    restored = im_tensor.copy().transpose((0, 2, 3, 1))[0]
    assert restored.shape[2] == 3
    # means come as B, G, R and are re-ordered to match the tensor channels
    restored += pixel_means[[2, 1, 0]]
    return restored.astype(np.uint8)


def tensor_vstack(tensor_list, pad=0):
    """
    vertically stack tensors
    tensors are concatenated along the first axis; every other axis is
    enlarged to the biggest size found in the list and the unused area is
    filled with pad
    each tensor is placed at a running offset along the first axis, so the
    tensors do not need to share the same first dimension, and any number of
    dimensions >= 1 is supported
    :param tensor_list: non-empty list of numpy arrays with the same number
                        of dimensions
    :param pad: label to pad with
    :return: tensor with max shape, dtype taken from the first tensor
    """
    first = tensor_list[0]
    ndim = len(first.shape)
    dtype = first.dtype

    # first axis: total length, remaining axes: largest extent in the list
    dimensions = [sum(tensor.shape[0] for tensor in tensor_list)]
    for dim in range(1, ndim):
        dimensions.append(max(tensor.shape[dim] for tensor in tensor_list))

    all_tensor = np.full(tuple(dimensions), pad, dtype=dtype)

    offset = 0
    for tensor in tensor_list:
        rows = tensor.shape[0]
        # rows [offset, offset + rows) and the leading corner of the other axes
        region = (slice(offset, offset + rows),) + tuple(slice(0, size) for size in tensor.shape[1:])
        all_tensor[region] = tensor
        offset += rows
    return all_tensor


def resize_to_2(im, target_size, max_size, stride=0, interpolation=cv2.INTER_LINEAR):
    """
    resize an image with a single scale factor and return that factor
    unlike resize(), the sides are not sorted: the factor maps the height to
    target_size, unless that would make the width exceed max_size, in which
    case the width is mapped to max_size
    :param im: BGR image input by opencv
    :param target_size: wanted height
    :param max_size: upper limit for the width
    :param stride: if non-zero, zero-pad bottom/right so that height and width
                   are multiples of stride
    :param interpolation: interpolation method handed to cv2.resize
    :return: (image, scale); with a non-zero stride the image is a new
             zero-padded array created by np.zeros
    """
    src_h = im.shape[0]
    src_w = im.shape[1]

    im_scale = float(target_size) / float(src_h)
    # prevent the width from being more than max_size
    if np.round(im_scale * src_w) > max_size:
        im_scale = float(max_size) / float(src_w)
    im = cv2.resize(im, None, None, fx=im_scale, fy=im_scale, interpolation=interpolation)

    if stride == 0:
        return im, im_scale

    # pad to the next multiple of stride, image stays in the top-left corner
    padded_h = int(np.ceil(im.shape[0] / float(stride)) * stride)
    padded_w = int(np.ceil(im.shape[1] / float(stride)) * stride)
    padded_im = np.zeros((padded_h, padded_w, im.shape[2]))
    padded_im[:im.shape[0], :im.shape[1], :] = im
    return padded_im, im_scale