# Python source code of datasets

import random

import numpy as np
import pandas as pd

from keras.applications import imagenet_utils
from scipy.misc.pilutil import imread

from params import args
from sklearn.model_selection import train_test_split
import sklearn.utils
from random_transform_mask import ImageWithMaskFunction
import os


def pad(image, padding_w, padding_h):
    """
    Zero-pad a batch of images symmetrically along height and width.

    # Arguments
        image: 4-D array whose shape unpacks as
            (batch_size, height, width, depth).
        padding_w: number of columns added on each side of the width axis.
        padding_h: number of rows added on each side of the height axis.

    # Returns
        A newly allocated array with the same dtype as `image` and shape
        (batch_size, height + 2 * padding_h, width + 2 * padding_w, depth).
        The input sits in the centre; the border is left at zero.
    """
    n_images, rows, cols, channels = image.shape
    padded_shape = (n_images, rows + 2 * padding_h, cols + 2 * padding_w, channels)

    # @TODO: Avoid creating new array
    padded = np.zeros(padded_shape, dtype=image.dtype)

    # Copy the input into the central window of the enlarged array.
    row_span = slice(padding_h, padding_h + rows)
    col_span = slice(padding_w, padding_w + cols)
    padded[:, row_span, col_span] = image

    # @TODO: Fill padded zones (they currently stay zero)
    return padded


def unpad(image, padding_w, padding_h=0):
    """
    Strip symmetric padding from a batch of images.

    Indexes `image` with four axes, taken here as
    (batch_size, height, width, depth) to match the layout unpacked by `pad`.

    # Arguments
        image: 4-D array to crop.
        padding_w: number of columns to drop from each side of the width
            axis (axis 2).
        padding_h: number of rows to drop from each side of the height
            axis (axis 1). Defaults to 0, i.e. height is left untouched.

    # Returns
        A slice of `image` with the padding removed.
    """
    # The end bound of each slice must come from the axis being sliced:
    # width is axis 2 and height is axis 1. Using the height to bound the
    # width slice crops non-square images incorrectly.
    height, width = image.shape[1], image.shape[2]
    return image[:, padding_h:(height - padding_h), padding_w:(width - padding_w), :]


def generate_filenames(car_ids):
    """
    Build the 16 per-angle image filenames for every car id.

    # Arguments
        car_ids: iterable of car identifiers.

    # Returns
        A list of '<car_id>_<NN>.jpg' strings, where NN runs from 01 to 16.
        Names are grouped by angle: all ids for angle 01 come first, then
        all ids for angle 02, and so on.
    """
    filenames = []
    for angle_number in range(1, 17):
        angle_suffix = str(angle_number).zfill(2)
        for car_id in car_ids:
            filenames.append('{}_{}.jpg'.format(car_id, angle_suffix))
    return filenames

def bootstrapped_split(car_ids, seed=args.seed):
    """
    Split car ids into a bootstrapped training set and a validation set.

    The ids are first split with `train_test_split` (validation share taken
    from `args.test_size_float`). The training part is then resampled with
    replacement to the same size, so some training ids appear several times
    and others not at all. The validation part is left as is.

    # Arguments
        car_ids: sequence of car identifiers.
        seed: random seed used both for the train/validation split and for
            the bootstrap resampling.

    # Returns
        A tuple (train_filenames, valid_filenames), each a list of image
        filenames produced by `generate_filenames`.
    """
    # Split the Series (not the raw input) so both parts support `.iloc`
    # and `.values` regardless of the container type passed in.
    all_ids = pd.Series(car_ids)
    train_ids, valid_ids = train_test_split(all_ids, test_size=args.test_size_float,
                                            random_state=seed)

    # Seeds the global NumPy RNG, which is a side effect visible to callers.
    np.random.seed(seed)
    # Draw one position per training id. `randint` excludes the upper bound,
    # so every index is valid; a `size` is required to get a full resample
    # rather than a single scalar.
    bootstrapped_idx = np.random.randint(0, len(train_ids), size=len(train_ids))
    # Positional selection: the split leaves a non-contiguous index behind.
    bootstrapped_train_ids = train_ids.iloc[bootstrapped_idx]

    return generate_filenames(bootstrapped_train_ids.values), generate_filenames(valid_ids.values)


def _read_stacked_image(filename, img_dir):
    """Read one image and stack the configured extra channels onto its depth axis."""
    img = imread(os.path.join(img_dir, filename))

    # Extra channels live under <stacked_channels_dir>/<i>/ and share the
    # image's name with a .png extension; they are read in mode 'L'.
    channel_name = filename.replace('.jpg', '.png')
    stacked_channels = [
        imread(os.path.join(args.stacked_channels_dir, str(i), channel_name), mode='L')
        for i in range(args.stacked_channels)
    ]
    return np.dstack((img, *stacked_channels))


def build_batch_generator(filenames, img_dir=None, batch_size=None,
                          shuffle=False, transformations=None,
                          out_size=None, crop_size=None, mask_dir=None, aug=False):
    """
    Endlessly yield (images, masks) batches built from `filenames`.

    # Arguments
        filenames: sized, sliceable collection of image filenames.
        img_dir: directory the image files are read from.
        batch_size: maximum number of files per batch; the last batch of a
            pass may be shorter.
        shuffle: if true, reshuffle `filenames` at the start of every pass.
        transformations: currently unused by this function.
        out_size, crop_size, mask_dir: forwarded to `ImageWithMaskFunction`.
        aug: forwarded to `ImageWithMaskFunction.mask_pred`.

    # Yields
        A tuple (batch_x, masks): `batch_x` is the output of
        `imagenet_utils.preprocess_input` (mode taken from
        `args.preprocessing_function`) and `masks` is whatever `mask_pred`
        returned, padded when `crop_size` is None.

    # Raises
        ValueError: if `filenames` is empty.
    """
    # With nothing to iterate, the endless loop below would spin forever
    # without ever yielding; fail loudly instead.
    if len(filenames) == 0:
        raise ValueError('build_batch_generator requires at least one filename')

    mask_function = ImageWithMaskFunction(out_size=out_size, crop_size=crop_size, mask_dir=mask_dir)

    while True:
        # @TODO: Should we fixate the seed here?
        if shuffle:
            filenames = sklearn.utils.shuffle(filenames)

        for start in range(0, len(filenames), batch_size):
            end = min(start + batch_size, len(filenames))
            train_batch = filenames[start:end]

            batch_x = np.array(
                [_read_stacked_image(filename, img_dir) for filename in train_batch],
                np.float32,
            )

            # NOTE(review): the index array is sized to the actual batch so
            # it never exceeds a short final batch (identical to
            # range(batch_size) for full batches). Presumably these are
            # positions into `train_batch` -- confirm against mask_pred.
            batch_x, masks = mask_function.mask_pred(batch_x, train_batch, range(len(train_batch)), aug)

            if crop_size is None:
                # @TODO: Remove hardcoded padding
                batch_x, masks = pad(batch_x, 1, 0), pad(masks, 1, 0)

            yield imagenet_utils.preprocess_input(batch_x, mode=args.preprocessing_function), masks