import os
import cv2
import h5py
import parmap
import argparse
import numpy as np
try:
    import cPickle as pickle
except ImportError:
    import pickle
from skimage import color
from tqdm import tqdm
import sklearn.neighbors as nn
import matplotlib.pylab as plt
from matplotlib.colors import LogNorm
import matplotlib.gridspec as gridspec
from scipy.interpolate import interp1d
from scipy.signal import convolve
try:
    from scipy.signal.windows import gaussian  # SciPy >= 1.1
except ImportError:
    from scipy.signal import gaussian  # older SciPy, removed in recent releases


def format_image(img_path, size):
    """
    Load img with opencv and reshape
    """
    img_color = cv2.imread(img_path)
    # OpenCV loads images as BGR; reverse the channel axis to get RGB
    img_color = img_color[:, :, ::-1]
    img_black = cv2.imread(img_path, 0)  # flag 0: load as grayscale

    img_color = cv2.resize(img_color, (size, size), interpolation=cv2.INTER_AREA)
    img_black = cv2.resize(img_black, (size, size), interpolation=cv2.INTER_AREA)

    img_lab = color.rgb2lab(img_color)

    # Reshape to (1, channels, height, width)
    img_lab = img_lab.reshape((1, size, size, 3)).transpose(0, 3, 1, 2)
    img_color = img_color.reshape((1, size, size, 3)).transpose(0, 3, 1, 2)
    img_black = img_black.reshape((1, size, size, 1)).transpose(0, 3, 1, 2)

    return img_color, img_lab, img_black


def build_HDF5(size=64):
    """
    Gather the data in a single HDF5 file.
    """

    # Read the evaluation partition file.
    # In the evaluation status, "0" marks a training image, "1" a
    # validation image and "2" a test image.
    d_partition = {}
    with open(os.path.join(raw_dir, "Eval/list_eval_partition.txt"), "r") as f:
        lines = f.readlines()
        for celeb in lines:
            celeb = celeb.rstrip().split()
            img = celeb[0]
            attrs = int(celeb[1])
            d_partition[img] = attrs
    with open(os.path.join(data_dir, "d_partition.pickle"), "wb") as fd:
        pickle.dump(d_partition, fd)

    # Put the data in HDF5, one set of datasets per split
    hdf5_file = os.path.join(data_dir, "CelebA_%s_data.h5" % size)
    with h5py.File(hdf5_file, "w") as hfw:

        for dset_idx, dset_type in enumerate(["training", "validation", "test"]):

            list_img = []
            for img in d_partition.keys():
                if d_partition[img] == dset_idx:
                    list_img.append(os.path.join(raw_dir, "img_align_celeba", img))
            list_img = np.array(list_img)

            data_color = hfw.create_dataset("%s_color_data" % dset_type,
                                            (0, 3, size, size),
                                            maxshape=(None, 3, size, size),
                                            dtype=np.uint8)

            data_lab = hfw.create_dataset("%s_lab_data" % dset_type,
                                          (0, 3, size, size),
                                          maxshape=(None, 3, size, size),
                                          dtype=np.float64)

            data_black = hfw.create_dataset("%s_black_data" % dset_type,
                                            (0, 1, size, size),
                                            maxshape=(None, 1, size, size),
                                            dtype=np.uint8)

            num_files = len(list_img)
            chunk_size = 1000
            # Integer division: np.array_split needs an int number of sections
            num_chunks = max(1, num_files // chunk_size)
            arr_chunks = np.array_split(np.arange(num_files), num_chunks)

            for chunk_idx in tqdm(arr_chunks):

                list_img_path = list_img[chunk_idx].tolist()
                output = parmap.map(format_image, list_img_path, size, pm_parallel=True)

                arr_img_color = np.vstack([o[0] for o in output if o[0].shape[0] > 0])
                arr_img_lab = np.vstack([o[1] for o in output if o[0].shape[0] > 0])
                arr_img_black = np.vstack([o[2] for o in output if o[0].shape[0] > 0])

                # Grow the HDF5 datasets, then append the new chunk at the end
                data_color.resize(data_color.shape[0] + arr_img_color.shape[0], axis=0)
                data_lab.resize(data_lab.shape[0] + arr_img_lab.shape[0], axis=0)
                data_black.resize(data_black.shape[0] + arr_img_black.shape[0], axis=0)

                data_color[-arr_img_color.shape[0]:] = arr_img_color.astype(np.uint8)
                data_lab[-arr_img_lab.shape[0]:] = arr_img_lab.astype(np.float64)
                data_black[-arr_img_black.shape[0]:] = arr_img_black.astype(np.uint8)
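
# --- Illustrative helper (not part of the original pipeline) ---
# A minimal sketch of how the datasets written by build_HDF5 can be read
# back lazily, batch by batch, without loading the whole file into memory.
# The function name and batch_size parameter are assumptions for
# illustration; h5py datasets support numpy-style slicing directly, and
# slicing reads only the requested rows from disk.
def iter_hdf5_batches(hdf5_file, dset_name="training_lab_data", batch_size=32):
    """Yield successive batches from one dataset of the HDF5 file."""
    with h5py.File(hdf5_file, "r") as hf:
        data = hf[dset_name]
        for start in range(0, data.shape[0], batch_size):
            yield data[start:start + batch_size]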
def compute_color_prior(size=64, do_plot=False):

    # Load the locations of the quantized ab gamut points
    q_ab = np.load(os.path.join(data_dir, "pts_in_hull.npy"))

    if do_plot:
        plt.figure(figsize=(15, 15))
        gs = gridspec.GridSpec(1, 1)
        ax = plt.subplot(gs[0])
        ax.scatter(q_ab[:, 0], q_ab[:, 1])
        for i in range(q_ab.shape[0]):
            ax.annotate(str(i), (q_ab[i, 0], q_ab[i, 1]), fontsize=6)
        ax.set_xlim([-110, 110])
        ax.set_ylim([-110, 110])

    with h5py.File(os.path.join(data_dir, "CelebA_%s_data.h5" % size), "a") as hf:
        # Compute the color prior over a subset of the training set,
        # otherwise it takes quite long
        X_ab = hf["training_lab_data"][:100000][:, 1:, :, :]
        npts, c, h, w = X_ab.shape
        X_a = np.ravel(X_ab[:, 0, :, :])
        X_b = np.ravel(X_ab[:, 1, :, :])
        X_ab = np.vstack((X_a, X_b)).T

        if do_plot:
            plt.hist2d(X_ab[:, 0], X_ab[:, 1], bins=100, norm=LogNorm())
            plt.xlim([-110, 110])
            plt.ylim([-110, 110])
            plt.colorbar()
            plt.show()
            plt.clf()
            plt.close()

        # Create a nearest neighbors instance with index = q_ab
        NN = 1
        nearest = nn.NearestNeighbors(n_neighbors=NN, algorithm='ball_tree').fit(q_ab)
        # Find the index of the nearest gamut point for each ab pixel
        dists, ind = nearest.kneighbors(X_ab)

        # We now count the number of occurrences of each color bin
        ind = np.ravel(ind)
        counts = np.bincount(ind)
        idxs = np.nonzero(counts)[0]
        prior_prob = np.zeros((q_ab.shape[0]))
        prior_prob[idxs] = counts[idxs]

        # Normalize the counts into a color probability
        prior_prob = prior_prob / (1.0 * np.sum(prior_prob))

        # Save
        np.save(os.path.join(data_dir, "CelebA_%s_prior_prob.npy" % size), prior_prob)

        if do_plot:
            plt.hist(prior_prob, bins=100)
            plt.yscale("log")
            plt.show()


def smooth_color_prior(size=64, sigma=5, do_plot=False):

    prior_prob = np.load(os.path.join(data_dir, "CelebA_%s_prior_prob.npy" % size))
    # Add an epsilon to the prior prob to avoid 0 values and possible NaN
    prior_prob += 1E-3 * np.min(prior_prob)
    # Renormalize
    prior_prob = prior_prob / (1.0 * np.sum(prior_prob))

    # Smooth with a Gaussian: upsample the prior, convolve with a
    # normalized Gaussian window, then sample back at the bin positions
    f = interp1d(np.arange(prior_prob.shape[0]), prior_prob)
    xx = np.linspace(0, prior_prob.shape[0] - 1, 1000)
    yy = f(xx)
    window = gaussian(2000, sigma)  # 2000 pts in the window, sigma=5
    smoothed = convolve(yy, window / window.sum(), mode='same')
    fout = interp1d(xx, smoothed)
    prior_prob_smoothed = np.array([fout(i) for i in range(prior_prob.shape[0])])
    prior_prob_smoothed = prior_prob_smoothed / np.sum(prior_prob_smoothed)

    # Save
    file_name = os.path.join(data_dir, "CelebA_%s_prior_prob_smoothed.npy" % size)
    np.save(file_name, prior_prob_smoothed)

    if do_plot:
        plt.plot(prior_prob)
        plt.plot(prior_prob_smoothed, "g--")
        plt.plot(xx, smoothed, "r-")
        plt.yscale("log")
        plt.show()


def compute_prior_factor(size=64, gamma=0.5, alpha=1, do_plot=False):

    file_name = os.path.join(data_dir, "CelebA_%s_prior_prob_smoothed.npy" % size)
    prior_prob_smoothed = np.load(file_name)

    # Mix the smoothed prior with a uniform distribution, then take the
    # -alpha power so that rare colors receive larger weights
    u = np.ones_like(prior_prob_smoothed)
    u = u / np.sum(1.0 * u)

    prior_factor = (1 - gamma) * prior_prob_smoothed + gamma * u
    prior_factor = np.power(prior_factor, -alpha)

    # Renormalize so the expected weight under the prior is 1
    prior_factor = prior_factor / (np.sum(prior_factor * prior_prob_smoothed))

    file_name = os.path.join(data_dir, "CelebA_%s_prior_factor.npy" % size)
    np.save(file_name, prior_factor)

    if do_plot:
        plt.plot(prior_factor)
        plt.yscale("log")
        plt.show()
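
# --- Illustrative helper (not part of the original pipeline) ---
# A minimal sketch of how the saved prior factor is typically consumed at
# training time: each ab pixel is mapped to its nearest gamut bin and the
# corresponding rebalancing weight is looked up, so rare colors contribute
# more to the loss. The function name is an assumption; the
# nearest-neighbor lookup mirrors compute_color_prior above.
def pixel_weights(X_ab, q_ab, prior_factor):
    """Return one rebalancing weight per ab pixel.

    X_ab: ab values, shape (npixels, 2).
    q_ab: gamut bin centers, shape (nbins, 2).
    prior_factor: per-bin weights, shape (nbins,).
    """
    nearest = nn.NearestNeighbors(n_neighbors=1, algorithm="ball_tree").fit(q_ab)
    _, ind = nearest.kneighbors(X_ab)
    return prior_factor[np.ravel(ind)]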
def check_HDF5(size=64):
    """
    Plot processed images to check the pipeline output
    """

    # Get hdf5 file
    hdf5_file = os.path.join(data_dir, "CelebA_%s_data.h5" % size)

    with h5py.File(hdf5_file, "r") as hf:
        data_color = hf["training_color_data"]
        data_lab = hf["training_lab_data"]
        data_black = hf["training_black_data"]
        for i in range(data_color.shape[0]):
            fig = plt.figure()
            gs = gridspec.GridSpec(3, 1)
            for k in range(3):
                ax = plt.subplot(gs[k])
                if k == 0:
                    img = data_color[i, :, :, :].transpose(1, 2, 0)
                    ax.imshow(img)
                elif k == 1:
                    img = data_lab[i, :, :, :].transpose(1, 2, 0)
                    img = color.lab2rgb(img)
                    ax.imshow(img)
                elif k == 2:
                    img = data_black[i, 0, :, :] / 255.
                    ax.imshow(img, cmap="gray")
            gs.tight_layout(fig)
            plt.show()
            plt.clf()
            plt.close()


if __name__ == '__main__':

    parser = argparse.ArgumentParser(description='Build dataset')
    parser.add_argument('--img_size', default=64, type=int,
                        help='Desired width == height')
    # action="store_true" instead of type=bool: argparse's type=bool treats
    # any non-empty string (including "False") as True
    parser.add_argument('--do_plot', action="store_true",
                        help='Whether to visualize statistics when computing the color prior')
    args = parser.parse_args()

    raw_dir = "../../data/raw"
    data_dir = "../../data/processed"

    for d in [raw_dir, data_dir]:
        if not os.path.exists(d):
            os.makedirs(d)

    build_HDF5(size=args.img_size)
    compute_color_prior(size=args.img_size, do_plot=args.do_plot)
    smooth_color_prior(size=args.img_size, do_plot=args.do_plot)
    compute_prior_factor(size=args.img_size, do_plot=args.do_plot)
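
# Example invocation (illustrative; the script filename is an assumption):
#   python make_dataset.py --img_size 64 --do_plot
# With action="store_true", passing the bare --do_plot flag enables the
# diagnostic plots; omitting it leaves them off.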