python source code of vfn

# -*- coding: utf-8 -*-
import tensorflow as tf
import numpy as np
import skimage.io as io
import skimage.transform as transform
from os.path import join
import network as nw
import argparse
import json
import time

global_dtype = tf.float32
global_dtype_np = np.float32
batch_size = 200


def overlap_ratio(x1, y1, w1, h1, x2, y2, w2, h2):
    intersection = max(0, min(x1 + w1, x2 + w2) - max(x1, x2)) * max(0, min(y1 + h1, y2 + h2) - max(y1, y2))
    union = (w1 * h1) + (w2 * h2) - intersection
    return float(intersection) / float(union)


def evaluate_sliding_window(img_filename, crops):
    img = io.imread(img_filename).astype(np.float32)/255
    if img.ndim == 2: # Handle B/W images
        img = np.expand_dims(img, axis=-1)
        img = np.repeat(img, 3, 2)

    img_crops = np.zeros((batch_size, 227, 227, 3))
    for i in xrange(len(crops)):
        crop = crops[i]
        img_crop = transform.resize(img[crop[1]:crop[1]+crop[3],crop[0]:crop[0]+crop[2]], (227, 227))-0.5
        img_crop = np.expand_dims(img_crop, axis=0)
        img_crops[i,:,:,:] = img_crop

    # compute ranking scores
    scores = sess.run([score_func], feed_dict={image_placeholder: img_crops})

    # find the optimal crop
    idx = np.argmax(scores[:len(crops)])
    best_window = crops[idx]

    # return the best crop
    return (best_window[0], best_window[1], best_window[2], best_window[3])


def evaluate_FCDB():
    slidling_windows_string = open('./sliding_window.json', 'r').read()
    sliding_windows = json.loads(slidling_windows_string)

    cnt = 0
    alpha = 0.75
    alpha_cnt = 0
    accum_boundary_displacement = 0
    accum_overlap_ratio = 0
    crop_cnt = 0

    for item in sliding_windows:
        # print 'processing', item['filename']
        crops = item['crops']
        img_filename = join('FCDB', item['filename'])
        img = io.imread(img_filename)
        height = img.shape[0]
        width = img.shape[1]

        # ground truth
        x = crops[0][0]
        y = crops[0][1]
        w = crops[0][2]
        h = crops[0][3]

        best_x, best_y, best_w, best_h = evaluate_sliding_window(img_filename, crops)
        boundary_displacement = (abs(best_x - x) + abs(best_x + best_w - x - w))/float(width) + (abs(best_y - y) + abs(best_y + best_h - y - h))/float(height)
        accum_boundary_displacement += boundary_displacement
        ratio = overlap_ratio(x, y, w, h, best_x, best_y, best_w, best_h)
        if ratio >= alpha:
            alpha_cnt += 1
        accum_overlap_ratio += ratio
        cnt += 1
        crop_cnt += len(crops)

    print 'Average overlap ratio: {:.4f}'.format(accum_overlap_ratio / cnt)
    print 'Average boundary displacement: {:.4f}'.format(accum_boundary_displacement / (cnt * 4.0))
    print 'Alpha recall: {:.4f}'.format(100 * float(alpha_cnt) / cnt)
    print 'Total image evaluated:', cnt
    print 'Average crops per image:', float(crop_cnt) / cnt


def evaluate_aesthetics_score(images):
    scores = np.zeros(shape=(len(images),))
    for i in range(len(images)):
        img = images[i].astype(np.float32)/255
        img_resize = transform.resize(img, (227, 227))-0.5
        img_resize = np.expand_dims(img_resize, axis=0)
        scores[i] = sess.run([score_func], feed_dict={image_placeholder: img_resize})[0]
    return scores


def str2bool(v):
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected.')


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--embedding_dim", help="Embedding dimension before mapping to one-dimensional score", type=int, default = 1000)
    parser.add_argument("--initial_parameters", help="Path to initial parameter file", type=str, default="alexnet.npy")
    parser.add_argument("--ranking_loss", help="Type of ranking loss", type=str, choices=['ranknet', 'svm'], default='svm')
    parser.add_argument("--snapshot", help="Name of the checkpoint files", type=str, default='./snapshots/model-spp-max')
    parser.add_argument("--spp", help="Whether to use spatial pyramid pooling in the last layer or not", type=str2bool, default=True)
    parser.add_argument("--pooling", help="Which pooling function to use", type=str, choices=['max', 'avg'], default='max')

    args = parser.parse_args()

    embedding_dim = args.embedding_dim
    ranking_loss = args.ranking_loss
    snapshot = args.snapshot
    net_data = np.load(args.initial_parameters).item()
    image_placeholder = tf.placeholder(dtype=global_dtype, shape=[batch_size,227,227,3])
    var_dict = nw.get_variable_dict(net_data)
    SPP = args.spp
    pooling = args.pooling
    with tf.variable_scope("ranker") as scope:
        feature_vec = nw.build_alexconvnet(image_placeholder, var_dict, embedding_dim, SPP=SPP, pooling=pooling)
        score_func = nw.score(feature_vec)

    # load pre-trained model
    saver = tf.train.Saver(tf.global_variables())
    sess = tf.Session(config=tf.ConfigProto())
    sess.run(tf.global_variables_initializer())
    saver.restore(sess, snapshot)

    print "Snapshot: {}".format(snapshot)
    start_time = time.time()
    evaluate_FCDB()
    print("--- %s seconds ---" % (time.time() - start_time))