# python source code of pipeline

# common python
import decimal
from collections import OrderedDict

# scientific python
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

# foxhound
from foxhound.preprocessing import Tokenizer
from foxhound import iterators
from foxhound.theano_utils import floatX, intX
from foxhound.transforms import SeqPadded
from foxhound.rng import py_rng

# fuel
from fuel.transformers import Merge

# local imports
from dataset import (coco, cocoXYFilenames, FoxyDataStream, GloveTransformer,
    ShuffleBatch, FoxyIterationScheme, loadFeaturesTargets, fillOutFilenames,
    sbuXYFilenames)
from utils import dict2json, vStackMatrices, DecimalEncoder, ModelIO

def sampleCaptions(ymb, K=1):
    """Draw K random captions for every image in a minibatch.

    ymb = minibatch of captions, one list of captions per image.
    Each image's list is sampled with py_rng.sample(captions, K) and the
    picks are flattened, in image order, into a single list.
    """
    return [
        picked
        for image_captions in ymb
        for picked in py_rng.sample(image_captions, K)
    ]

def concatCaptions(ymb, K=5):
    """Join the first K captions of every image into one string.

    ymb = minibatch of captions, one list of caption strings per image.
    While destroying some sentence order when concatenating, this is
    helpful when we only care about the presence of a token.

    Returns a list with one space-joined string per image. A list is built
    explicitly (rather than returning `map(...)`) so the result can be
    indexed, measured and iterated more than once on Python 3 as well as
    Python 2.
    """
    return [" ".join(captions_of_an_img[:K]) for captions_of_an_img in ymb]

def prepVect(min_df=2, max_features=50000, n_captions=5, n_sbu=None,
             multilabel=False):
    """Fit the word Tokenizer on the COCO captions (optionally plus SBU).

    Captions come from coco(mode='full', n_captions=n_captions); when n_sbu
    is truthy the captions returned by sbuXYFilenames(n_sbu) are appended.
    n_captions captions are then sampled per entry and used to fit a
    Tokenizer(min_df, max_features).

    Returns the fitted tokenizer, or (tokenizer, MultiLabelBinarizer) when
    multilabel is true; the binarizer is fitted on the tokenized captions.
    """
    print("prepping the Word Tokenizer...")
    _x0, _x1, caption_lists, _x3 = coco(mode='full', n_captions=n_captions)
    if n_sbu:
        _s0, sbu_captions, _s2 = sbuXYFilenames(n_sbu)
        caption_lists.extend(sbu_captions)

    tokenizer = Tokenizer(min_df=min_df, max_features=max_features)
    corpus = sampleCaptions(caption_lists, n_captions)
    tokenizer.fit(corpus)

    if not multilabel:
        return tokenizer

    binarizer = MultiLabelBinarizer()
    binarizer.fit(tokenizer.transform(corpus))
    return tokenizer, binarizer

# Module-level setup: load the cached tokenizer (`vect`) and multi-label
# binarizer (`mlb`) for the chosen dataset, or fit and cache them when they
# cannot be loaded.
dataset_name = 'coco_train2014'
n_sbu=None
if n_sbu:
    dataset_name += "+sbu%d" % n_sbu
# global vectorizer
vect_name = 'tokenizer_%s' % dataset_name
mlb_name = 'mlb_%s' % dataset_name
try:
    if mlb_name:
        mlb = ModelIO.load(mlb_name)
        print("MLB loaded from file")
    vect = ModelIO.load(vect_name)
    # vect = ModelIO.load('tokenizer_reddit') # gloveglove
    print("Tokenizer loaded from file.")
except Exception:
    # Any load failure falls back to refitting. `Exception` (not a bare
    # `except`) so that Ctrl-C / SystemExit are not swallowed and turned into
    # an expensive refit.
    if mlb_name:
        vect, mlb = prepVect(n_sbu=n_sbu, n_captions=1, multilabel=True)
        ModelIO.save(vect, vect_name)
        ModelIO.save(mlb, mlb_name)
        print("Saved %s, %s for future use." % (vect_name, mlb_name))
    else:
        vect = prepVect(n_sbu=n_sbu, n_captions=1)
        ModelIO.save(vect, vect_name)
        print("Saved %s for future use." % vect_name)

class DataETL():
    """Namespace for assembling the data streams used by training and evaluation."""

    @staticmethod
    def getFinalStream(X, Y, sources, sources_k, batch_size=128, embedding_dim=300,
        shuffle=False):
        """
        Build the merged stream of regular and contrastive batches over (X, Y).

        Parameters
        ----------
        X, Y: features and caption lists; both sub-streams read this same data
        sources, sources_k: source names of the regular / contrastive stream
        batch_size: minibatch size for the iterators and iteration schemes
        embedding_dim: picks the GloVe file "glove.6B.<embedding_dim>d.txt.gz"
        shuffle: whether the regular iterator shuffles (the contrastive
            iterator always does)

        Returns
        -------
        merged stream with sources = sources + sources_k
        """
        # Transforms
        def to_float(features):
            return floatX(features)

        def encode_captions(caption_lists):
            # one sampled caption per image -> token ids -> back-padded ints
            chosen = sampleCaptions(caption_lists)
            return intX(SeqPadded(vect.transform(chosen), 'back'))

        # Foxhound Iterators
        # RCL: Write own iterator to sample positive examples/captions, since there are 5 for each image.
        regular_iterator = iterators.Linear(
            trXt=to_float, trYt=encode_captions, size=batch_size, shuffle=shuffle
        )
        contrastive_iterator = iterators.Linear(
            trXt=to_float, trYt=encode_captions, size=batch_size, shuffle=True
        )

        # FoxyDataStreams over the same (X, Y) pair
        regular_stream = FoxyDataStream(
            (X, Y),
            sources,
            regular_iterator,
            FoxyIterationScheme(len(X), batch_size),
        )
        contrastive_stream = FoxyDataStream(
            (X, Y),
            sources_k,
            contrastive_iterator,
            FoxyIterationScheme(len(X), batch_size),
        )

        # GloVe lookup on both streams
        glove_file = "glove.6B.%sd.txt.gz" % embedding_dim
        regular_glove = GloveTransformer(
            glove_file, data_stream=regular_stream, vectorizer=vect
        )
        contrastive_glove = GloveTransformer(
            glove_file, data_stream=contrastive_stream, vectorizer=vect
        )

        # Final Data Streams w/ contrastive examples
        merged = Merge(
            (regular_glove, ShuffleBatch(contrastive_glove)),
            sources + sources_k,
        )
        merged.iteration_scheme = FoxyIterationScheme(len(X), batch_size)
        return merged


class ModelEval():

    @staticmethod
    def rankcaptions(filenames, top_n=5):
        # n_captions = top_n # the captions it ranks as highest should all be relevant
        n_captions = 1 # RCL: image caption mismatch when n_captions is not just one
        batch_size = 128
        image_features, captions = loadFeaturesTargets(filenames, 'val2014', n_captions=n_captions)
        stream = DataETL.getFinalStream(
              image_features
            , captions
            , ("image_vects", "word_vects")
            , ("image_vects_k", "word_vects_k")
            , batch_size=batch_size
            )

        f_emb = ModelIO.load('/home/luke/datasets/coco/predict/fullencoder_maxfeatures.50000')
        im_emb, s_emb = None, None
        print "Computing Image and Text Embeddings"
        for batch in stream.get_epoch_iterator():
            im_vects = batch[0]
            s_vects = batch[1]
            batch_im_emb, batch_s_emb = f_emb(im_vects, s_vects)
            im_emb = vStackMatrices(im_emb, batch_im_emb)
            s_emb = vStackMatrices(s_emb, batch_s_emb)

        # account for make sure theres matching fns for each of the n_captions
        image_fns = fillOutFilenames(filenames, n_captions=n_captions)

        print "Computing Cosine Distances and Ranking Captions"
        relevant_captions = ModelEval.getRelevantCaptions(
            im_emb, s_emb, image_fns, captions, z=n_captions, top_n=top_n
        )
        dict2json(relevant_captions, "rankcaptions_fullencoder_maxfeatures.50000.json", cls=DecimalEncoder)
        return relevant_captions

    @staticmethod
    def rankscores(final_train_stream, final_test_stream, f_emb):

        i2t = ModelEval.i2t
        train_ep = final_train_stream.get_epoch_iterator()
        test_ep = final_test_stream.get_epoch_iterator()

        train_metrics = []
        test_metrics = []
        for train_data, test_data in train_ep, test_ep:
            im_emb, s_emb = f_emb(*train_data)
            train_metrics.append(i2t(im_emb, s_emb))
            im_emb, s_emb = f_emb(*train_data)
            test_metrics.append(i2t(im_emb, s_emb))
        train_metrics = np.vstack(train_metrics)
        test_metrics = np.vstack(test_metrics)

        metric_names = ("r1", "r5", "r10", "med")
        print "\nMean Metric Scores:"
        for i, metric_name in enumerate(metric_names):
            for metrics in (train_metrics, test_metrics):
                print "%s: %d" % metric_name, np.mean(metrics[:, i])

        return train_metrics, test_metrics

    @staticmethod
    def i2t(images, captions, z=1, npts=None):
        """
        Taken from https://github.com/ryankiros/skip-thoughts/blob/master/eval_rank.py
        Images: (z*N, K) matrix of image embeddings
        Captions: (z*N, K) matrix of caption embeddings
        """
        if npts == None:
            npts = images.shape[0] / z
        index_list = []

        # Project captions
        for i in range(len(captions)):
            captions[i] /= np.linalg.norm(captions[i])

        ranks = np.zeros(npts)
        for index in range(npts):

            # Get query image
            im = images[z * index].reshape(1, images.shape[1])
            im /= np.linalg.norm(im)

            # Compute scores
            d = np.dot(im, captions.T).flatten()
            inds = np.argsort(d)[::-1]
            index_list.append(inds[0])

            # Score
            rank = 1e20
            for i in range(z*index, z*index + z, 1):
                tmp = np.where(inds == i)[0][0]
                if tmp < rank:
                    rank = tmp
            ranks[index] = rank

        # Compute metrics
        r1 = 100.0 * len(np.where(ranks < 1)[0]) / len(ranks)
        r5 = 100.0 * len(np.where(ranks < 5)[0]) / len(ranks)
        r10 = 100.0 * len(np.where(ranks < 10)[0]) / len(ranks)
        medr = np.floor(np.median(ranks)) + 1
        return (r1, r5, r10, medr)

    @staticmethod
    def t2i(images, captions, z=1, npts=None):
        """
        Taken from https://github.com/ryankiros/skip-thoughts/blob/master/eval_rank.py
        Images: (z*N, K) matrix of image embeddings
        Captions: (z*N, K) matrix of captions embeddings
        """
        if npts == None:
            npts = images.shape[0] / z
        ims = np.array([images[i] for i in range(0, len(images), z)])


        # Project images
        for i in range(len(ims)):
            ims[i] /= np.linalg.norm(ims[i])

        # Project captions
        for i in range(len(captions)):
            captions[i] /= np.linalg.norm(captions[i])

        ranks = np.zeros(z * npts)
        for index in range(npts):

            # Get query captions
            queries = captions[z*index : z*index + z]

            # Compute scores
            d = np.dot(queries, ims.T)
            inds = np.zeros(d.shape)
            for i in range(len(inds)):
                inds[i] = np.argsort(d[i])[::-1]
                ranks[z * index + i] = np.where(inds[i] == index)[0][0]

        # Compute metrics
        r1 = 100.0 * len(np.where(ranks < 1)[0]) / len(ranks)
        r5 = 100.0 * len(np.where(ranks < 5)[0]) / len(ranks)
        r10 = 100.0 * len(np.where(ranks < 10)[0]) / len(ranks)
        medr = np.floor(np.median(ranks)) + 1
        return (r1, r5, r10, medr)

    @staticmethod
    def ImageSentenceRanking(images, captions, z=1):
        """
        Print nicely formatted tables each iteration
        N = 1000 is commonly used.
        images: (N, K)
        captions: (N, K)
        z: number of images per caption (see i2t, t2i)
        """
        rank_labels = ('R @ 1', 'R @ 5', 'R @ 10', 'Med R')
        image_annotation = ModelEval.i2t(images, captions, z)
        image_search = ModelEval.t2i(images, captions, z)

        print pd.DataFrame(OrderedDict(zip(rank_labels, image_annotation)), 
            index=pd.Index(["Image Annotation"])).to_string()
        print pd.DataFrame(OrderedDict(zip(rank_labels, image_search)), 
            index=pd.Index(["Image Search    "])).to_string()

        return image_annotation, image_search

    @staticmethod
    def ImageSearchSingleCategory(ims, mlb_matrix, captions, category_key, thresh):
        """do a single category, like dog"""
        # project images
        for i in range(len(ims)):
            ims[i] /= np.linalg.norm(ims[i])
        
        # project single captions
        for i in range(len(captions)):
            captions[i] /= np.linalg.norm(captions[i])

        assert captions.shape[0] == 1

        sims = np.dot(captions, ims.T).flatten()
        
        found = []
        n_found = 0
        n_matches = 0
        for i in range(len(sims)):
            if sims[i] > thresh:
                n_found += 1
                found.append((i, sims[i]))

                # depends on category_key being a single integer
                if mlb_matrix[i][category_key]:
                    n_matches += 1

        print "n_found: ", n_found
        print "n_matches: ", n_matches
        return found

    @staticmethod
    def getRelevantCaptions(im_emb, s_emb, image_fns, caption_strings, top_n, z=1, npts=None):
        """
        parameters
        ----------
        Images: (z*N, K) matrix of im_emb
        Captions: (z*N, K) matrix of captions
        image_fns: the filenames of im_emb for each image vectors in the im_emb matrix
        captions_strings: the captions (as strings) for each sentence vector in captions matrix

        Returns
        -------
        relevant_captions: dictionary storing the top_n rank predictions for each image file

        looks like
        {
            ... , 
            filepath.npy: {
                captions: ["caption with ranking 1", ...]
                cos_sims: [0.9, 0.5, ...]
            },
            ...
        }
        """
        if npts == None:
            npts = im_emb.shape[0] / z

        relevant_captions = {}

        # Project captions
        for i in range(len(s_emb)):
            s_emb[i] /= np.linalg.norm(s_emb[i])

        for index in range(npts):

            # Get query image
            im = im_emb[z * index].reshape(1, im_emb.shape[1])
            im /= np.linalg.norm(im)

            # Compute scores
            d = np.dot(im, s_emb.T).flatten() # cosine distance
            inds = np.argsort(d)[::-1] # sort by highest cosine distance

            # build up relevant top_n captions
            image_fn = image_fns[index]
            top_inds = inds[:top_n]
            top_captions = [caption_strings[ind] for ind in top_inds]
            top_cos_sims = [decimal.Decimal(float(d[ind])) for ind in top_inds]

            relevant_captions[image_fn] = {
                  "captions": top_captions
                , "cos_sims": top_cos_sims
                }

        return relevant_captions

    @staticmethod
    def rank_function(self=None):
        teX, teY, _ = cocoXYFilenames(n_captions=5)
        sources = ('X', 'Y')
        sources_k = ('X_k', 'Y_k')
        stream = DataETL.getFinalStream(teX, teY, sources=sources,
                            sources_k=sources_k, batch_size=1000,
                            shuffle=False)
        images, captions, _0, _1 = stream.get_epoch_iterator().next()

        predict_dir = '/home/luke/datasets/coco/predict/'
        # encoder_name = '+coco_encoder_lstm_dim.300'
        encoder_name = 'sbu.100000+coco_encoder_lstm_dim.300_adadelta'
        # encoder_name = 'fullencoder_maxfeatures.50000_epochsampler'
        f_emb = ModelIO.load(predict_dir + encoder_name)
        image_embs, caption_embs = f_emb(images, captions)
        ModelEval.ImageSentenceRanking(image_embs, caption_embs)