python source code of dataset

from __future__ import division
from __future__ import print_function
from __future__ import absolute_import
from builtins import zip
from builtins import range
import hashlib
import os
import numpy
from ektelo.data import Histogram
from ektelo import util
from ektelo.mixins import CartesianInstantiable
from ektelo.mixins import Marshallable
import scipy.stats as sci
from functools import reduce


class Dataset(Marshallable):

    def __init__(self, hist, reduce_to_domain_shape=None, dist=None):
        """
            Any instances with equal key() values should have equal hash() values
            domain_shape will be result of regular grid partition
        """
        if isinstance(reduce_to_domain_shape, int): # allow for integers in 1D, instead of shape tuples
            reduce_to_domain_shape = (reduce_to_domain_shape, )

        if dist is not None:
            self._dist_str = numpy.array_str(numpy.array(dist))
        else:
            self._dist_str = ''

        self._hist = hist
        self._reduce_to_domain_shape = reduce_to_domain_shape if hist.shape != reduce_to_domain_shape else None
        self._dist = dist
        self._payload = None

        self._compiled = False

    def reduce_data(self,p_grid,data):
        """reduce data to the domain indicated by p_grid"""
        assert data.shape == p_grid.shape, 'Shape of x and shape of partition vector must match.'
        #get out_shape
        if p_grid.ndim == 1:
            out_shape =(len(numpy.unique(p_grid)), )
        else:
            out = []
            for i in range(p_grid.ndim):
                ind_slice = [0] * p_grid.ndim
                ind_slice[i] = slice(0, p_grid.shape[i])
                out.append(len(numpy.unique(p_grid[ind_slice])))

            # This is doesn't work for dimension >2, the compressed array is not a strip of the domain,
            # but the first element along the axis. It could be an nd array itself.
            #out = [len(numpy.unique(numpy.compress([True], p_grid, axis=i))) for i in range(p_grid.ndim)]
            #out.reverse()   # rows/cols need to be reversed here
            out_shape = tuple(out)
        #reduce
        unique, indices, inverse, counts = numpy.unique(p_grid, return_index=True, return_inverse=True, return_counts=True)
        res = numpy.zeros_like(unique, dtype=float)
        for index, c in numpy.ndenumerate(data.ravel()):   # needs to be flattened for parallel indexing with output of unique
            res[ inverse[index] ] += c
        return numpy.array(res).reshape(out_shape)


    def compile(self):
        if self._compiled:
            return self

        self._payload = self._hist
        if self._reduce_to_domain_shape:    # reduce cells to get desired shape
            q = [divmod(dom,grid) for (dom,grid) in zip(self._payload.shape, self._reduce_to_domain_shape)]
            for red, rem in q:
                assert rem == 0, 'Domain must be reducible to target domain by uniform grid: %i, %i' % (red,rem)
            grid_shape = [r[0] for r in q]
            p_grid = partition_grid(self._payload.shape, grid_shape)
            self._payload = self.reduce_data(p_grid,self._payload)  # update payload

            if self._dist is not None:
                self._dist = self.reduce_data(p_grid,self._dist)  # update dist

        self._compiled = True
        self._payload = self._payload.astype("int") # partition engines need payload to be of type int

        return self


    @property
    def payload(self):
        return self.compile()._payload

    @property
    def dist(self):
        return self.compile()._dist

    @property
    def scale(self):
        return self.payload.sum()

    @property
    def domain_shape(self):
        return self.payload.shape

    @property
    def fractionZeros(self):
        zero_count = (self.payload == 0).sum()
        return util.old_div(float(zero_count), self.payload.size)

    @property
    def maxCount(self):
        return self.payload.max()

    @property
    def key(self):
        """ Using leading 8 characters of hash as key for now """
        return self.hash[:8]

    @property
    def hash(self):
        m = hashlib.sha1()
        m.update(util.prepare_for_hash(self.__class__.__name__))
        m.update(util.prepare_for_hash(numpy.array(self._hist)))
        m.update(util.prepare_for_hash(numpy.array(self._reduce_to_domain_shape)))
        m.update(util.prepare_for_hash(self._dist_str))
        return m.hexdigest()

    # represent object state as dict (for storage in datastore)
    def asDict(self):
        d = util.class_to_dict(self, ignore_list=['_dist', 'dist', '_payload', 'payload', '_compiled',
                                                  '_dist_str', '_hist', '_reduce_to_domain_shape'])
        ## add decomposed shape attrs ??
        return d

    def analysis_payload(self):
        return {'payload': self.payload}


class DatasetFromRelation(Dataset, CartesianInstantiable):

    def __init__(self, 
                 relation,
                 nickname,
                 normed=False,
                 weights=None,
                 reduce_to_dom_shape=None):
        self.init_params = util.init_params_from_locals(locals())
        self.fname = nickname

        histogram = Histogram(relation.df, relation.bins, relation.domains, normed, weights).generate()
        self.statistics = relation.statistics()
        self.statistics.update(histogram.statistics())

        if reduce_to_dom_shape != None:
            self.statistics['domain_valid'] = False
        else:
            self.statistics['domain_valid'] = True

        super(DatasetFromRelation,self).__init__(histogram.hist.copy(order='C'), reduce_to_dom_shape)

    @staticmethod
    def instantiate(params):
        relation = data.Relation(params['config']).load_csv(params['csv_filename'], params['csv_delimiter'])

        return DatasetFromRelation(relation, params['nickname'], params['normed'], params['weights'], params['domain'])

    def asDict(self):
        d = util.class_to_dict(self, ignore_list=['_dist', '_payload', 'payload', '_compiled',
                                                  '_dist_str', '_hist', '_reduce_to_domain_shape',
                                                  'statistics'])
        return d


class DatasetFromFile(Dataset, CartesianInstantiable):

    def __init__(self, fileName, reduce_to_dom_shape=None):
        self.init_params = util.init_params_from_locals(locals())
        self.fname = fileName

        #assert nickname in filenameDict, 'Filename parameter not recognized: %s' % nickname
        hist = load(self.fname)
        super(DatasetFromFile,self).__init__(hist, reduce_to_dom_shape, None)

    @staticmethod
    def instantiate(params):
        return DatasetFromFile(params['nickname'], params['domain'])
        
class DatasetSampled(Dataset):

    def __init__(self, dist, sample_to_scale, reduce_to_dom_shape=None, seed=None):
        self.seed = seed
        prng = numpy.random.RandomState(self.seed)
        hist = subSample(dist, sample_to_scale, prng)
        super(DatasetSampled,self).__init__(hist, reduce_to_dom_shape, dist)


class DatasetUniformityVaryingSynthetic(Dataset, CartesianInstantiable):

    def __init__(self, uniformity, dom_shape, scale, seed=None):
        '''
        Generate synthetic data of varying uniformity
        uniformity: parameter in [0,1] where 1 produces perfectly uniform data, 0 is maximally non-uniform
        All cells set to zero except fraction equal to 'uniformity' value.
        All non-zero cells are set to same value, then shuffled randomly.
        '''
        self.init_params = util.init_params_from_locals(locals())
        self.u = uniformity
        assert 0 <= uniformity and uniformity <= 1
        n = numpy.prod(dom_shape)       # total domain size
        hist = numpy.zeros(n)
        num_nonzero = max(1, int(uniformity * n))
        hist_value = util.old_div(scale, num_nonzero)
        hist[0:num_nonzero] = hist_value

        prng = numpy.random.RandomState(seed)
        prng.shuffle(hist)

        super(DatasetUniformityVaryingSynthetic, self).__init__(hist.reshape(dom_shape), reduce_to_domain_shape=None, dist=None)


    @staticmethod
    def instantiate(params):
        return DatasetUniformityVaryingSynthetic(params['uniformity'], params['dom_shape'], params['scale'], params['seed'])



tryPaths = [os.path.join(os.environ['EKTELO_HOME'], 'ektelo')]


def load(filename):
    """
    Load from file and return original counts (should be integral)
    """
    path = [p for p in tryPaths if os.path.exists(p)]
    assert path, 'data path not found.'

    fullpath = os.path.join( path[0], 'datafiles', filename )	# fixed for now, so we don't have to include as a parameter in automated key extraction stuff...
    _, file_extension = os.path.splitext(fullpath)
    if file_extension == '.txt':
        x = []
        with open(fullpath, 'r') as fp:
            for ln in fp.readlines():
                x.append(int(ln))
        return numpy.array(x, dtype='int32')
    elif file_extension == '.npy':
        return numpy.load(fullpath)
    else:
        raise Exception('Unrecognized file extension')


def subSample(dist, sampleSize, prng):
    ''' Generate a subsample of given sampleSize from an input distribution '''
    samples = prng.choice(a=dist.size, replace=True, size=int(sampleSize), p=dist.flatten())
    hist = numpy.histogram(samples, bins=dist.size, range=(0,dist.size))[0].astype('int32')		# return only counts (not bins)
    return hist.reshape( dist.shape )

# partition vector generation for data reduction
def cantor_pairing(a, b):
    """
    A function returning a unique positive integer for every pair (a,b) of positive integers
    """
    return util.old_div((a+b)*(a+b+1), 2) + b

def general_pairing(tup):
    """ Generalize cantor pairing to k dimensions """
    if len(tup) == 0:
        return tup[0]  # no need for pairing in 1D case
    else:
        return reduce(cantor_pairing, tup)

def canonicalTransform(vector):
    """ transform a partition vector according to the canonical order.
     if bins are noncontiguous, use position of first occurrence.
     e.g. [3,4,1,1] => [1,2,3,3]; [3,4,1,1,0,1]=>[0,1,2,2,3,2]
    """
    unique, indices, inverse = numpy.unique(vector, return_index=True, return_inverse=True)
    uniqueInverse, indexInverse = numpy.unique(inverse,return_index =True)

    indexInverse.sort()
    newIndex = inverse[indexInverse]
    tups = list(zip(uniqueInverse, newIndex)) #replace uniqueInverse with unique if we want to use the exact numbers in partition vector
    tups.sort(key=lambda x: x[1])
    u = numpy.array( [u for (u,i) in tups] )
    vector = u[inverse].reshape(vector.shape)
    return vector

def partition_grid(domain_shape, grid_shape):
    """
    :param domain_shape: a shape tuple describing the domain, e.g (6,6) (in 2D)
    :param grid_shape: a shape tuple describing cells to be grouped, e.g. (2,3) to form groups of 2 rows and 3 cols
        note: in 1D both of the above params can simply be integers
    :return: a partition array in which grouped cells are assigned some unique 'group id' values
             no guarantee on order of the group ids, only that they are unique
    """

    # allow for integers instead of shape tuples in 1D
    if isinstance(domain_shape, int):
        domain_shape = (domain_shape, )
    if isinstance(grid_shape, int):
        grid_shape = (grid_shape,)

    assert sum(divmod(d,b)[1] for (d,b) in zip(domain_shape, grid_shape)) == 0, "Domain size along each dimension should be a multiple of size of block"

    def g(*idx):
        """
        This function will receive an index tuple from numpy.fromfunction
        It's behavior depends on grid_shape: take (i,j) and divide by grid_shape (in each dimension)
        That becomes an identifier of the block; then assign a unique integer to it using pairing.
        """
        x = numpy.array(idx)
        y = numpy.array(grid_shape)
        return general_pairing( util.old_div(x,y) )  # broadcasting integer division

    h = numpy.vectorize(g)

    # numpy.fromfunction builds an array of domain_shape by calling a function with each index tuple (e.g. (i,j))
    partition_array = numpy.fromfunction(h, domain_shape, dtype=int)
    # transform to canonical order
    partition_array = canonicalTransform(partition_array)
    return partition_array





if __name__ == '__main__':

    scale = 10000
    for u in [util.old_div(i,10.0) for i in range(0,11)]:
        print(u)
        d = DatasetUniformityVaryingSynthetic(uniformity=u, dom_shape=(10,), scale=scale, seed=999)
        size = d.payload.size
        unif = numpy.empty_like(d.payload)
        unif.fill( util.old_div(scale,float(size)) )
        print(sum(abs(unif - d.payload)))