python source code of sfs

import itertools as it
import collections as co
import json
import autograd.numpy as np
import numpy as raw_np
from cached_property import cached_property
import scipy
import scipy.sparse
import gzip
import os
import logging
from .compressed_counts import _hashed2config, _config2hashable
from .compressed_counts import CompressedAlleleCounts
from .configurations import ConfigList
from .configurations import _ConfigList_Subset
from ..util import memoize_instance


def site_freq_spectrum(sampled_pops, freqs_by_locus, length=None):
    """Create :class:`Sfs` from a list of dicts of counts.

    ``freqs_by_locus[l][config]`` gives the frequency of ``config`` \
    on locus ``l``.

    A ``config`` is a ``(n_pops,2)``-array, where ``config[p][a]`` \
    is the count of allele ``a`` in population ``p``. ``a=0`` represents \
    the ancestral allele while ``a=1`` represents the derived allele.

    :param list sampled_pops: list of population names
    :param list freqs_by_locus: list of dict
    :param float,None length: Number of bases in the dataset. Set to ``None`` if unknown.
    :rtype: :class:`Sfs`
    """
    index2loc = []
    index2count = []
    loci_counters = []

    def chained_sequences():
        for loc, locus_iter in enumerate(freqs_by_locus):
            loci_counters.append(co.Counter())
            try:
                locus_iter = locus_iter.items()
            except AttributeError:
                locus_iter = zip(locus_iter, iter(lambda: 1, 0))
            for config, count in locus_iter:
                index2loc.append(loc)
                index2count.append(count)
                yield config

    compressed_counts = CompressedAlleleCounts.from_iter(
        chained_sequences(), len(sampled_pops))
    config_array = compressed_counts.config_array
    index2uniq = compressed_counts.index2uniq

    assert len(index2loc) == len(index2count) and len(
        index2count) == len(index2uniq)
    for loc, count, uniq in zip(index2loc, index2count, index2uniq):
        loci_counters[loc][uniq] += count

    configs = ConfigList(sampled_pops, config_array, None, None)
    return Sfs(loci_counters, configs, folded=False, length=length)



class Sfs(object):
    """
    Class for representing observed site frequnecy spectrum data.

    To create a :class:`Sfs` object, use \
    :meth:`SnpAlleleCounts.extract_sfs`, \
    :meth:`Sfs.load`, or :func:`site_freq_spectrum`. \
    Do NOT call the class constructor directly, it is for internal use only.
    """
    @classmethod
    def from_matrix(cls, mat, configs, *args, **kwargs):
        # convert to csc
        mat = scipy.sparse.csc_matrix(mat)
        indptr = mat.indptr
        loci = []
        for i in range(mat.shape[1]):
            loci.append(np.array([
                mat.indices[indptr[i]:indptr[i+1]],
                mat.data[indptr[i]:indptr[i+1]]]))

        return cls(loci, configs, *args, **kwargs)

    @classmethod
    def load(cls, f):
        """Load :class:`Sfs` from file created by :meth:`Sfs.dump` or ``python -m momi.extract_sfs``

        :param str,file f: file object or file name to read in
        :rtype: :class:`Sfs`
        """
        if isinstance(f, str):
            fname = os.path.expanduser(f)
            if fname.endswith(".gz"):
                with gzip.open(fname, "rt") as f:
                    return cls.load(f)
            else:
                with open(fname) as f:
                    return cls.load(f)

        logging.getLogger(__name__).info("Reading json...")
        info = json.load(f)
        logging.getLogger(__name__).info("Finished reading json...")

        logging.getLogger(__name__).info("Constructing configs...")
        configs = info.pop("configs")
        logging.getLogger(__name__).info("{} unique configs detected".format(len(configs)))
        # don't use autograd here for performance
        configs = raw_np.array(configs, dtype=int)
        logging.getLogger(__name__).info("Converted to numpy array...")
        configs = ConfigList(info.pop("sampled_pops"), configs)
        logging.getLogger(__name__).info("Finished constructing configs")

        logging.getLogger(__name__).info("Constructing SFS...")
        loci = []
        for locus, locus_rows in it.groupby(
                info.pop("(locus,config_id,count)"), lambda x: x[0]):
            loci.append({config_id: count
                        for _, config_id, count in locus_rows})

        ret = cls(loci, configs, **info)
        logging.getLogger(__name__).info("Finished constructing SFS")

        return ret

    def __init__(self, loci, configs, folded, length):
        self.folded = folded
        self._length = length

        self.configs = configs

        self.loc_idxs, self.loc_counts = [], []
        for loc in loci:
            if len(loc) == 0:
                self.loc_idxs.append(np.array([], dtype=int))
                self.loc_counts.append(np.array([], dtype=float))
            else:
                try:
                    loc.items()
                except:
                    loc = np.array(loc)
                    if len(loc.shape) == 2:
                        assert loc.shape[0] == 2
                        idxs, cnts = loc[0, :], loc[1, :]
                    else:
                        idxs, cnts = np.unique(loc, return_counts=True)
                else:
                    idxs, cnts = zip(*loc.items())
                self.loc_idxs.append(np.array(idxs, dtype=int))
                self.loc_counts.append(np.array(cnts, dtype=float))

        if len(self.loc_idxs) > 1:
            self._total_freqs = self.freqs_matrix.dot(np.ones(self.n_loci))
            assert self._total_freqs.shape == (self.freqs_matrix.shape[0],)
        else:
            # avoid costly building of frequency matrix, when there are many
            # Sfs's of a single locus (e.g. in many stochastic minibatches)
            idxs, = self.loc_idxs
            cnts, = self.loc_counts
            self._total_freqs = np.zeros(len(self.configs))
            self._total_freqs[idxs] = cnts

        assert not np.any(self._total_freqs == 0)

    def dump(self, f):
        """Write Sfs to file

        :param str,file f: Filename or object. If name ends with ".gz" gzip it.
        """
        if isinstance(f, str):
            fname = os.path.expanduser(f)
            if fname.endswith(".gz"):
                with gzip.open(fname, "wt") as f:
                    self.dump(f)
            else:
                with open(fname, 'w') as f:
                    self.dump(f)
            return

        print("{", file=f)
        print('\t"sampled_pops": {},'.format(
            json.dumps(list(self.sampled_pops))), file=f)
        print('\t"folded": {},'.format(
            json.dumps(self.folded)), file=f)
        print('\t"length": {},'.format(
            json.dumps(self._length)), file=f)
        print('\t"configs": [', file=f)
        n_configs = len(self.configs.value)
        for i, c in enumerate(self.configs.value.tolist()):
            if i != n_configs - 1:
                print("\t\t{},".format(c), file=f)
            else:
                # no trailing comma
                print("\t\t{}".format(c), file=f)
        print("\t],", file=f)
        print('\t"(locus,config_id,count)": [', file=f)
        for loc in range(self.n_loci):
            n_loc_rows = len(self.loc_idxs[loc])
            assert n_loc_rows == len(self.loc_counts[loc])
            for i in range(n_loc_rows):
                config_id = int(self.loc_idxs[loc][i])
                count = float(self.loc_counts[loc][i])
                if loc == self.n_loci - 1 and i == n_loc_rows - 1:
                    # no trailing comma
                    print("\t\t{}".format(json.dumps(
                        (loc, config_id, count))), file=f)
                else:
                    print("\t\t{},".format(json.dumps(
                        (loc, config_id, count))), file=f)
        print("\t]", file=f)
        print("}", file=f)

    @property
    def populations(self):
        return self.sampled_pops

    @memoize_instance
    def combine_loci(self):
        # return copy with all loci combined
        return self.from_matrix(self.freqs_matrix.sum(axis=1),
                                self.configs, self.folded,
                                self._length)

    @property
    def freqs_matrix(self):
        """Sparse matrix representing the frequencies at each locus.

        The ``(i,j)``-th entry gives the frequency of the ``i``-th config in :attr:`Sfs.config_array` at locus ``j``

        :rtype: :class:`scipy.sparse.csr_matrix`
        """
        return self.csr_freqs_matrix

    @cached_property
    def csr_freqs_matrix(self):
        return _csr_freq_matrix_from_counters(
            self.loc_idxs, self.loc_counts, len(self.configs))

    @cached_property
    def avg_pairwise_hets(self):
        # avg number of hets per ind per pop (assuming Hardy-Weinberg)
        n_nonmissing = np.sum(self.configs.value, axis=2)
        # for denominator, assume 1 allele is drawn from whole sample, and 1
        # allele is drawn only from nomissing alleles
        denoms = np.maximum(n_nonmissing * (self.sampled_n - 1), 1.0)
        p_het = 2 * self.configs.value[:, :, 0] * \
            self.configs.value[:, :, 1] / denoms

        return self.freqs_matrix.T.dot(p_het)

    def resample(self):
        """Create a new SFS by resampling blocks with replacement.

        Note the resampled SFS is assumed to have the same length in base pairs \
        as the original SFS, which may be a poor assumption if the blocks are not of equal length.

        :returns: Resampled SFS
        :rtype: :class:`Sfs`
        """
        loci = np.random.choice(
            np.arange(self.n_loci), size=self.n_loci, replace=True)
        mat = self.freqs_matrix[:, loci]
        to_keep = np.asarray(mat.sum(axis=1) > 0).squeeze()
        to_keep = np.arange(len(self.configs))[to_keep]
        mat = mat[to_keep, :]
        configs = _ConfigList_Subset(self.configs, to_keep)

        return self.from_matrix(mat, configs, self.folded, self.length)

    @property
    def sampled_n(self):
        """Number of samples per population

        ``sampled_n[i]`` is the number of haploids \
        in the ``i``-th population of :attr:`Sfs.sampled_pops`

        :returns: 1-d array of integers
        :rtype: :class:`numpy.ndarray`
        """
        return self.configs.sampled_n

    @property
    def ascertainment_pop(self):
        return self.configs.ascertainment_pop

    @property
    def sampled_pops(self):
        """Names of sampled populations

        :returns: 1-d array of strings
        :rtype: :class:`numpy.ndarray`
        """
        return self.configs.sampled_pops

    @property
    def n_loci(self):
        """The number of loci in the dataset

        :rtype: int
        """
        return len(self.loc_idxs)

    @property
    def n_nonzero_entries(self):
        return len(self.configs)

    @property
    def config_array(self):
        """Array of unique configs.

        This is a 3-d array with shape ``(n_configs, n_pops, 2)``. \
        ``config_array[i, p, a]`` gives the count of allele ``a`` \
        in population ``p`` in configuration ``i``. \
        Allele ``a=0`` is ancestral, ``a=1`` is derived. \
        :attr:`Sfs.sampled_pops` gives the ordering of the populations.

        :returns: array of configs
        :rtype: :class:`numpy.ndarray`
        """
        return self.configs.config_array

    @property
    def length(self):
        """Length of data in bases. ``None`` if unknown.

        :rtype: float
        """
        return self._length

    @memoize_instance
    def n_snps(self, vector=False):
        """Number of SNPs, either per-locus or overall.

        :param bool vector: Whether to return a single total count, or an array of counts per-locus
        """
        if vector:
            return raw_np.array(self.freqs_matrix.sum(axis=0)).reshape((-1,))
        else:
            return np.sum(self._total_freqs)

    def __eq__(self, other):
        loci_dicts = self._get_dict(vector=True)
        try:
            return loci_dicts == other._get_dict(vector=True)
        except AttributeError:
            return False

    def _get_dict(self, vector=False, locus=None):
        if vector:
            assert locus is None
            return [self._get_dict(locus=loc) for loc in range(self.n_loci)]
        elif locus is None:
            return dict(zip(map(_config2hashable, self.configs),
                            self._total_freqs))
        return dict(zip(
            (_config2hashable(self.configs[i]) for i in self.loc_idxs[locus]),
            self.loc_counts[locus]))

    def to_dict(self, vector=False):
        if not vector:
            return {_hashed2config(k): v for k, v in self._get_dict().items()}
        else:
            return [{_hashed2config(k): v for k, v in d.items()}
                    for d in self._get_dict(vector=True)]

    @cached_property
    def _entropy(self):
        counts = self._total_freqs
        n_snps = float(self.n_snps())
        p = counts / n_snps
        # return np.sum(p * np.log(p))
        ret = np.sum(p * np.log(p))

        # correct for missing data
        sampled_n = np.sum(self.configs.value, axis=2)
        sampled_n_counts = co.Counter()
        assert len(counts) == len(sampled_n)
        for c, n in zip(counts, sampled_n):
            n = tuple(n)
            sampled_n_counts[n] += c
        sampled_n_counts = np.array(
            list(sampled_n_counts.values()), dtype=float)

        ret = ret + np.sum(sampled_n_counts / n_snps *
                           np.log(n_snps / sampled_n_counts))
        assert not np.isnan(ret)
        return ret

    def _get_muts_poisson_entropy(self, use_pairwise_diffs):
        if use_pairwise_diffs:
            return self._pairwise_muts_poisson_entropy
        else:
            return self._total_muts_poisson_entropy

    @cached_property
    def _total_muts_poisson_entropy(self):
        lambd = self.n_snps(vector=True)
        lambd = lambd[lambd > 0]
        return np.sum(-lambd + lambd * np.log(lambd)
                      - scipy.special.gammaln(lambd + 1))

    @cached_property
    def _pairwise_muts_poisson_entropy(self):
        lambd = self.avg_pairwise_hets
        ret = -lambd + lambd * np.log(lambd) - scipy.special.gammaln(lambd + 1)
        ret[lambd <= 1e-16] = 0
        ret = ret * self.sampled_n / float(np.sum(
            self.sampled_n[self.ascertainment_pop]))
        return np.sum(ret[:, self.ascertainment_pop])

    def fold(self):
        """
        :returns: A copy of the SFS, but with folded entries.
        :rtype: :class:`Sfs`
        """
        def get_folded(config):
            if tuple(config[:, 0]) < tuple(config[:, 1]):
                return config[:, ::-1]
            else:
                return config

        compressed_folded = CompressedAlleleCounts.from_iter(
            map(get_folded, self.configs),
            len(self.sampled_pops), sort=False)

        return self.from_matrix(
            compressed_folded.index2uniq_mat @ self.freqs_matrix,
            ConfigList(self.sampled_pops, compressed_folded.config_array,
                        sampled_n=self.sampled_n,
                        ascertainment_pop=self.ascertainment_pop),
            folded=True, length=self._length)

    def _copy(self, sampled_n=None):
        """
        See also: ConfigList._copy()
        """
        if sampled_n is None:
            sampled_n = self.sampled_n
        return self.from_matrix(
            self.csr_freqs_matrix,
            ConfigList(self.sampled_pops, self.configs.value,
                        sampled_n=sampled_n,
                        ascertainment_pop=self.ascertainment_pop),
            self.folded, self._length)

    def _integrate_sfs(self, weights, vector=False, locus=None):
        if vector:
            assert locus is None
            return np.array([self._integrate_sfs(weights, locus=loc)
                             for loc in range(self.n_loci)])
        if locus is None:
            idxs, counts = slice(None), self._total_freqs
        else:
            idxs, counts = self.loc_idxs[locus], self.loc_counts[locus]
        return np.sum(weights[idxs] * counts)

    def subset_populations(self, populations, non_ascertained_pops=None):
        """Returns lower-dimensional SFS restricted to a subset of populations.

        :param list populations: list of populations to subset to
        :param list non_ascertained_pops: list of populations to treat as non-ascertained (SNPs must be polymorphic with respect to the ascertained populations)
        :returns: Lower-dimensional SFS restricted to a subset of populations.
        :rtype: :class:`Sfs`
        """
        if non_ascertained_pops is None:
            non_ascertained_pops = []
            for pop, asc in zip(self.sampled_pops, self.ascertainment_pop):
                if not asc:
                    non_ascertained_pops.append(pop)
        return self._subset_populations(tuple(populations),
                                        tuple(non_ascertained_pops))

    @memoize_instance
    def _subset_populations(self, populations, non_ascertained_pops):
        # old indexes of the new population ordering
        old_sampled_pops = list(self.sampled_pops)
        old_pop_idx = np.array([
            old_sampled_pops.index(p) for p in populations], dtype=int)

        # old ascertainment
        ascertained = dict(zip(self.sampled_pops,
                               self.ascertainment_pop))
        # restrict to new populations
        ascertained = {p: ascertained[p] for p in populations}
        # previously non-ascertained pops should remain so
        for pop, is_asc in ascertained.items():
            if not is_asc:
                assert pop in non_ascertained_pops
        # update ascertainment
        for pop in non_ascertained_pops:
            ascertained[pop] = False
        # convert from dict to numpy array
        ascertained = np.array([ascertained[p] for p in populations])

        # keep only polymorphic configs
        asc_only = self.configs[:, old_pop_idx[ascertained], :]
        asc_is_poly = (asc_only.sum(axis=1) != 0).all(axis=1)
        asc_is_poly = np.arange(len(asc_is_poly))[asc_is_poly]
        sub_sfs = self._subset_configs(asc_is_poly)

        # get the new configs
        new_configs = CompressedAlleleCounts.from_iter(
            sub_sfs.configs[:, old_pop_idx, :],
            len(populations), sort=False)

        return self.from_matrix(
            new_configs.index2uniq_mat @ sub_sfs.freqs_matrix,
            ConfigList(populations, new_configs.config_array,
                        ascertainment_pop=ascertained),
            self.folded, self._length)

    def _subset_configs(self, idxs):
        return self.from_matrix(
            self.csr_freqs_matrix[idxs, :],
            _ConfigList_Subset(self.configs, idxs),
            self.folded, self._length)

    @property
    def sfs(self):
        return self

    @cached_property
    def p_missing(self):
        """Estimate of probability that a random allele from each population is missing.

        Missingness is estimated as follows: \
        from each SNP remove a random allele; \
        if the resulting config is monomorphic, then ignore. \
        If polymorphic, then count whether the removed allele \
        is missing or not.

        This avoids bias from fact that we don't observe \
        some polymorphic configs that appear monomorphic \
        after removing the missing alleles.

        :returns: 1-d array of missingness per population
        :rtype: :class:`numpy.ndarray`
        """
        counts = self._total_freqs
        sampled_n = self.sampled_n
        n_pops = len(self.sampled_pops)

        config_arr = self.configs.value
        # augment config_arr to contain the missing counts
        n_miss = sampled_n - np.sum(config_arr, axis=2)
        config_arr = np.concatenate((config_arr, np.reshape(
            n_miss, list(n_miss.shape)+[1])), axis=2)

        ret = []
        for i in range(n_pops):
            n_valid = []
            for allele in (0, 1, -1):
                # configs with removed allele
                removed = np.array(config_arr)
                removed[:, i, allele] -= 1
                # is the resulting config polymorphic?
                valid_removed = (removed[:, i, allele] >= 0) & np.all(
                    np.sum(removed[:, :, :2], axis=1) > 0, axis=1)

                # sum up the valid configs
                n_valid.append(np.sum(
                    (counts * config_arr[:, i, allele])[valid_removed]))
            # fraction of valid configs with missing additional allele
            ret.append(n_valid[-1] / float(sum(n_valid)))
        return np.array(ret)


def _csr_freq_matrix_from_counters(idxs_by_loc, cnts_by_loc,
                                   n_configs):
    data = []
    indices = []
    indptr = []
    n_loc = 0
    for idxs, cnts in zip(idxs_by_loc, cnts_by_loc):
        indptr.append(len(data))
        data.extend(cnts)
        indices.extend(idxs)
        n_loc += 1
    indptr.append(len(data))
    return scipy.sparse.csc_matrix((data, indices, indptr),
                                   shape=(n_configs, n_loc)).tocsr()


def _get_subsample_counts(configs, n):
    subconfigs, weights = [], []
    for pop_comb in it.combinations_with_replacement(configs.sampled_pops, n):
        subsample_n = co.Counter(pop_comb)
        subsample_n = np.array([subsample_n[pop]
                                for pop in configs.sampled_pops], dtype=int)
        if np.any(subsample_n > configs.sampled_n):
            continue

        for sfs_entry in it.product(*(range(sub_n + 1)
                                      for sub_n in subsample_n)):
            sfs_entry = np.array(sfs_entry, dtype=int)
            if np.all(sfs_entry == 0) or np.all(sfs_entry == subsample_n):
                # monomorphic
                continue

            sfs_entry = np.transpose([subsample_n - sfs_entry, sfs_entry])
            cnt_vec = configs.subsample_probs(sfs_entry)
            if not np.all(cnt_vec == 0):
                subconfigs.append(sfs_entry)
                weights.append(cnt_vec)

    return np.array(subconfigs), np.array(weights)


class SimpleNamespace(object):
    pass