Python scipy.argsort() Examples

The following are 5 code examples of scipy.argsort(), drawn from open-source projects. Note that scipy.argsort was simply an alias for numpy.argsort; the alias is deprecated and has been removed from recent SciPy releases, so new code should call numpy.argsort directly.
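All five examples below lean on the same idiom, so it is worth stating up front: argsort returns the indices that would sort an array, and argsort-of-argsort yields the inverse permutation that undoes the sort. A minimal sketch with numpy.argsort:

import numpy

pvals = numpy.array([0.03, 0.50, 0.01])
order = numpy.argsort(pvals)        # indices that sort the array: [2, 0, 1]
sorted_pvals = pvals[order]         # [0.01, 0.03, 0.50]
inverse = order.argsort()           # inverse permutation: [1, 2, 0]
assert (sorted_pvals[inverse] == pvals).all()   # undoes the sort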
Example #1
Source File: plinkfiles.py    From ldpred with MIT License
import scipy as sp
from plinkio import plinkfile


def parse_plink_snps(genotype_file, snp_indices):
    plinkf = plinkfile.PlinkFile(genotype_file)
    samples = plinkf.get_samples()
    num_individs = len(samples)
    num_snps = len(snp_indices)
    raw_snps = sp.empty((num_snps, num_individs), dtype='int8')
    # The requested indices may not be in order; remember the sort so each
    # SNP can be scattered back to the row the caller asked for.
    snp_order = sp.argsort(snp_indices)
    ordered_snp_indices = list(snp_indices[snp_order])
    ordered_snp_indices.reverse()
    # Iterate over the file, loading only the requested SNPs.
    snp_i = 0
    line_i = 0
    # Largest requested index (the list is descending); read it before
    # popping so that a single-SNP query also works.
    max_i = ordered_snp_indices[0]
    next_i = ordered_snp_indices.pop()
    while line_i <= max_i:
        if line_i < next_i:
            next(plinkf)
        elif line_i == next_i:
            line = next(plinkf)
            snp = sp.array(line, dtype='int8')
            bin_counts = line.allele_counts()
            if bin_counts[-1] > 0:
                # Impute missing genotypes (coded as 3) with the mode.
                mode_v = sp.argmax(bin_counts[:2])
                snp[snp == 3] = mode_v
            s_i = snp_order[snp_i]
            raw_snps[s_i] = snp
            if line_i < max_i:
                next_i = ordered_snp_indices.pop()
            snp_i += 1
        line_i += 1
    plinkf.close()
    assert snp_i == len(raw_snps), 'Parsing SNPs from plink file failed.'
    freqs = sp.sum(raw_snps, 1, dtype='float32') / (2 * float(num_individs))
    return raw_snps, freqs
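The interesting part is how argsort is used twice over: the requested indices are visited in ascending order, because the file can only be read sequentially, and snp_order then scatters each row back into the order the caller asked for. A stand-alone sketch of the same pattern, with a plain range standing in for the PLINK file (all names here are illustrative, not from ldpred):

import numpy

requested = numpy.array([7, 2, 5])       # rows wanted, in caller order
order = numpy.argsort(requested)         # ascending visit order: [1, 2, 0]
targets = requested[order]               # rows to grab as the stream goes by: [2, 5, 7]
out = numpy.empty(len(requested), dtype=int)
stream = iter(range(10))                 # stands in for a sequential-only file
k = 0
for line_i, row in enumerate(stream):
    if k < len(targets) and line_i == targets[k]:
        out[order[k]] = row              # scatter into the caller's order
        k += 1
print(out)                               # [7 2 5]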
Example #2
Source File: MultipleComparisonCorrection.py    From ClearMap with GNU General Public License v3.0
import numpy


def correctPValues(pvalues, method='BH'):
    """Corrects p-values for multiple testing using various methods

    Arguments:
        pvalues (array): list of p values to be corrected
        method (Optional[str]): method to use: BH = FDR = Benjamini-Hochberg, B = FWER = Bonferroni

    References:
        - `Benjamini Hochberg, 1995 <http://www.jstor.org/stable/2346101?seq=1#page_scan_tab_contents>`_
        - `Bonferroni correction <http://www.tandfonline.com/doi/abs/10.1080/01621459.1961.10482090#.VmHWUHbH6KE>`_
        - `R statistics package <https://www.r-project.org/>`_

    Notes:
        - modified from http://statsmodels.sourceforge.net/ipdirective/generated/scikits.statsmodels.sandbox.stats.multicomp.multipletests.html
    """

    pvals = numpy.asarray(pvalues)

    if method.lower() in ['bh', 'fdr']:
        # Sort the p-values and remember how to undo the sort.
        pvals_sorted_ids = numpy.argsort(pvals)
        pvals_sorted = pvals[pvals_sorted_ids]
        sorted_ids_inv = pvals_sorted_ids.argsort()

        n = len(pvals)
        bhfactor = numpy.arange(1, n + 1) / float(n)

        pvals_corrected_raw = pvals_sorted / bhfactor
        # Enforce monotonicity from the largest p-value downwards.
        pvals_corrected = numpy.minimum.accumulate(pvals_corrected_raw[::-1])[::-1]
        pvals_corrected[pvals_corrected > 1] = 1

        # Return the corrected values in the original order.
        return pvals_corrected[sorted_ids_inv]

    elif method.lower() in ['b', 'fwer']:
        n = len(pvals)

        pvals_corrected = n * pvals
        pvals_corrected[pvals_corrected > 1] = 1

        return pvals_corrected

    else:
        raise ValueError("Unknown correction method '%s'" % method)
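A quick sanity check of the BH branch (the expected values match the fdr_bh correction in statsmodels):

pvals = [0.01, 0.04, 0.03, 0.20]
print(correctPValues(pvals, method='BH'))
# -> approximately [0.04, 0.0533, 0.0533, 0.2]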
Example #3
Source File: pcabinaryprojections.py    From aurum-datadiscovery with MIT License
import numpy
import scipy

# numpy_array_from_list_or_numpy_array and perform_pca are helper functions
# defined elsewhere in this project (the code is derived from NearPy).

def __init__(self, hash_name, projection_count, training_set):
    """
    Computes principal components for the training vector set and uses
    the first projection_count principal components for projections.

    The training set must be either a numpy matrix or a list of
    numpy vectors.
    """
    super(PCABinaryProjections, self).__init__(hash_name)
    self.projection_count = projection_count

    # Only train if a training set was specified
    if training_set is not None:
        # Get numpy array representation of input
        training_set = numpy_array_from_list_or_numpy_array(training_set)

        # Get subspace size from training matrix
        self.dim = training_set.shape[0]

        # Transpose the training set for PCA
        training_set_t = numpy.transpose(training_set)

        # Compute principal components
        (eigenvalues, eigenvectors) = perform_pca(training_set_t)

        # Indices of the N largest eigenvalues (argsort is ascending, so flip)
        largest_eigenvalue_indices = numpy.flipud(
            scipy.argsort(eigenvalues))[:projection_count]

        # Create matrix for the first N principal components
        self.components = numpy.zeros((self.dim,
                                       len(largest_eigenvalue_indices)))

        # Put the first N principal components into the matrix
        for index in range(len(largest_eigenvalue_indices)):
            self.components[:, index] = \
                eigenvectors[:, largest_eigenvalue_indices[index]]

        # We need the component vectors to be in the rows
        self.components = numpy.transpose(self.components)
    else:
        self.dim = None
        self.components = None

    # This is only used in case we need to process sparse vectors
    self.components_csr = None
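The argsort idiom here, flipping an ascending argsort to take the indices of the N largest eigenvalues, is independent of the project's perform_pca helper. A self-contained sketch of the same selection using numpy.linalg.eigh (all names here are illustrative):

import numpy

rng = numpy.random.default_rng(0)
data = rng.normal(size=(5, 100))                  # 5-dim vectors as columns
cov = numpy.cov(data)                             # 5 x 5 covariance matrix
eigenvalues, eigenvectors = numpy.linalg.eigh(cov)

n = 2
largest = numpy.flipud(numpy.argsort(eigenvalues))[:n]  # indices of the 2 largest
components = eigenvectors[:, largest].T           # component vectors as rows
print(components.shape)                           # (2, 5)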
Example #4
Source File: pcadiscretizedprojections.py    From aurum-datadiscovery with MIT License
import numpy
import scipy

# numpy_array_from_list_or_numpy_array and perform_pca are helper functions
# defined elsewhere in this project (the code is derived from NearPy).

def __init__(self, hash_name, projection_count, training_set, bin_width):
    """
    Computes principal components for the training vector set and uses
    the first projection_count principal components for projections.

    The training set must be either a numpy matrix or a list of
    numpy vectors.
    """
    super(PCADiscretizedProjections, self).__init__(hash_name)
    self.projection_count = projection_count
    self.bin_width = bin_width

    # Only train if a training set was specified
    if training_set is not None:
        # Get numpy array representation of input
        training_set = numpy_array_from_list_or_numpy_array(training_set)

        # Get subspace size from training matrix
        self.dim = training_set.shape[0]

        # Transpose the training set for PCA
        training_set_t = numpy.transpose(training_set)

        # Compute principal components
        (eigenvalues, eigenvectors) = perform_pca(training_set_t)

        # Indices of the N largest eigenvalues (argsort is ascending, so flip)
        largest_eigenvalue_indices = numpy.flipud(
            scipy.argsort(eigenvalues))[:projection_count]

        # Create matrix for the first N principal components
        self.components = numpy.zeros((self.dim,
                                       len(largest_eigenvalue_indices)))

        # Put the first N principal components into the matrix
        for index in range(len(largest_eigenvalue_indices)):
            self.components[:, index] = \
                eigenvectors[:, largest_eigenvalue_indices[index]]

        # We need the component vectors to be in the rows
        self.components = numpy.transpose(self.components)
    else:
        # Untrained state (this branch is missing in the original snippet;
        # added here so the attributes always exist)
        self.dim = None
        self.components = None

    # This is only used in case we need to process sparse vectors
    self.components_csr = None
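The only difference from Example #3 is bin_width, which comes into play when hashing: a vector is projected onto the components and each coordinate is discretized into an integer bin. A sketch of one plausible bucketing step, assuming a trained components matrix (the floor rule and all names here are illustrative, not taken from the project):

import numpy

components = numpy.eye(2, 5)                   # stands in for a trained components matrix (2 x dim)
bin_width = 0.5
v = numpy.array([0.7, -0.2, 0.0, 0.0, 0.0])

projection = numpy.dot(components, v)          # project onto the principal components
bucket = numpy.floor(projection / bin_width)   # discretize each coordinate
print(bucket)                                  # [ 1. -1.]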
Example #5
Source File: recallprecisionexperiment.py    From aurum-datadiscovery with MIT License
import time

import numpy
import scipy
from scipy.spatial.distance import cdist

# numpy_array_from_list_or_numpy_array, unitvec and __vector_to_string are
# helpers defined elsewhere in this project.

def __init__(self, N, vectors, coverage_ratio=0.2):
    """
    Performs exact nearest neighbour search on the data set.

    vectors can either be a numpy matrix with all the vectors
    as columns OR a python array containing the individual
    numpy vectors.
    """
    # We need a dict from vector string representation to index
    self.vector_dict = {}
    self.N = N
    self.coverage_ratio = coverage_ratio
    numpy_vectors = numpy_array_from_list_or_numpy_array(vectors)

    # Get numpy array representation of input, one unit vector per row
    self.vectors = numpy.vstack([unitvec(v) for v in numpy_vectors.T])

    # Build map from vector string representation to vector index
    for index, v in enumerate(self.vectors):
        self.vector_dict[self.__vector_to_string(v)] = index

    # Determine the indices of the query vectors used for comparison
    # with the approximated search.
    query_count = numpy.floor(self.coverage_ratio *
                              len(self.vectors))
    self.query_indices = []
    for k in range(int(query_count)):
        index = numpy.floor(k * (float(len(self.vectors)) / query_count))
        index = min(index, len(self.vectors) - 1)
        self.query_indices.append(int(index))

    print('\nStarting exact search (query set size=%d)...\n' % query_count)

    # For each query vector, find the closest N neighbours
    self.closest = {}
    self.exact_search_time_per_vector = 0.0

    for index in self.query_indices:
        v = self.vectors[index, numpy.newaxis]
        exact_search_start_time = time.time()
        D = cdist(v, self.vectors, 'euclidean')
        # argsort the distance row; skip position 0, which is the query itself
        self.closest[index] = scipy.argsort(D)[0, 1:N + 1]

        # Accumulate the time needed for exact search
        exact_search_time = time.time() - exact_search_start_time
        self.exact_search_time_per_vector += exact_search_time

    print('Done with exact search...\n')

    # Normalize search time
    self.exact_search_time_per_vector /= float(len(self.query_indices))
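The scipy.argsort(D)[0, 1:N + 1] line is the entire brute-force nearest-neighbour search: one row of Euclidean distances, argsorted, with position 0 skipped because a query drawn from the data set is closest to itself. The same thing stand-alone, using numpy.argsort in place of the removed scipy alias (all names here are illustrative):

import numpy
from scipy.spatial.distance import cdist

rng = numpy.random.default_rng(0)
vectors = rng.normal(size=(50, 8))          # 50 vectors of dimension 8, one per row
query = vectors[3, numpy.newaxis]           # query taken from the data set itself

D = cdist(query, vectors, 'euclidean')      # distance row, shape (1, 50)
N = 5
closest = numpy.argsort(D)[0, 1:N + 1]      # skip index 0: the query itself
print(closest)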