Python scipy.stats.percentileofscore() Examples

The following code examples show how to use scipy.stats.percentileofscore(). They are drawn from open source Python projects.
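As a quick orientation before the project examples, here is a minimal self-contained sketch of the function itself. The data are made up; the four `kind` modes differ only in how values equal to the score are counted:

from scipy.stats import percentileofscore

data = [1, 2, 3, 4, 4, 5]

print(percentileofscore(data, 4))                  # 75.0   ('rank', the default)
print(percentileofscore(data, 4, kind='weak'))     # 83.33  (% of values <= score)
print(percentileofscore(data, 4, kind='strict'))   # 50.0   (% of values <  score)
print(percentileofscore(data, 4, kind='mean'))     # 66.67  (average of weak and strict)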

Example 1
Project: cms   Author: broadinstitute   File: power_func.py   License: BSD 2-Clause "Simplified" License
def plot_dist(allvals, savefilename="/web/personal/vitti/test.png", numBins=1000):
    # Drop NaNs and infs before scoring and plotting.
    allvals = np.array(allvals)
    allvals = allvals[~np.isnan(allvals)]
    allvals = allvals[~np.isinf(allvals)]

    print("percentile for score = 10: " + str(percentileofscore(allvals, 10)))
    print("percentile for score = 15: " + str(percentileofscore(allvals, 15)))

    if len(allvals) > 0:
        f, ax = plt.subplots(1)
        ax.hist(allvals, bins=numBins)
        plt.savefig(savefilename)
        print('plotted to ' + savefilename)
    return
Example 2
Project: blackSheep   Author: ruggleslab   File: simulate.py   License: MIT License
def generate_output_line(o_thresh, values, ol_dist, pval):
    """Determine the p-value for each sample being significantly
    hyperphosphorylated for the given gene."""
    output_list = []

    for s in range(len(list(values.values())[0])):  # for each sample
        # Count how many sites exceed their outlier threshold (skipping missing values).
        tot_outliers = 0
        for o in o_thresh.keys():
            if values[o][s]:
                if float(values[o][s]) > o_thresh[o]:
                    tot_outliers += 1
        # Empirical p-value for observing at least tot_outliers outliers.
        pv = round((100 - percentileofscore(ol_dist, tot_outliers - 1, kind='weak')) / 100.0, 3)
        if pv <= pval:
            output_list.append(str(pv))
        else:
            output_list.append('NS')

    return output_list
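The p-value line above relies on a small identity worth making explicit: with kind='weak', (100 - percentileofscore(dist, k - 1)) / 100 equals the empirical probability of drawing at least k from dist. A minimal check with synthetic counts (the binomial null here is an illustrative stand-in, not blackSheep's actual simulation):

import numpy as np
from scipy.stats import percentileofscore

rng = np.random.default_rng(7)
ol_dist = rng.binomial(n=20, p=0.1, size=10000)  # stand-in null distribution of outlier counts
k = 6                                            # observed number of outliers

# kind='weak' gives 100 * P(X <= k - 1), so the complement is P(X >= k).
pv = (100 - percentileofscore(ol_dist, k - 1, kind='weak')) / 100.0
print(pv, np.mean(ol_dist >= k))  # identical by construction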
Example 3
Project: PIDGINv3   Author: lhm30   File: predict.py   License: GNU General Public License v3.0
def doPercentileCalculation(model_name):
	global rdkit_mols
	#expensive to unzip training file - so only done if smiles requested
	if options.ad_smiles:
		smiles = get_training_smiles(model_name)
	ad_data = getAdData(model_name)
	def calcPercentile(rdkit_mol):
		sims = DataStructs.BulkTanimotoSimilarity(rdkit_mol,ad_data[:,0])
		bias = ad_data[:,2].astype(float)
		std_dev = ad_data[:,3].astype(float)
		scores = ad_data[:,5].astype(float)
		weights = sims / (bias * std_dev)
		critical_weight = weights.max()
		percentile = percentileofscore(scores,critical_weight)
		if options.ad_smiles:
			critical_smiles = smiles[np.argmax(weights)]
			result = percentile, critical_smiles
		else:
			result = percentile, None
		return result
	ret = [calcPercentile(x) for x in rdkit_mols]
	return model_name, ret

#prediction runner for percentile calculation 
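Stripped of PIDGINv3's model files, the core move in calcPercentile is to rank a query-derived weight against a reference score distribution. A rough sketch of that idea using plain RDKit fingerprints (the molecules and reference scores are invented, and the bias/std_dev weighting from ad_data is omitted):

import numpy as np
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from scipy.stats import percentileofscore

train_smiles = ['CCO', 'CCN', 'c1ccccc1', 'CC(=O)O']
train_fps = [AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(s), 2, nBits=2048)
             for s in train_smiles]
query_fp = AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles('CCCO'), 2, nBits=2048)

sims = np.array(DataStructs.BulkTanimotoSimilarity(query_fp, train_fps))
reference_scores = np.random.rand(1000)  # stand-in for the scores column of ad_data
print(percentileofscore(reference_scores, sims.max()))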
Example 4
Project: datastream.io   Author: MentatInnovations   File: anomaly_detectors.py   License: Apache License 2.0
def score_anomaly(self, x):
        x = pd.Series(x)
        scores = pd.Series([0.01*percentileofscore(self.sample_, z) for z in x])
        return scores 
Example 5
Project: datastream.io   Author: MentatInnovations   File: detector.py   License: Apache License 2.0
def score(self, x):
        from scipy.stats import percentileofscore
        return [0.01*percentileofscore(x, z) for z in x] 
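Examples 4 and 5 share one idiom: dividing the percentile by 100 turns percentileofscore into an empirical-CDF estimate on [0, 1], so scores near 0 or 1 flag points in the tails of the reference sample. A minimal sketch with synthetic data:

import numpy as np
from scipy.stats import percentileofscore

sample = np.random.normal(size=1000)  # reference distribution
x = [-2.0, 0.0, 2.0]                  # points to score

cdf_estimates = [0.01 * percentileofscore(sample, z) for z in x]
print(cdf_estimates)  # roughly [0.02, 0.5, 0.98] for a standard normal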
Example 6
Project: tia   Author: bpsmith   File: perf.py   License: BSD 3-Clause "New" or "Revised" License
def rolling_percentileofscore(series, window, min_periods=None):
    """Computue the score percentile for the specified window."""
    import scipy.stats as stats

    def _percentile(arr):
        score = arr[-1]
        vals = arr[:-1]
        return stats.percentileofscore(vals, score)

    notnull = series.dropna()
    min_periods = min_periods or window
    if notnull.empty:
        return pd.Series(np.nan, index=series.index)
    else:
        return pd.rolling_apply(notnull, window, _percentile, min_periods=min_periods).reindex(series.index) 
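Note that pd.rolling_apply was removed from pandas long ago. On a current pandas, the same computation can be sketched with the Series.rolling API; this is an assumed equivalent, not code from the tia project (the expanding variant in Example 7 maps to Series.expanding the same way):

import numpy as np
import pandas as pd
import scipy.stats as stats

def rolling_percentileofscore_modern(series, window, min_periods=None):
    """Percentile rank of each point against the rest of its trailing window."""
    def _percentile(arr):
        return stats.percentileofscore(arr[:-1], arr[-1])

    notnull = series.dropna()
    if notnull.empty:
        return pd.Series(np.nan, index=series.index)
    return (notnull.rolling(window, min_periods=min_periods or window)
                   .apply(_percentile, raw=True)
                   .reindex(series.index))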
Example 7
Project: tia   Author: bpsmith   File: perf.py   License: BSD 3-Clause "New" or "Revised" License
def expanding_percentileofscore(series, min_periods=None):
    """Compute the score percentile over an ever-expanding window."""
    import scipy.stats as stats

    def _percentile(arr):
        score = arr[-1]
        vals = arr[:-1]
        return stats.percentileofscore(vals, score)

    notnull = series.dropna()
    if notnull.empty:
        return pd.Series(np.nan, index=series.index)
    else:
        return pd.expanding_apply(notnull, _percentile, min_periods=min_periods).reindex(series.index) 
Example 8
Project: NiMARE   Author: neurostuff   File: stats.py   License: MIT License
def null_to_p(test_value, null_array, tail='two'):
    """Return two-sided p-value for test value against null array.
    """
    if tail == 'two':
        p_value = (50 - np.abs(stats.percentileofscore(
            null_array, test_value) - 50.)) * 2. / 100.
    elif tail == 'upper':
        p_value = 1 - (stats.percentileofscore(null_array, test_value) / 100.)
    elif tail == 'lower':
        p_value = stats.percentileofscore(null_array, test_value) / 100.
    else:
        raise ValueError('Argument "tail" must be one of ["two", "upper", '
                         '"lower"]')
    return p_value 
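A quick numerical check of the two-sided branch: for a standard-normal null, a test value of 2.5 should land near the 99.4th percentile and give p ≈ 0.012, matching 2 * (1 - Φ(2.5)). The numbers below are illustrative:

import numpy as np
import scipy.stats as stats

rng = np.random.default_rng(0)
null_array = rng.normal(size=100000)

test_value = 2.5
pct = stats.percentileofscore(null_array, test_value)   # ~99.4
p_two = (50 - np.abs(pct - 50.0)) * 2.0 / 100.0         # ~0.012
print(pct, p_two)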
Example 9
Project: quip_cnn_segmentation   Author: SBU-BMI   File: mask2image_otsu.py   License: BSD 3-Clause "New" or "Revised" License
def draw_random_texture_try(self, only_want_nuc_texture, size0, size1):
        # Pick a random tile and convert it to the HED stain space.
        pathid = int(np.random.rand() * len(self.paths))
        full_tile_path = self.paths[pathid]

        full_tile = np.array(Image.open(join(self.tile_path, full_tile_path)).convert('RGB'))
        hed = rgb2hed((full_tile.astype(np.float32) / 255.0).astype(np.float32))

        # Crop a random patch of the rescaled channel-1 image as nuclear texture.
        nuc_texture = 1.3 * (hed[:, :, 1] - np.min(hed[:, :, 1])) / (np.max(hed[:, :, 1]) - np.min(hed[:, :, 1]))
        x = int(np.random.rand() * (nuc_texture.shape[0] - size0))
        y = int(np.random.rand() * (nuc_texture.shape[1] - size1))
        nuc_texture = nuc_texture[x:x + size0, y:y + size1].copy()

        if only_want_nuc_texture:
            return True, True, True, True, True, True, True, nuc_texture, True

        hed_mean = np.mean(hed[:, :, 0])
        if hed_mean < -1.33 or hed_mean > -1.1:
            return False, False, False, False, False, False, False, False, False

        # Otsu-threshold channel 0, then express the cut as a percentile of the gray levels.
        gray = (255 * (hed[:, :, 0] - hed[:, :, 0].min()) / (hed[:, :, 0].max() - hed[:, :, 0].min())).astype(np.uint8)
        nuc_color_thr, thresholded = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        nuc_color_thr_per = stats.percentileofscore(gray.flatten(), nuc_color_thr)
        nuc_color_per = 100 - (100 - nuc_color_thr_per) * 0.92
        nuc_map_per = 100 - (100 - nuc_color_thr_per) * 1.08
        dense_lvl = (thresholded > 0).sum() / 0.06 / thresholded.size

        to_map_mask = (hed[:, :, 0] > np.percentile(hed[:, :, 0], nuc_map_per))
        to_map_mask = (self.gaussian_blur(to_map_mask.astype(np.float32), 0.5) > 0.50)

        nucl_color_mask = (hed[:, :, 0] > np.percentile(hed[:, :, 0], nuc_color_per)) * \
                          (hed[:, :, 0] < np.percentile(hed[:, :, 0], 99))
        nucl_color = [np.mean(full_tile[:, :, 0][nucl_color_mask]),
                      np.mean(full_tile[:, :, 1][nucl_color_mask]),
                      np.mean(full_tile[:, :, 2][nucl_color_mask])]
        noise_color_mask = (hed[:, :, 0] > np.percentile(hed[:, :, 0],
                                (nuc_color_per + 35) // 2 - np.random.randint(0, 3) - 25)) * \
                           (hed[:, :, 0] < np.percentile(hed[:, :, 0],
                                (nuc_color_per + 35) // 2 - np.random.randint(0, 3) - 12))
        noise_color = [np.mean(full_tile[:, :, 0][noise_color_mask]),
                       np.mean(full_tile[:, :, 1][noise_color_mask]),
                       np.mean(full_tile[:, :, 2][noise_color_mask])]
        if np.min(nucl_color) > 100:
            return False, False, False, False, False, False, False, False, False

        x, y, fx, fy = self.sample_xy_fxfy(full_tile.shape[0] - size0, full_tile.shape[1] - size1, size0, size1)
        cyto_tile = cv2.inpaint(full_tile[x:x + size0, y:y + size1, :],
                                to_map_mask.astype(np.uint8)[x:x + size0, y:y + size1], 3, cv2.INPAINT_TELEA)

        source_tl = full_tile[x:x + size0, y:y + size1, :].copy()
        full_tile = full_tile[fx:fx + size0, fy:fy + size1, :]
        return True, dense_lvl, full_tile, cyto_tile, source_tl, nucl_color, noise_color, nuc_texture, full_tile_path
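The percentile step in the middle of this function, scoring the Otsu cut against the tile's gray-level distribution, can be reproduced standalone. The random image below is a stand-in for a real tile:

import cv2
import numpy as np
from scipy import stats

gray = np.random.randint(0, 256, size=(64, 64), dtype=np.uint8)
thr, _ = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
# Roughly the percentage of pixels at or below the Otsu threshold.
print(stats.percentileofscore(gray.flatten(), thr))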
Example 10
Project: proteinpy   Author: haocai1992   File: electron_density.py   License: MIT License
def fetch_data(self, pdb_id, filepath=None):
        """Take the pdb_id or CCP4 filepath, reads a CCP4 file and return an ElectronDensityData object storing the data in CCP4 file."""
        
        if filepath is None:
            filepath = '../CCP4/{}_2fofc.ccp4'.format(pdb_id)

        try:
            with mrcfile.open(filepath) as mrc_f:
                if mrc_f.data.shape == (mrc_f.header.nz.item(), \
                                        mrc_f.header.ny.item(), \
                                        mrc_f.header.nx.item()):

                    # make sure the data array sequence is section-row-column

                    self.cellb = mrc_f.header.cellb

                    self.column_number = mrc_f.header.nx
                    self.row_number = mrc_f.header.ny
                    self.section_number = mrc_f.header.nz

                    self.start_column_number = mrc_f.header.nxstart
                    self.start_row_number = mrc_f.header.nystart
                    self.start_section_number = mrc_f.header.nzstart

                    self.axis_to_column = mrc_f.header.mapc # 1-x; 2-y; 3-z axis.
                    self.axis_to_row = mrc_f.header.mapr # 1-x; 2-y; 3-z axis.
                    self.axis_to_section = mrc_f.header.maps # 1-x; 2-y; 3-z axis.

                    x_voxel_size = mrc_f.voxel_size.x # voxel size in x axis
                    y_voxel_size = mrc_f.voxel_size.y # voxel size in y axis
                    z_voxel_size = mrc_f.voxel_size.z # voxel size in z axis

                    self.column_voxel_size = [x_voxel_size, y_voxel_size, z_voxel_size]\
                                             [self.axis_to_column-1]
                    self.row_voxel_size = [x_voxel_size, y_voxel_size, z_voxel_size]\
                                          [self.axis_to_row-1]
                    self.section_voxel_size = [x_voxel_size, y_voxel_size, z_voxel_size]\
                                              [self.axis_to_section-1]

                    self.start_column_coord = self.start_column_number * self.column_voxel_size
                    self.start_row_coord = self.start_row_number * self.row_voxel_size
                    self.start_section_coord = self.start_section_number * self.section_voxel_size

                    # An optional rank-normalization of the raw densities was left commented out:
                    # arr_sorted = sorted(mrc_f.data.flatten())
                    # data_s = pd.Series(mrc_f.data.flatten())
                    # self.electron_density_data = data_s.apply(
                    #     lambda v: percentileofscore(arr_sorted, v)).values.reshape(mrc_f.data.shape)
                    self.electron_density_data = mrc_f.data  # numpy array

                else:
                    print("ERROR! INCONSISTENT DATA SHAPE IN HEADER AND BODY.")
        
        except Exception:
            print("ERROR! NO CCP4 FILE FOUND.")

        return self 
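The commented-out normalization above would call percentileofscore once per voxel, which is O(n²) on the flattened map. Assuming the intent is a percentile-rank transform, scipy.stats.rankdata gives the same numbers in O(n log n); the equivalence below (rankdata's average ranks matching kind='rank') is a sketch worth verifying on your own data:

import numpy as np
from scipy.stats import rankdata

def percentile_rank_normalize(data):
    """Map each voxel to its percentile rank within the whole map."""
    flat = data.ravel()
    # 100 * average-rank / n reproduces percentileofscore(flat, v, kind='rank').
    return (100.0 * rankdata(flat) / flat.size).reshape(data.shape)

density = np.random.rand(4, 4, 4)  # stand-in for mrc_f.data
print(percentile_rank_normalize(density).max())  # 100.0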
Example 11
Project: accasim   Author: cgalleguillosm   File: workload_generator.py   License: MIT License
def add_sample(self, submission_times, rush_hours=(8, 17), save=False):
        """Fit arrival-distribution parameters from a sample of submission times.

        :param submission_times: iterable of job submission timestamps.
        :param rush_hours: (start_hour, end_hour) tuple bounding the rush-hour window.
        :param save: if True, persist the fitted parameters to disk.
        :return: None
        """
        total_jobs = len(submission_times)
        assert total_jobs >= 1000, 'Data might not be representative. There are just {} jobs.'.format(total_jobs)
        _bucket_number = lambda _dtime: _dtime.get_hours() * (self.BUCKETS // self.HOURS_PER_DAY) + (_dtime.get_minutes() // 30)
        submission_times = sorted(submission_times)
        sample_size = len(submission_times)
        data = {key: [] for key in ['total', 'rush']}

        init, end = rush_hours

        max_arrive_time_diff = 0
        ia_times = []

        for i, (cur_time, next_time) in enumerate(zip(submission_times[:-1], submission_times[1:])):
            ia_time = abs(next_time - cur_time)  # Must be always positive... using abs just in case
            _datetime = str_datetime(cur_time)
            _hour = _datetime.get_hours()
            _pos = _bucket_number(_datetime)
            
            data['total'].append(_pos)
            ia_times.append(ia_time)
            # Store jobs that were submitted at rush hours
            if init < _hour < end:
                data['rush'].append(_pos)
                
            if ia_time > max_arrive_time_diff:
                max_arrive_time_diff = ia_time
        avg_iatimes = sum(ia_times) / len(ia_times)
        avg_percentile = percentileofscore(ia_times, avg_iatimes)
        iatimes_stdev = pstdev(ia_times)
        
        _log_arrive_time = log(max_arrive_time_diff)
        
        if self.TOO_MUCH_ARRIVE_TIME > _log_arrive_time:
            self.TOO_MUCH_ARRIVE_TIME = _log_arrive_time

        if not self.params:
            for key in data:
                if not data[key]:
                    continue
                self.params[key], _, _ = self._generate_dist_params(data[key])

                if self.distribution_plot:
                    self._save_distribution_plot('arrive distribution', data, key)
            if save:
                filename = 'arrive_params-{}'.format(int(time.time()))
                self._save_parameters(filename, self.params) 
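The avg_percentile value computed above answers: what share of inter-arrival gaps fall below the mean gap? For bursty, roughly exponential arrivals this sits well above 50, since a few long gaps drag the mean up. A synthetic illustration:

import numpy as np
from scipy.stats import percentileofscore

rng = np.random.default_rng(1)
ia_times = rng.exponential(scale=60.0, size=5000)  # synthetic gaps, in seconds

print(percentileofscore(ia_times, ia_times.mean()))  # ~63 for an exponential (1 - 1/e)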
Example 12
Project: disparity_filter   Author: DerwenAI   File: disparity.py   License: MIT License
def disparity_filter (graph):
    """
    implements a disparity filter, based on multiscale backbone networks
    https://arxiv.org/pdf/0904.2389.pdf
    """
    alpha_measures = []
    
    for node_id in graph.nodes():
        node = graph.node[node_id]
        degree = graph.degree(node_id)
        strength = 0.0

        for id0, id1 in graph.edges(nbunch=[node_id]):
            edge = graph[id0][id1]
            strength += edge["weight"]

        node["strength"] = strength

        for id0, id1 in graph.edges(nbunch=[node_id]):
            edge = graph[id0][id1]

            norm_weight = edge["weight"] / strength
            edge["norm_weight"] = norm_weight

            if degree > 1:
                try:
                    if norm_weight == 1.0:
                        norm_weight -= 0.0001

                    alpha = get_disparity_significance(norm_weight, degree)
                except AssertionError:
                    report_error("disparity {}".format(repr(node)), fatal=True)

                edge["alpha"] = alpha
                alpha_measures.append(alpha)
            else:
                edge["alpha"] = 0.0

    for id0, id1 in graph.edges():
        edge = graph[id0][id1]
        edge["alpha_ptile"] = percentileofscore(alpha_measures, edge["alpha"]) / 100.0

    return alpha_measures


######################################################################
## related metrics 
Example 13
Project: nelpy   Author: nelpy   File: replay.py   License: MIT License
def score_hmm_events_no_xval(bst, training=None, validation=None, num_states=30, n_shuffles=5000, shuffle='row-wise', verbose=False):
    """same as score_hmm_events, but train on training set, and only score validation set..."""
    if shuffle == 'row-wise':
        rowwise = True
    elif shuffle == 'col-wise':
        rowwise = False
    else:
        shuffle = 'timeswap'

    scores_hmm = np.zeros(len(validation))
    scores_hmm_shuffled = np.zeros((len(validation), n_shuffles))

    PBEs_train = bst[training]
    PBEs_test = bst[validation]

    # train HMM on all training PBEs
    hmm = hmmutils.PoissonHMM(n_components=num_states, random_state=0, verbose=False)
    hmm.fit(PBEs_train)

    # reorder states according to transmat ordering
    transmat_order = hmm.get_state_order('transmat')
    hmm.reorder_states(transmat_order)

    # compute scores_hmm (log likelihoods) of validation set:
    scores_hmm[:] = hmm.score(PBEs_test)

    if shuffle == 'timeswap':
        _, scores_tswap_hmm = score_hmm_timeswap_shuffle(bst=PBEs_test,
                                                        hmm=hmm,
                                                        n_shuffles=n_shuffles)

        scores_hmm_shuffled[:,:] = scores_tswap_hmm.T
    else:
        hmm_shuffled = copy.deepcopy(hmm)
        for nn in range(n_shuffles):
            # shuffle transition matrix:
            if rowwise:
                hmm_shuffled.transmat_ = shuffle_transmat(hmm_shuffled.transmat)
            else:
                hmm_shuffled.transmat_ = shuffle_transmat_Kourosh_breaks_stochasticity(hmm_shuffled.transmat)
                hmm_shuffled.transmat_ = hmm_shuffled.transmat / np.tile(hmm_shuffled.transmat.sum(axis=1), (hmm_shuffled.n_components, 1)).T

            # score validation set with shuffled HMM
            scores_hmm_shuffled[:, nn] = hmm_shuffled.score(PBEs_test)

    n_scores = len(scores_hmm)
    scores_hmm_percentile = np.array([stats.percentileofscore(scores_hmm_shuffled[idx], scores_hmm[idx], kind='mean') for idx in range(n_scores)])

    return scores_hmm, scores_hmm_shuffled, scores_hmm_percentile 
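The final line is a generic Monte-Carlo significance recipe: each real score is ranked against its own shuffle distribution with kind='mean'. Stripped of the HMM machinery it reduces to the sketch below (all numbers synthetic):

import numpy as np
import scipy.stats as stats

rng = np.random.default_rng(42)
n_events, n_shuffles = 5, 5000
scores = rng.normal(loc=2.0, size=n_events)         # observed event scores
shuffled = rng.normal(size=(n_events, n_shuffles))  # per-event null scores

percentiles = np.array([
    stats.percentileofscore(shuffled[i], scores[i], kind='mean')
    for i in range(n_events)
])
print(percentiles)  # values near 100 mark events unlikely under the shuffle null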
Example 14
Project: gs-quant   Author: goldmansachs   File: statistics.py   License: Apache License 2.0
def percentiles(x: pd.Series, y: pd.Series = None, w: Union[Window, int] = Window(None, 0)) -> pd.Series:
    """
    Rolling percentiles over given window

    :param x: value series
    :param y: distribution series
    :param w: Window or int: number of observations and ramp up to use. e.g. Window(22, 10) where 22 is the window size
    and 10 the ramp up value. Window size defaults to length of series.
    :return: timeseries of percentiles

    **Usage**

    Calculate `percentile rank <https://en.wikipedia.org/wiki/Percentile_rank>`_ of :math:`y` in the sample distribution
    of :math:`x` over a rolling window of length :math:`w`:

    :math:`R_t = \\frac{\sum_{i=t-N+1}^{t}{[X_i<{Y_t}]}+0.5\sum_{i=t-N+1}^{t}{[X_i={Y_t}]}}{N}\\times100\%`

    Where :math:`N` is the number of observations in a rolling window. If :math:`y` is not provided, calculates
    percentiles of :math:`x` over its historical values. If window length :math:`w` is not provided, uses an
    ever-growing history of values. If :math:`w` is greater than the available data size, returns empty.

    **Examples**

    Compute percentile ranks of a series in the sample distribution of a second series over :math:`22` observations

    >>> a = generate_series(100)
    >>> b = generate_series(100)
    >>> percentiles(a, b, 22)

    **See also**

    :func:`zscores`

    """
    w = normalize_window(x, w)
    if x.empty:
        return x

    if y is None:
        y = x.copy()

    res = pd.Series(dtype=float)
    for idx, val in y.items():
        sample = x[:idx][-w.w:]
        res.loc[idx] = percentileofscore(sample, val, kind='mean')

    return apply_ramp(res, w)