Python pandas.value_counts() Examples

The following are 27 code examples showing how to use pandas.value_counts(). They are extracted from open source projects; the project, author, file, and license are listed above each example so you can locate the original source.

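Before the project examples, here is a minimal sketch of the API itself (the sample data below is made up for illustration and is not taken from any of the projects). pandas.value_counts() tallies how often each unique value occurs and returns a Series sorted by frequency; recent pandas releases favor the equivalent Series.value_counts() method, which several examples below also use.

import pandas as pd

# Illustrative data only, not from any of the projects below.
s = pd.Series(["a", "b", "a", "c", "a", None])

print(pd.value_counts(s))                  # top-level form: a -> 3, b -> 1, c -> 1
print(s.value_counts())                    # equivalent method form
print(pd.value_counts(s, normalize=True))  # relative frequencies instead of counts
print(pd.value_counts(s, dropna=False))    # keep missing values in the tally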

Example 1
Project: NanoPlot   Author: wdecoster   File: spatial_heatmap.py   License: GNU General Public License v3.0
def spatial_heatmap(array, path, title=None, color="Greens", figformat="png"):
    """Taking channel information and creating post run channel activity plots."""
    logging.info("Nanoplotter: Creating heatmap of reads per channel using {} reads."
                 .format(array.size))
    activity_map = Plot(
        path=path + "." + figformat,
        title="Number of reads generated per channel")
    layout = make_layout(maxval=np.amax(array))
    valueCounts = pd.value_counts(pd.Series(array))
    for entry in valueCounts.keys():
        layout.template[np.where(layout.structure == entry)] = valueCounts[entry]
    plt.figure()
    ax = sns.heatmap(
        data=pd.DataFrame(layout.template, index=layout.yticks, columns=layout.xticks),
        xticklabels="auto",
        yticklabels="auto",
        square=True,
        cbar_kws={"orientation": "horizontal"},
        cmap=color,
        linewidths=0.20)
    ax.set_title(title or activity_map.title)
    activity_map.fig = ax.get_figure()
    activity_map.save(format=figformat)
    plt.close("all")
    return [activity_map] 
Example 2
Project: underthesea   Author: undertheseanlp   File: tc_.py   License: GNU General Public License v3.0
def _analyze_field(self, df, id, output_folder=".", n_head=10):
        id = str(id)
        m = df.shape[1]
        df.columns = [str(i) for i in range(m)]

        agg_dict = dict()
        agg_dict[id] = "size"
        for i in range(int(id)):
            agg_dict[str(i)] = lambda x: ", ".join(
                pd.value_counts(x).index[:n_head])
        name_dict = dict()
        name_dict[id] = "count"
        df_analyze = df.groupby(id).agg(agg_dict).rename(
            columns=name_dict).reset_index()
        filename = join(output_folder, "column-%s-analyze.xlsx" % id)

        log = u""
        log += u"Tags         : {}\n".format(df_analyze.shape[0])
        tags = df_analyze[id].to_dict().values()
        tags = sorted(tags)
        log += u"List tags    : {}\n".format(u", ".join(tags))
        df_analyze.to_excel(filename, index=False)
        return log 
Example 3
Project: underthesea   Author: undertheseanlp   File: tagged_corpus.py   License: GNU General Public License v3.0
def _analyze_field(self, df, id, output_folder=".", n_head=10):
        id = str(id)
        m = df.shape[1]
        df.columns = [str(i) for i in range(m)]

        agg_dict = dict()
        agg_dict[id] = "size"
        for i in range(int(id)):
            agg_dict[str(i)] = lambda x: ", ".join(
                pd.value_counts(x).index[:n_head])
        name_dict = dict()
        name_dict[id] = "count"
        df_analyze = df.groupby(id).agg(agg_dict).rename(
            columns=name_dict).reset_index()
        filename = join(output_folder, "column-%s-analyze.xlsx" % id)

        log = u""
        log += u"Tags         : {}\n".format(df_analyze.shape[0])
        tags = df_analyze[id].to_dict().values()
        tags = sorted(tags)
        log += u"List tags    : {}\n".format(u", ".join(tags))
        df_analyze.to_excel(filename, index=False)
        return log 
Example 4
Project: dask-ml   Author: dask   File: test_split.py   License: BSD 3-Clause "New" or "Revised" License
def test_blockwise_shufflesplit():
    splitter = dask_ml.model_selection.ShuffleSplit(random_state=0)
    assert splitter.get_n_splits() == 10
    gen = splitter.split(dX)

    train_idx, test_idx = next(gen)
    assert isinstance(train_idx, da.Array)
    assert isinstance(test_idx, da.Array)

    assert train_idx.shape == (99,)  # 90% of 110
    assert test_idx.shape == (11,)

    assert train_idx.chunks == ((45, 45, 9),)
    assert test_idx.chunks == ((5, 5, 1),)

    counts = pd.value_counts(train_idx.compute())
    assert counts.max() == 1

    N = len(X)

    np.testing.assert_array_equal(
        np.unique(da.concatenate([train_idx, test_idx])), np.arange(N)
    ) 
Example 5
Project: adversarial-policies   Author: HumanCompatibleAI   File: visualize.py   License: MIT License
def _visualize_helper(
    model_dir, output_dir, subsample_rate, save_type, ordering, external_legend_params
):
    logger.info("Generating figures")

    # Data
    metadata_df = pd.read_csv(os.path.join(model_dir, "metadata.csv"))
    cluster_ids = np.load(os.path.join(model_dir, "cluster_ids.npy"))
    metadata_df["ax_1"] = cluster_ids[:, 0]
    metadata_df["ax_2"] = cluster_ids[:, 1]
    metadata_df["opponent_id"] = metadata_df["opponent_id"].apply(ABBREVIATIONS.get)

    def save_path(prefix):
        return osp.join(output_dir, f"{prefix}.{save_type}")

    counts = pd.value_counts(metadata_df["opponent_id"])
    min_counts = counts.min()
    opponent_groups = metadata_df.groupby("opponent_id")
    opponent_dfs = {name: group.sample(n=min_counts) for name, group in opponent_groups}
    opponent_dfs = [opponent_dfs[label] for label in ordering]
    metadata_df = pd.concat(opponent_dfs)

    _plot_and_save_chart(save_path("combined"), [metadata_df])
    _plot_and_save_chart(save_path("subsampled"), [metadata_df.sample(frac=subsample_rate)])
    _plot_and_save_chart(save_path("sidebyside"), opponent_dfs)

    if external_legend_params is not None:
        _external_legend(osp.join(output_dir, "external_legend.pdf"))

    logger.info("Visualization complete") 
Example 6
Project: CausalGAN   Author: mkocaoglu   File: pairwise.py   License: MIT License
def calc_tvd(label_dict,attr):
    '''
    attr should be a 0,1 pandas dataframe with
    columns corresponding to label names

    for example:
    names=zip(*self.graph)[0]
    calc_tvd(label_dict,attr[names])

    label_dict should be a dictionary key:1d-array of samples
    '''
    ####Calculate Total Variation####
    if np.min(attr.values)<0:
        raise ValueError('calc_tvd received \
                 attr that may not have been in {0,1}')

    label_names=label_dict.keys()
    attr=attr[label_names]

    df2=attr.drop_duplicates()
    df2 = df2.reset_index(drop = True).reset_index()
    df2=df2.rename(columns = {'index':'ID'})
    real_data_id=pd.merge(attr,df2)
    real_counts = pd.value_counts(real_data_id['ID'])
    real_pdf=real_counts/len(attr)

    label_list_dict={k:np.round(v.ravel()) for k,v in label_dict.items()}
    df_dat=pd.DataFrame.from_dict(label_list_dict)
    dat_id=pd.merge(df_dat,df2,on=label_names,how='left')
    dat_counts=pd.value_counts(dat_id['ID'])
    dat_pdf = dat_counts / dat_counts.sum()
    diff=real_pdf.subtract(dat_pdf, fill_value=0)
    tvd=0.5*diff.abs().sum()
    return tvd 
Example 7
Project: ibeis   Author: Erotemic   File: train_main.py   License: Apache License 2.0
def class_weights(self):
        import pandas as pd
        label_freq = pd.value_counts(self.labels)
        class_weights = label_freq.median() / label_freq
        class_weights = class_weights.sort_index().values
        class_weights = torch.from_numpy(class_weights.astype(np.float32))
        return class_weights 
Example 8
Project: vivarium   Author: ihmeuw   File: test_randomness_index_map.py   License: GNU General Public License v3.0
def test_hash_uniformity(map_size_and_hashed_values):
    n, h = map_size_and_hashed_values

    k = len(h)
    num_bins = k//5  # Want about 5 items per bin for chi-squared
    bins = np.linspace(0, n + 1, num_bins)

    binned_data = pd.cut(h, bins)
    distribution = pd.value_counts(binned_data).sort_index()
    c, p = chisquare(distribution)

    assert p > 0.05, "Data not uniform" 
Example 9
Project: pandas-summary   Author: mouradmourafiq   File: __init__.py   License: MIT License
def columns_types(self):
        return pd.value_counts(self.columns_stats.loc['types']) 
Example 10
Project: pandas-summary   Author: mouradmourafiq   File: __init__.py   License: MIT License
def _get_deviation_of_mean(self, series, multiplier=3):
        """
        Returns count of values deviating from the mean, i.e. larger than `multiplier` * `std`.
        :type series:
        :param multiplier:
        :return:
        """
        capped_series = np.minimum(
            series, series.mean() + multiplier * series.std())
        count = pd.value_counts(series != capped_series)
        count = count[True] if True in count else 0
        perc = self._percent(count / self.length)
        return count, perc 
Example 11
Project: pandas-summary   Author: mouradmourafiq   File: __init__.py   License: MIT License
def _get_median_absolute_deviation(self, series, multiplier=3):
        """
        Returns count of values larger than `multiplier` * `mad`
        :type series:
        :param multiplier:
        :return (array):
        """
        capped_series = np.minimum(
            series, series.median() + multiplier * series.mad())
        count = pd.value_counts(series != capped_series)
        count = count[True] if True in count else 0
        perc = self._percent(count / self.length)
        return count, perc 
Example 12
Project: pandas-summary   Author: mouradmourafiq   File: __init__.py   License: MIT License
def _get_categorical_summary(self, column):
        series = self.df[column]
        # Only run if at least 1 non-missing value
        value_counts = series.value_counts()
        stats = {
            'top': '{}: {}'.format(value_counts.index[0], value_counts.iloc[0]),
        }
        return pd.concat([pd.Series(stats, name=column),
                          self.columns_stats[column]],
                         sort=True) 
Example 13
Project: pandas-summary   Author: mouradmourafiq   File: __init__.py   License: MIT License
def _get_bool_summary(self, column):
        series = self.df[column]

        stats = {}
        for class_name, class_value in dict(series.value_counts()).items():
            stats['"{}" count'.format(class_name)] = '{}'.format(class_value)
            stats['"{}" perc'.format(class_name)] = '{}'.format(
                self._percent(class_value / self.length))

        return pd.concat([pd.Series(stats, name=column),
                          self.columns_stats[column]],
                         sort=True) 
Example 14
Project: underthesea   Author: undertheseanlp   File: tc_.py   License: GNU General Public License v3.0
def _analyze_first_token(self, df, id, output_folder="."):
        filename = join(output_folder, "column-%s-analyze.xlsx" % id)
        df_analyze = df[id].value_counts().reset_index(name="count")
        df_analyze = df_analyze.rename(columns={"index": "0"})
        df_analyze.to_excel(filename, index=False)
        log = u""
        log += u"Unique words : {}\n".format(df_analyze.shape[0])
        log += u"Top words    : {}\n".format(
            u", ".join(list(df_analyze["0"].to_dict().values())[:20]))
        return log 
Example 15
Project: underthesea   Author: undertheseanlp   File: tagged_corpus.py   License: GNU General Public License v3.0
def _analyze_first_token(self, df, id, output_folder="."):
        filename = join(output_folder, "column-%s-analyze.xlsx" % id)
        df_analyze = df[id].value_counts().reset_index(name="count")
        df_analyze = df_analyze.rename(columns={"index": "0"})
        df_analyze.to_excel(filename, index=False)
        log = u""
        log += u"Unique words : {}\n".format(df_analyze.shape[0])
        log += u"Top words    : {}\n".format(
            u", ".join(list(df_analyze["0"].to_dict().values())[:20]))
        return log 
Example 16
Project: arche   Author: scrapinghub   File: others.py   License: MIT License
def garbage_symbols(df: pd.DataFrame) -> Result:
    """Find unwanted symbols in `np.object` columns.

    Returns:
        A result containing item keys per field which contained any trash symbol
    """
    garbage = (
        r"(?P<spaces>^\s|\s$)"
        r"|(?P<html_entities>&[a-zA-Z]{2,}?;|&#\d*?;)"
        r"|(?P<css>[.#@][^\d{}#.\s][^{}#.]+?{(?:[^:;{}]+?:[^:;{}]+?;)+?\s*?})"
        r"|(?P<html_tags></??(?:h\d|b|u|i|div|ul|ol|li|table|tbody|th|tr|td|p|a|br|img|sup|SUP|"
        r"blockquote)\s*?/??>|<!--|-->)"
    )

    errors = {}
    row_keys: Set = set()
    rule_result = Result("Garbage Symbols", items_count=len(df))

    for column in tqdm(df.select_dtypes([np.object]).columns, desc="Garbage Symbols"):
        matches = df[column].apply(str).str.extractall(garbage, flags=re.IGNORECASE)
        if not matches.empty:
            error_keys = df.loc[matches.unstack().index.values].index
            bad_texts = matches.stack().value_counts().index.sort_values().tolist()
            # escape backslashes for markdown repr, `\n > \\n`
            bad_texts = [
                f"'{codecs.encode(bx, 'unicode_escape').decode()[:20]}'"
                for bx in bad_texts
            ]
            error = (
                f"{len(error_keys)/len(df)*100:.1f}% of '{column}' "
                f"values contain `{', '.join(bad_texts)}`"
            )

            errors[error] = list(error_keys)
            row_keys = row_keys.union(error_keys)
    if errors:
        rule_result.add_error(
            f"{len(row_keys)/len(df) * 100:.1f}% ({len(row_keys)}) items affected",
            errors=errors,
        )
    return rule_result 
Example 17
Project: moses   Author: molecularsets   File: prepare_dataset.py   License: MIT License
def split_dataset(dataset, seed):
    logger.info('Splitting the dataset')
    scaffolds = pd.value_counts(dataset['scaffold'])
    scaffolds = sorted(scaffolds.items(), key=lambda x: (-x[1], x[0]))
    test_scaffolds = set([x[0] for x in scaffolds[9::10]])
    dataset['SPLIT'] = 'train'
    test_scaf_idx = [x in test_scaffolds for x in dataset['scaffold']]
    dataset.loc[test_scaf_idx, 'SPLIT'] = 'test_scaffolds'
    test_idx = dataset.loc[dataset['SPLIT'] == 'train'].sample(
        frac=0.1, random_state=seed
    ).index
    dataset.loc[test_idx, 'SPLIT'] = 'test'
    dataset.drop('scaffold', axis=1, inplace=True)
    return dataset 
Example 18
Project: econtools   Author: dmsul   File: core.py   License: BSD 3-Clause "New" or "Revised" License
def df_cluster(n, k, cluster_id):
    g = len(pd.value_counts(cluster_id))
    df = g - 1
    vce_correct = ((n - 1) / (n - k)) * (g / (g - 1))
    return df, vce_correct, g 
Example 19
Project: dask-ml   Author: dask   File: test_split.py   License: BSD 3-Clause "New" or "Revised" License
def test_kfold(shuffle):
    splitter = dask_ml.model_selection.KFold(
        n_splits=5, random_state=0, shuffle=shuffle
    )
    assert splitter.get_n_splits() == 5
    gen = splitter.split(dX)

    train_idx, test_idx = next(gen)
    assert isinstance(train_idx, da.Array)
    assert isinstance(test_idx, da.Array)

    assert train_idx.shape == (88,)  # 80% of 110
    assert test_idx.shape == (22,)

    assert train_idx.chunks == ((28, 50, 10),)
    assert test_idx.chunks == ((22,),)

    counts = pd.value_counts(train_idx.compute())
    assert counts.max() == 1

    N = len(X)

    np.testing.assert_array_equal(
        np.unique(da.concatenate([train_idx, test_idx])), np.arange(N)
    )

    expected_chunks = [
        (((22, 6, 50, 10),), ((22,),)),
        (((44, 34, 10),), ((6, 16),)),
        (((50, 16, 12, 10),), ((22,),)),
        (((50, 38),), ((12, 10),)),
    ]

    for (exp_train_idx, exp_test_idx), (train_idx, test_idx) in zip(
        expected_chunks, gen
    ):
        assert train_idx.chunks == exp_train_idx
        assert test_idx.chunks == exp_test_idx 
Example 20
Project: fitbit-analyzer   Author: 5agado   File: sleepStats.py   License: Apache License 2.0
def normalizedIntradayCountStats(intradayStats, limitCount=5):
    # For each minute, number of days for which we have a valid measure (record)
    notNullCount = intradayStats.count()
    # Ignore minutes where we have low level of records
    notNullCount[notNullCount < limitCount] = None
    # Count how many times each value appears for each minute
    valueCount = intradayStats.apply(pd.value_counts)
    # Normalize each minute by records count
    res = valueCount.div(notNullCount, axis=1)
    return res 
Example 21
Project: fitbit-analyzer   Author: 5agado   File: testPlotting.py   License: Apache License 2.0
def test_plottingOnIntradayStats(self):
        filepath =  RESOURCE_PATH + "\\unittest\\test_sleep_basic01.csv"
        data1 = utils.loadIntradayData(filepath)
        filepath =  RESOURCE_PATH + "\\unittest\\test_sleep_basic02.csv"
        data2 = utils.loadIntradayData(filepath)
        stats = sleepStats.generateStatsFrom([data1, data2],
                                             sleepStats.STATS_NAME_INTRADAY)

        data = stats.apply(pd.value_counts)
        mplot.plotSleepValueHeatmap(data, sleepValue=1) 
Example 22
Project: postman_problems   Author: brooksandrew   File: test_example_sleeping_giant.py   License: MIT License
def test_get_shortest_paths_distances():
    df = read_edgelist(EDGELIST)
    graph = create_networkx_graph_from_edgelist(df, edge_id='id')

    odd_nodes = get_odd_nodes(graph)
    odd_node_pairs = list(itertools.combinations(odd_nodes, 2))

    # coarsely checking structure of `get_shortest_paths_distances` return value
    odd_node_pairs_shortest_paths = get_shortest_paths_distances(graph, odd_node_pairs, 'distance')
    assert len(odd_node_pairs_shortest_paths) == 630
    assert type(odd_node_pairs_shortest_paths) == dict

    # check that each node name appears the same number of times in `get_shortest_paths_distances` return value
    node_names = list(itertools.chain(*[i[0] for i in odd_node_pairs_shortest_paths.items()]))
    assert set(pd.value_counts(node_names)) == set([35]) 
Example 23
Project: dtale   Author: man-group   File: views.py   License: GNU Lesser General Public License v2.1
def describe(data_id, column):
    """
    :class:`flask:flask.Flask` route which returns standard details about column data using
    :meth:`pandas:pandas.DataFrame.describe` to the front-end as JSON

    :param data_id: integer string identifier for a D-Tale process's data
    :type data_id: str
    :param column: required dash separated string "START-END" stating a range of row indexes to be returned
                   to the screen
    :return: JSON {
        describe: object representing output from :meth:`pandas:pandas.Series.describe`,
        unique_data: array of unique values when data has <= 100 unique values
        success: True/False
    }

    """
    data = global_state.get_data(data_id)[[column]]
    additional_aggs = None
    curr_dtypes = global_state.get_dtypes(data_id)
    dtype = next(
        (
            dtype_info["dtype"]
            for dtype_info in curr_dtypes
            if dtype_info["name"] == column
        ),
        None,
    )
    if classify_type(dtype) in ["I", "F"]:
        additional_aggs = ["sum", "median", "mode", "var", "sem", "skew", "kurt"]
    code = build_code_export(data_id)
    desc, desc_code = load_describe(data[column], additional_aggs=additional_aggs)
    code += desc_code
    return_data = dict(describe=desc, success=True)
    uniq_vals = data[column].value_counts().sort_values(ascending=False)
    total_uniq_vals = len(uniq_vals)
    if "unique" not in return_data["describe"]:
        return_data["describe"]["unique"] = json_int(total_uniq_vals, as_string=True)
    uniq_vals.index.name = "value"
    uniq_vals.name = "count"
    uniq_vals = uniq_vals.reset_index()
    uniq_f, _ = build_formatters(uniq_vals)
    if total_uniq_vals <= 100:
        code.append("uniq_vals = data['{}'].unique()".format(column))
        return_data["uniques"] = dict(
            data=uniq_f.format_dicts(uniq_vals.itertuples()),
            total=total_uniq_vals,
            top=False,
        )
    else:  # get top 100 most common values
        return_data["uniques"] = dict(
            data=uniq_f.format_dicts(uniq_vals.head(100).itertuples()),
            total=total_uniq_vals,
            top=True,
        )
        uniq_code = "uniq_vals = data['{}'].value_counts().sort_values(ascending=False).head(100).index.values"
        code.append(uniq_code.format(column))
    return_data["code"] = "\n".join(code)
    return jsonify(return_data) 
Example 24
Project: pyseqlogo   Author: saketkc   File: format_utils.py   License: MIT License
def read_alignment(infile, data_type='fasta', seq_type='dna', pseudo_count=1):
    """Read alignment file as motif

    Parameters
    ----------

    infile: str
        Path to input alignment file

    data_type: str
        'fasta', 'stockholm', etc. as supported by Bio.AlignIO

    seq_type: str
        'dna', 'rna' or 'aa'

    pseudo_count: int
        pseudo counts to add before calculating information content

    Returns
    -------

    (motif, information_content) : tuple
        A motif instance followed by total information content of the motif

    """
    alignment = AlignIO.read(infile, data_type)
    data = []
    for aln in alignment:
        data.append([x for x in str(aln.seq)])
    df = pd.DataFrame(data)
    df_counts = df.apply(pd.value_counts, 0)
    total = df_counts[[0]].sum()
    df_counts = df_counts[df_counts.index != '-']
    # Remove - from counts
    counts_dict = df_counts.to_dict(orient='index')
    counts = {}
    for key, val in counts_dict.items():
        counts[key] = list(val.values())
    return counts, total
    """
    summary_align = AlignInfo.SummaryInfo(alignment)
    if seq_type == 'dna':
        info_content = summary_align.information_content(e_freq_table = naive_freq_tables['dna'],
                                                         chars_to_ignore = ['N'],
                                                         pseudo_count = pseudo_count)
    elif seq_type == 'rna':
        info_content = summary_align.information_content(e_freq_table = naive_freq_tables['rna'],
                                                         chars_to_ignore = ['N'],
                                                         pseudo_count = pseudo_count)
    else:
        info_content = summary_align.information_content(e_freq_table = naive_freq_tables['aa'],
                                                         pseudo_count = pseudo_count)
    motif = create_motif_from_alignment(alignment)
    return (motif, summary_align.ic_vector)
    """ 
Example 25
Project: arche   Author: scrapinghub   File: others.py   License: MIT License
def compare_boolean_fields(
    source_df: pd.DataFrame,
    target_df: pd.DataFrame,
    err_thr: float = 0.10,
    warn_thr: float = 0.05,
) -> Result:
    """Compare booleans distribution between two dataframes

    Returns:
        A result containing dataframe with distributions and messages if differences
        are in thresholds
    """

    source_bool = source_df.select_dtypes(include="bool")
    target_bool = target_df.select_dtypes(include="bool")

    result = Result("Boolean Fields")
    if not fields_to_compare(source_bool, target_bool):
        result.outcome = Outcome.SKIPPED
        return result

    dummy = pd.DataFrame(columns=[True, False])
    source_counts = pd.concat(
        [dummy, source_bool.apply(pd.value_counts, normalize=True).T], sort=False
    ).fillna(0.0)
    target_counts = pd.concat(
        [dummy, target_bool.apply(pd.value_counts, normalize=True).T], sort=False
    ).fillna(0.0)
    difs = (source_counts - target_counts)[True]

    bool_covs = pd.concat(
        [
            source_counts.rename("{}_source".format),
            target_counts.rename("{}_target".format),
        ]
    ).sort_index()
    bool_covs.name = "Coverage for boolean fields"
    result.stats.append(bool_covs)

    err_diffs = difs[difs.abs() > err_thr]
    if not err_diffs.empty:
        result.add_error(
            f"{', '.join(err_diffs.index)} relative frequencies differ "
            f"by more than {err_thr:.0%}"
        )

    warn_diffs = difs[(difs.abs() > warn_thr) & (difs.abs() <= err_thr)]
    if not warn_diffs.empty:
        result.add_warning(
            f"{', '.join(warn_diffs.index)} relative frequencies differ by "
            f"{warn_thr:.0%}-{err_thr:.0%}"
        )

    return result 
Example 26
Project: modin   Author: modin-project   File: test_general.py   License: Apache License 2.0
def test_value_counts(normalize, bins, dropna):
    def sort_index_for_equal_values(result, ascending):
        is_range = False
        is_end = False
        i = 0
        new_index = np.empty(len(result), dtype=type(result.index))
        while i < len(result):
            j = i
            if i < len(result) - 1:
                while result[result.index[i]] == result[result.index[i + 1]]:
                    i += 1
                    if is_range is False:
                        is_range = True
                    if i == len(result) - 1:
                        is_end = True
                        break
            if is_range:
                k = j
                for val in sorted(result.index[j : i + 1], reverse=not ascending):
                    new_index[k] = val
                    k += 1
                if is_end:
                    break
                is_range = False
            else:
                new_index[j] = result.index[j]
            i += 1
        return pandas.Series(result, index=new_index)

    # We sort indices for pandas result because of issue #1650
    values = np.array([3, 1, 2, 3, 4, np.nan])
    modin_result = pd.value_counts(values, normalize=normalize, ascending=False)
    pandas_result = sort_index_for_equal_values(
        pandas.value_counts(values, normalize=normalize, ascending=False), False
    )
    df_equals(modin_result, pandas_result)

    modin_result = pd.value_counts(values, bins=bins, ascending=False)
    pandas_result = sort_index_for_equal_values(
        pandas.value_counts(values, bins=bins, ascending=False), False
    )
    df_equals(modin_result, pandas_result)

    modin_result = pd.value_counts(values, dropna=dropna, ascending=True)
    pandas_result = sort_index_for_equal_values(
        pandas.value_counts(values, dropna=dropna, ascending=True), True
    )
    df_equals(modin_result, pandas_result) 
Example 27
Project: stacked_generalization   Author: fukatani   File: kaggle_titanic.py   License: Apache License 2.0
def pre_process(self, drop=True, title_to_onehot=True, norm_fare=True):
        def get_title(name):
            title_search = re.search(' ([A-Za-z]+)\.', name)
            if title_search:
                return title_search.group(1)
            return ""

        def normalize_fare(data):
            new_data = None
            for embarked in (0, 1, 2):
                temp = data[data.Embarked == embarked]
                temp['Fare'] /= temp['Fare'].values.mean()
                if new_data is None:
                    new_data = temp
                else:
                    new_data = pd.concat([new_data, temp])
            new_data = new_data.sort('PassengerId')
            return new_data

        data = pd.read_csv(self.file_name).replace('male',0).replace('female',1)
        data['Age'].fillna(data.Age.median(), inplace=True)
        data['Fare'].fillna(data.Fare.median(), inplace=True)
        data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
        data['Embarked'] = data['Embarked'].replace('S',0).replace('C',1).replace('Q',2)
        data['Embarked'].fillna(0, inplace=True)
        if norm_fare:
            data = normalize_fare(data)

        # Get all the titles and print how often each one occurs.
        titles = data["Name"].apply(get_title)
        print(pd.value_counts(titles))

        # Map each title to an integer.  Some titles are very rare, and are compressed into the same codes as other titles.
        title_mapping = {"Dona": 1, "Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6, "Major": 7, "Col": 7, "Mlle": 8, "Mme": 8, "Don": 9, "Lady": 10, "Countess": 10, "Jonkheer": 10, "Sir": 9, "Capt": 7, "Ms": 2}
        for k,v in title_mapping.items():
            titles[titles == k] = v

        # Add in the title column.
        data['Title'] = titles
        data['Title'].fillna(1, inplace=True)
        #data['Pos'] = data["Title"] + data['Pclass']
        if drop:
            #data = data.drop(['Name', 'SibSp', 'Parch', 'Ticket', 'Pclass', 'Cabin', 'Embarked'], axis=1)
            data = data.drop(['Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked'], axis=1)
            #data = data.drop(['Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked', 'Pclass', 'Title'], axis=1)
        print(data.keys())
        if title_to_onehot:
            self.encode(data, 'Title', [i for i in range(1, 11)])
            data = data.drop(['Title'], axis=1)
        return data