Python pandas.crosstab() Examples

The following are 30 code examples of pandas.crosstab(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pandas, or try the search function.
Example #1
Source File: test_pivot.py    From predictive-maintenance-using-machine-learning with Apache License 2.0 6 votes vote down vote up
def test_crosstab_with_empties(self):
        """Crosstab a 'count' aggregation over an all-NaN ``values`` column."""
        frame = pd.DataFrame({
            'a': [1, 2, 2, 2, 2],
            'b': [3, 3, 4, 4, 4],
            'c': [np.nan] * 5,
        })

        def build_expected(cells):
            # Both expected frames share the same labelled 2x2 layout.
            return pd.DataFrame(cells,
                                index=pd.Index([1, 2], name='a',
                                               dtype='int64'),
                                columns=pd.Index([3, 4], name='b'))

        # Every normalising mode is expected to give zeros in all cells.
        all_zero = build_expected([[0.0, 0.0], [0.0, 0.0]])
        for mode in (True, 'index', 'columns'):
            observed = pd.crosstab(frame.a, frame.b, values=frame.c,
                                   aggfunc='count', normalize=mode)
            tm.assert_frame_equal(all_zero, observed)

        # Without normalisation the (a=1, b=4) cell, which has no rows
        # in the input, is expected to be NaN rather than 0.
        with_gap = build_expected([[0.0, np.nan], [0.0, 0.0]])
        observed = pd.crosstab(frame.a, frame.b, values=frame.c,
                               aggfunc='count', normalize=False)
        tm.assert_frame_equal(with_gap, observed)
Example #2
Source File: test_pivot.py    From vnpy_crypto with MIT License 6 votes vote down vote up
def test_crosstab_ndarray(self):
        """Raw ndarrays with explicit names must match the Series-based result."""
        arr_a = np.random.randint(0, 5, size=100)
        arr_b = np.random.randint(0, 3, size=100)
        arr_c = np.random.randint(0, 10, size=100)

        frame = DataFrame({'a': arr_a, 'b': arr_b, 'c': arr_c})

        # 'a' on the rows, ('b', 'c') on the columns
        from_arrays = crosstab(arr_a, [arr_b, arr_c],
                               rownames=['a'], colnames=('b', 'c'))
        from_series = crosstab(frame['a'], [frame['b'], frame['c']])
        tm.assert_frame_equal(from_arrays, from_series)

        # same comparison with the two axes swapped
        from_arrays = crosstab([arr_b, arr_c], arr_a,
                               colnames=['a'], rownames=('b', 'c'))
        from_series = crosstab([frame['b'], frame['c']], frame['a'])
        tm.assert_frame_equal(from_arrays, from_series)

        # unnamed ndarray inputs receive the default axis names
        unnamed = crosstab(self.df['A'].values, self.df['C'].values)
        assert unnamed.index.name == 'row_0'
        assert unnamed.columns.name == 'col_0'
Example #3
Source File: contingency_tables.py    From vnpy_crypto with MIT License 6 votes vote down vote up
def from_data(cls, data, shift_zeros=True):
        """
        Build an instance of ``cls`` from raw two-column data.

        Parameters
        ----------
        data : array-like
            Raw observations.  The first column supplies the row factor
            of the contingency table and the second column supplies the
            column factor.  A DataFrame is indexed positionally; anything
            else is indexed as a 2-d array.
        shift_zeros : boolean
            Forwarded unchanged to ``cls``; intended to request a 0.5
            shift of the cells when the table contains zeros.

        Returns
        -------
        Whatever ``cls(table, shift_zeros)`` returns.
        """

        if isinstance(data, pd.DataFrame):
            row_factor, col_factor = data.iloc[:, 0], data.iloc[:, 1]
        else:
            row_factor, col_factor = data[:, 0], data[:, 1]
        return cls(pd.crosstab(row_factor, col_factor), shift_zeros)
Example #4
Source File: contingency_tables.py    From vnpy_crypto with MIT License 6 votes vote down vote up
def from_data(cls, data, shift_zeros=True):
        """
        Create a Table-like object of type ``cls`` from raw data.

        Parameters
        ----------
        data : array-like
            Raw observations; only the first two columns are used, and
            they are cross-tabulated against each other.
        shift_zeros : boolean
            Passed straight through to ``cls``; intended to request that
            0.5 be added to all values when any cell count is zero.

        Returns
        -------
        The object produced by ``cls(table, shift_zeros)``.
        """

        # ``.iloc`` gives a DataFrame the same positional ``[:, k]``
        # indexing that a plain 2-d array supports directly.
        indexer = data.iloc if isinstance(data, pd.DataFrame) else data
        table = pd.crosstab(indexer[:, 0], indexer[:, 1])

        return cls(table, shift_zeros)
Example #5
Source File: test_contingency_tables.py    From vnpy_crypto with MIT License 6 votes vote down vote up
def test_from_data(self):
        """StratifiedTable built from per-stratum tables must match from_data."""

        np.random.seed(241)
        frame = pd.DataFrame(index=range(100), columns=("v1", "v2", "strat"))
        frame["v1"] = np.random.randint(0, 2, 100)
        frame["v2"] = np.random.randint(0, 2, 100)
        # ten consecutive strata of ten rows each: 0,0,...,1,1,...,9
        frame["strat"] = np.kron(np.arange(10), np.ones(10))

        # one 2-way table per block of ten consecutive rows
        row_blocks = (np.arange(10*k, 10*(k+1)) for k in range(10))
        per_stratum = [pd.crosstab(frame.loc[rows, "v1"], frame.loc[rows, "v2"])
                       for rows in row_blocks]

        from_tables = ctab.StratifiedTable(per_stratum)
        from_frame = ctab.StratifiedTable.from_data("v1", "v2", "strat", frame)

        assert_equal(from_tables.summary().as_text(),
                     from_frame.summary().as_text())
Example #6
Source File: test_contingency_tables.py    From vnpy_crypto with MIT License 6 votes vote down vote up
def test_SquareTable_from_data():
    """SquareTable built from a table, a frame and an ndarray must agree."""

    np.random.seed(434)
    frame = pd.DataFrame(index=range(100), columns=["v1", "v2"])
    frame["v1"] = np.random.randint(0, 5, 100)
    frame["v2"] = np.random.randint(0, 5, 100)
    counts = pd.crosstab(frame["v1"], frame["v2"])

    from_table = ctab.SquareTable(counts)
    from_frame = ctab.SquareTable.from_data(frame)
    from_array = ctab.SquareTable(np.asarray(counts))

    # The three construction routes must render the same summary.
    for left, right in ((from_table, from_frame), (from_frame, from_array)):
        assert_equal(left.summary().as_text(),
                     right.summary().as_text())

    rendered = str(from_table)
    assert_equal(
        rendered.startswith('A 5x5 contingency table with counts:'), True)
    assert_equal(from_table.table[0, 0], 8.)
Example #7
Source File: test_mosaicplot.py    From vnpy_crypto with MIT License 6 votes vote down vote up
def test_mosaic_empty_cells():
    """Smoke test (see #2286): mosaic must cope with empty crosstab cells."""
    import pandas as pd

    # Row label -> value; key order is kept exactly as in the report.
    id2_by_row = {64: 'Angelica',
                  65: 'DXW_UID', 66: 'casuid01',
                  67: 'casuid01', 68: 'EC93_uid',
                  69: 'EC93_uid', 70: 'EC93_uid',
                  60: 'DXW_UID',  61: 'AtmosFox',
                  62: 'DXW_UID', 63: 'DXW_UID'}
    id1_by_row = {64: 'TGP',
                  65: 'Retention01', 66: 'default',
                  67: 'default', 68: 'Musa_EC_9_3',
                  69: 'Musa_EC_9_3', 70: 'Musa_EC_9_3',
                  60: 'default', 61: 'default',
                  62: 'default', 63: 'default'}
    mydata = pd.DataFrame({'id2': id2_by_row, 'id1': id1_by_row})

    # First from the unstacked contingency table ...
    counts = pd.crosstab(mydata.id1, mydata.id2)
    fig, vals = mosaic(counts.T.unstack())
    pylab.close('all')

    # ... then straight from the frame and the two column names.
    fig, vals = mosaic(mydata, ['id1','id2'])
    pylab.close('all')
Example #8
Source File: test_pivot.py    From vnpy_crypto with MIT License 6 votes vote down vote up
def test_crosstab_errors(self):
        """Invalid crosstab argument combinations raise ValueError (issue 12578)."""

        frame = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4],
                              'c': [1, 1, np.nan, 1, 1]})

        # (expected message, keyword arguments that must trigger it)
        bad_calls = [
            ('values cannot be used without an aggfunc.',
             {'values': frame.c}),
            ('aggfunc cannot be used without values',
             {'aggfunc': np.mean}),
            ('Not a valid normalize argument',
             {'normalize': '42'}),
            ('Not a valid normalize argument',
             {'normalize': 42}),
            ('Not a valid margins argument',
             {'normalize': 'all', 'margins': 42}),
        ]

        for message, kwargs in bad_calls:
            with tm.assert_raises_regex(ValueError, message):
                pd.crosstab(frame.a, frame.b, **kwargs)
Example #9
Source File: test_pivot.py    From predictive-maintenance-using-machine-learning with Apache License 2.0 6 votes vote down vote up
def test_crosstab_errors(self):
        """Invalid crosstab argument combinations raise ValueError (issue 12578)."""

        frame = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4],
                              'c': [1, 1, np.nan, 1, 1]})

        # (expected message, keyword arguments that must trigger it)
        bad_calls = [
            ('values cannot be used without an aggfunc.',
             {'values': frame.c}),
            ('aggfunc cannot be used without values',
             {'aggfunc': np.mean}),
            ('Not a valid normalize argument',
             {'normalize': '42'}),
            ('Not a valid normalize argument',
             {'normalize': 42}),
            ('Not a valid margins argument',
             {'normalize': 'all', 'margins': 42}),
        ]

        for message, kwargs in bad_calls:
            with pytest.raises(ValueError, match=message):
                pd.crosstab(frame.a, frame.b, **kwargs)
Example #10
Source File: decisionTree.py    From statistical_learning with Apache License 2.0 6 votes vote down vote up
def SplitData(self, df):
        """
        Choose the feature column to split on and partition ``df`` by it.

        The last column of ``df`` is treated as the label and all other
        columns as features.  Each feature is cross-tabulated against the
        label and scored with ``self.calg``; the label counts are scored
        with ``self.calH`` (the label entropy, per the original comment).
        The two scores are combined according to ``self.method``
        ("ID3" or "C4.5") into one gain value per feature.

        Returns
        -------
        None when the largest gain is below ``self.eps``; otherwise a
        generator of ``(column_name, group_key, sub_frame)`` tuples, one
        per distinct value of the chosen column, where ``sub_frame`` has
        that column dropped.
        """
        features = df.iloc[:, :-1]
        target = df.iloc[:, -1]
        # contingency tables (feature value x label), built lazily
        tables = (pd.crosstab(features.iloc[:, col], target)
                  for col in range(features.columns.size))
        class_counts = target.groupby(target).count()
        label_score = self.calH(class_counts)
        feature_scores = [self.calg(table) for table in tables]
        # NOTE(review): no branch for any other method value, in which
        # case ``g`` stays unbound below - confirm callers validate it.
        if self.method == "ID3":
            g = label_score-feature_scores
        elif self.method == "C4.5":
            g = 1-feature_scores/label_score
        if g.max() < self.eps:
            return None
        # position of the winning feature, then its column name
        best = g.argmax()
        name = df.columns[best]
        partitions = df.groupby(df.iloc[:, best])
        return ((name, key, part.drop(name, axis=1))
                for key, part in partitions)
Example #11
Source File: test_replication_kw_97.py    From respy with MIT License 6 votes vote down vote up
def test_distribution_of_lagged_choices():
    """Simulated first-period lagged-choice shares must be close to the data."""
    params, options, actual_df = rp.get_example_model("kw_97_extended")

    options["n_periods"] = 1
    options["simulated_agents"] = 10_000

    simulate = rp.get_simulate_func(params, options)
    simulated_df = simulate(params)

    def first_period_shares(frame):
        # Column-normalised table of lagged choice by schooling experience,
        # restricted to period 0.
        first = frame.query("Period == 0")
        return pd.crosstab(
            first.Lagged_Choice_1, first.Experience_School, normalize="columns"
        )

    expected = first_period_shares(actual_df)
    calculated = first_period_shares(simulated_df)

    # Absolute tolerance of 0.04 on every share; small subsets are noisy.
    np.testing.assert_allclose(expected, calculated, atol=0.04)
Example #12
Source File: test_pivot.py    From recruit with Apache License 2.0 6 votes vote down vote up
def test_crosstab_errors(self):
        """Check the ValueError messages for bad crosstab arguments (issue 12578)."""

        frame = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4],
                              'c': [1, 1, np.nan, 1, 1]})

        def expect_value_error(message, **kwargs):
            # The call must raise ValueError whose text matches ``message``.
            with pytest.raises(ValueError, match=message):
                pd.crosstab(frame.a, frame.b, **kwargs)

        # values and aggfunc are only valid together
        expect_value_error('values cannot be used without an aggfunc.',
                           values=frame.c)
        expect_value_error('aggfunc cannot be used without values',
                           aggfunc=np.mean)

        # normalize rejects unknown strings and non-boolean numbers
        expect_value_error('Not a valid normalize argument', normalize='42')
        expect_value_error('Not a valid normalize argument', normalize=42)

        # margins is validated when normalising
        expect_value_error('Not a valid margins argument',
                           normalize='all', margins=42)
Example #13
Source File: test_pivot.py    From recruit with Apache License 2.0 6 votes vote down vote up
def test_crosstab_with_empties(self):
        """Counting an all-NaN ``values`` column: zeros when normalised, NaN gap otherwise."""
        nan = np.nan
        data = pd.DataFrame({'a': [1, 2, 2, 2, 2],
                             'b': [3, 3, 4, 4, 4],
                             'c': [nan, nan, nan, nan, nan]})

        row_labels = pd.Index([1, 2], name='a', dtype='int64')
        col_labels = pd.Index([3, 4], name='b')

        # Normalised tables are expected to be zero in every cell.
        zeros = pd.DataFrame([[0.0, 0.0], [0.0, 0.0]],
                             index=row_labels, columns=col_labels)
        for normalize in [True, 'index', 'columns']:
            got = pd.crosstab(data.a, data.b, values=data.c,
                              aggfunc='count', normalize=normalize)
            tm.assert_frame_equal(zeros, got)

        # Unnormalised: the combination a=1, b=4 never occurs in the
        # input and is expected to show up as NaN.
        gapped = pd.DataFrame([[0.0, nan], [0.0, 0.0]],
                              index=pd.Index([1, 2], name='a',
                                             dtype='int64'),
                              columns=pd.Index([3, 4], name='b'))
        got = pd.crosstab(data.a, data.b, values=data.c,
                          aggfunc='count', normalize=False)
        tm.assert_frame_equal(gapped, got)
Example #14
Source File: contingency_tables.py    From Splunking-Crime with GNU Affero General Public License v3.0 6 votes vote down vote up
def from_data(cls, data, shift_zeros=True):
        """
        Alternate constructor: cross-tabulate raw data into a table.

        Parameters
        ----------
        data : array-like
            Raw observations.  Only the first two columns are read; the
            contingency table counts their joint occurrences.
        shift_zeros : boolean
            Handed to ``cls`` untouched; intended to request that 0.5 be
            added to all values when any cell count is zero.

        Returns
        -------
        The result of ``cls(table, shift_zeros)``.
        """

        is_frame = isinstance(data, pd.DataFrame)
        # DataFrames need positional access via ``.iloc``.
        first = data.iloc[:, 0] if is_frame else data[:, 0]
        second = data.iloc[:, 1] if is_frame else data[:, 1]
        table = pd.crosstab(first, second)

        return cls(table, shift_zeros)
Example #15
Source File: contingency_tables.py    From Splunking-Crime with GNU Affero General Public License v3.0 6 votes vote down vote up
def from_data(cls, data, shift_zeros=True):
        """
        Construct an instance of ``cls`` from two columns of raw data.

        Parameters
        ----------
        data : array-like
            The raw data; column 0 defines the table rows and column 1
            defines the table columns.
        shift_zeros : boolean
            Forwarded as-is to ``cls``; intended to request a 0.5 shift
            of the cells when the contingency table contains zeros.

        Returns
        -------
        Whatever ``cls(table, shift_zeros)`` produces.
        """

        if not isinstance(data, pd.DataFrame):
            # plain 2-d array: slice the columns directly
            return cls(pd.crosstab(data[:, 0], data[:, 1]), shift_zeros)
        # DataFrame: select the columns by position
        return cls(pd.crosstab(data.iloc[:, 0], data.iloc[:, 1]), shift_zeros)
Example #16
Source File: test_pivot.py    From predictive-maintenance-using-machine-learning with Apache License 2.0 6 votes vote down vote up
def test_crosstab_ndarray(self):
        """Named ndarray inputs and the matching Series must tabulate identically."""
        a = np.random.randint(0, 5, size=100)
        b = np.random.randint(0, 3, size=100)
        c = np.random.randint(0, 10, size=100)

        frame = DataFrame({'a': a, 'b': b, 'c': c})
        ser_a, ser_b, ser_c = frame['a'], frame['b'], frame['c']

        # single row key, two column keys
        tm.assert_frame_equal(
            crosstab(a, [b, c], rownames=['a'], colnames=('b', 'c')),
            crosstab(ser_a, [ser_b, ser_c]))

        # two row keys, single column key
        tm.assert_frame_equal(
            crosstab([b, c], a, colnames=['a'], rownames=('b', 'c')),
            crosstab([ser_b, ser_c], ser_a))

        # with no names supplied, the axes get the default names
        anonymous = crosstab(self.df['A'].values, self.df['C'].values)
        assert anonymous.index.name == 'row_0'
        assert anonymous.columns.name == 'col_0'
Example #17
Source File: evaluate.py    From toad with MIT License 6 votes vote down vote up
def crosstab_data(columns_var, row_var, data,unique_num,*args):
    """
    Cross-tabulate two variables of ``data`` and relabel binned axes.

    Parameters
    ----------
    columns_var, row_var :
        Identifiers of the variables placed on the columns and rows;
        each is handed to ``merger_data`` together with ``data``.
    data :
        The data set passed to ``merger_data``.
    unique_num :
        Passed to ``merger_data`` for both variables.
    *args :
        Four positional extras: ``args[0]`` and ``args[1]`` go to
        ``merger_data`` for the column and row variable respectively;
        ``args[2]`` and ``args[3]`` go to ``rename_columns`` for the
        column and row labels respectively.

    Returns
    -------
    pandas.DataFrame
        The crosstab of the row data against the column data with an
        ``'All'`` margin.  When ``merger_data`` reports bins for an axis,
        that axis's labels (except ``'All'``) are replaced by the output
        of ``rename_columns``.
    """
    columns_data, _, columns_bins = merger_data(data, columns_var, unique_num, args[0])
    row_data, _, row_bins = merger_data(data, row_var, unique_num, args[1])
    result = pd.crosstab(row_data, columns_data, margins=True, dropna=False)
    if columns_bins is not None:
        columns = result.columns.tolist()
        columns.remove('All')
        columns_bins_list = rename_columns(columns, columns_bins, args[2])
        columns_bins_list.append('All')
        # Assign the labels directly: ``set_axis(..., inplace=True)`` was
        # removed in pandas 2.0, while attribute assignment works on every
        # pandas version and has the same in-place effect.
        result.columns = columns_bins_list
    if row_bins is not None:
        index = result.index.tolist()
        index.remove('All')
        index_bins_list = rename_columns(index, row_bins, args[3])
        index_bins_list.append('All')
        result.index = index_bins_list
    return result


# Write the binning tables and plots of all high-IV variables to Excel
Example #18
Source File: crosstabs.py    From audit-ai with MIT License 6 votes vote down vote up
def crosstab_df(labels, decisions):
    """
    Count how often each label value occurs with each decision value.

    Parameters
    ------------
    labels : array_like
        categorical values such as ['M', 'F']; these become the rows
    decisions : array_like
        boolean / binary values; these become the columns

    Returns
    --------
    crosstab : pandas.DataFrame
        the table of counts, e.g.
                    False True
        TopGroup       5    4
        BottomGroup    3    4
        whose values are array([[5, 4], [3, 4]])
    """
    # Wrap both inputs in Series before tabulating them.
    row_values = pd.Series(labels)
    col_values = pd.Series(decisions)
    return pd.crosstab(row_values, col_values)
Example #19
Source File: test_pivot.py    From recruit with Apache License 2.0 6 votes vote down vote up
def test_crosstab_ndarray(self):
        """crosstab on bare ndarrays agrees with crosstab on named Series."""
        a = np.random.randint(0, 5, size=100)
        b = np.random.randint(0, 3, size=100)
        c = np.random.randint(0, 10, size=100)

        named = DataFrame({'a': a, 'b': b, 'c': c})

        # Each case: (result from arrays + explicit names, Series reference)
        comparisons = [
            (crosstab(a, [b, c], rownames=['a'], colnames=('b', 'c')),
             crosstab(named['a'], [named['b'], named['c']])),
            (crosstab([b, c], a, colnames=['a'], rownames=('b', 'c')),
             crosstab([named['b'], named['c']], named['a'])),
        ]
        for result, expected in comparisons:
            tm.assert_frame_equal(result, expected)

        # default names are assigned when none are given
        result = crosstab(self.df['A'].values, self.df['C'].values)
        assert (result.index.name, result.columns.name) == ('row_0', 'col_0')
Example #20
Source File: test_pivot.py    From vnpy_crypto with MIT License 6 votes vote down vote up
def test_crosstab_with_empties(self):
        """Check crosstab's handling of a ``values`` column that is entirely NaN."""
        source = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4],
                               'c': [np.nan for _ in range(5)]})

        # Keyword arguments common to every crosstab call below.
        common = dict(values=source.c, aggfunc='count')

        expected_when_normalised = pd.DataFrame(
            [[0.0, 0.0], [0.0, 0.0]],
            index=pd.Index([1, 2], name='a', dtype='int64'),
            columns=pd.Index([3, 4], name='b'))

        for setting in [True, 'index', 'columns']:
            tm.assert_frame_equal(
                expected_when_normalised,
                pd.crosstab(source.a, source.b, normalize=setting, **common))

        # The pair (a=1, b=4) does not occur in the input; without
        # normalisation its cell is expected to be NaN.
        expected_raw = pd.DataFrame(
            [[0.0, np.nan], [0.0, 0.0]],
            index=pd.Index([1, 2], name='a', dtype='int64'),
            columns=pd.Index([3, 4], name='b'))

        tm.assert_frame_equal(
            expected_raw,
            pd.crosstab(source.a, source.b, normalize=False, **common))
Example #21
Source File: test_pivot.py    From predictive-maintenance-using-machine-learning with Apache License 2.0 5 votes vote down vote up
def test_crosstab_with_numpy_size(self):
        """crosstab with ``aggfunc=np.size`` and margins (GH 4003)."""
        frame = pd.DataFrame({
            'A': ['one', 'one', 'two', 'three'] * 6,
            'B': ['A', 'B', 'C'] * 8,
            'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 4,
            'D': np.random.randn(24),
            'E': np.random.randn(24),
        })
        result = pd.crosstab(index=[frame['A'], frame['B']],
                             columns=[frame['C']],
                             margins=True,
                             aggfunc=np.size,
                             values=frame['D'])

        # Rows: every (A, B) pair for 'one', 'three', 'two', then the
        # ('All', '') margin row.
        expected_index = pd.MultiIndex(
            levels=[['All', 'one', 'three', 'two'], ['', 'A', 'B', 'C']],
            codes=[[1, 1, 1, 2, 2, 2, 3, 3, 3, 0],
                   [1, 2, 3, 1, 2, 3, 1, 2, 3, 0]],
            names=['A', 'B'])
        expected_column = pd.Index(['bar', 'foo', 'All'],
                                   dtype='object',
                                   name='C')

        nan = np.nan
        body_rows = ([[2., 2., 4.]] * 3
                     + [[2., nan, 2.], [nan, 2., 2.]] * 3)
        margin_row = [[12., 12., 24.]]
        expected = pd.DataFrame(np.array(body_rows + margin_row),
                                index=expected_index,
                                columns=expected_column)
        tm.assert_frame_equal(result, expected)
Example #22
Source File: _histograms.py    From epiScanpy with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def cluster_composition(adata, cluster, condition, xlabel='cell cluster',
                        ylabel='cell count', title=None, save=False):
    """
    Bar-plot, for each value of ``condition``, the cell counts per ``cluster``.

    ``adata.obs[condition]`` is cross-tabulated against
    ``adata.obs[cluster]``; one ``plt.bar`` series is drawn per condition
    value (in sorted order) over the sorted cluster values, with a legend
    naming the conditions.  The figure is always shown with ``plt.show()``.

    ``save`` controls writing the figure to disk: ``False`` writes nothing;
    ``True``, or a string whose text after the last '.' is neither 'png'
    nor 'pdf', writes 'cluster_composition.png'; any other string is
    appended to 'cluster_composition_' to form the file name.
    """
    obs = adata.obs
    contingency_table = pd.crosstab(obs[condition], obs[cluster], margins=True)

    categories = sorted(list(set(obs[cluster])))
    conditions = sorted(set(obs[condition]))

    # One bar series per condition row; the slice drops the trailing
    # margin column that ``margins=True`` adds.
    bar_sets = [
        plt.bar(categories, contingency_table.iloc[row][0:-1].values)
        for row, _ in enumerate(conditions)
    ]

    # Legend handles are the first bar of every series.
    handles = tuple([bars[0] for bars in bar_sets])
    plt.legend(handles, tuple(conditions))
    plt.xlabel(xlabel, )
    plt.ylabel(ylabel)
    plt.title(title)

    if save != False:
        use_default_name = (save == True) or (
            save.split('.')[-1] not in ['png', 'pdf'])
        if use_default_name:
            filename = 'cluster_composition.png'
        else:
            filename = '_'.join(['cluster_composition', save])
        plt.savefig(filename, dpi=300, bbox_inches="tight")

    plt.show()
Example #23
Source File: random_forest.py    From Speculator with MIT License 5 votes vote down vote up
def confusion_matrix(self, actual, preds):
        """Cross-tabulate actual values (rows, '(A)') against predictions (columns, '(P)')."""
        axis_names = {'rownames': ['(A)'], 'colnames': ['(P)']}
        return crosstab(actual, preds, **axis_names)
Example #24
Source File: NaiveBayes.py    From statistical_learning with Apache License 2.0 5 votes vote down vote up
def __init__(self, data, lam=0):
        """
        Estimate the label and per-feature frequency tables from ``data``.

        ``data`` is wrapped in a DataFrame whose last column (looked up by
        the integer label ``n_columns - 1``) holds the class label and whose
        other columns hold the features.  ``lam`` is added to every count
        before normalising.

        Sets ``self.y_p`` to the normalised label counts and ``self.cb`` to
        a list with one table per feature: the feature-value x label counts,
        each label column divided by its own sum.
        """
        frame = pd.DataFrame(data)
        label_key = frame.shape[1] - 1
        labels = frame[label_key]

        # label frequencies
        self.y_p = labels.groupby(labels).count()+lam
        self.y_p /= self.y_p.sum()

        # per-feature tables, one for each non-label column
        self.cb = []
        for feature_key in range(label_key):
            counts = pd.crosstab(frame[feature_key], labels)+lam
            self.cb.append(counts/counts.sum())
Example #25
Source File: test_pivot.py    From predictive-maintenance-using-machine-learning with Apache License 2.0 5 votes vote down vote up
def test_crosstab_unsorted_order(self):
        """Unsorted input labels come out sorted on both crosstab axes."""
        frame = pd.DataFrame({"b": [3, 1, 2], 'a': [5, 4, 6]},
                             index=['C', 'A', 'B'])

        # Each index label pairs with exactly one (b, a) combination, so
        # the sorted table is expected to be the identity pattern.
        expected = pd.DataFrame(
            [[1, 0, 0], [0, 1, 0], [0, 0, 1]],
            index=pd.Index(['A', 'B', 'C'], name='row_0'),
            columns=pd.MultiIndex.from_tuples([(1, 4), (2, 6), (3, 5)],
                                              names=['b', 'a']))

        result = pd.crosstab(frame.index, [frame.b, frame.a])
        tm.assert_frame_equal(result, expected)
Example #26
Source File: test_pivot.py    From predictive-maintenance-using-machine-learning with Apache License 2.0 5 votes vote down vote up
def test_crosstab_tuple_name(self, names):
        """Series names taken from ``names`` must carry over to the crosstab axes."""
        row_values, col_values = range(3), range(1, 4)
        s1 = pd.Series(row_values, name=names[0])
        s2 = pd.Series(col_values, name=names[1])

        # Reference: a Series of ones on the paired index, unstacked on
        # the second level with missing cells filled by 0.
        paired = pd.MultiIndex.from_arrays([row_values, col_values],
                                           names=names)
        ones = pd.Series(1, index=paired)
        expected = ones.unstack(1, fill_value=0)

        tm.assert_frame_equal(pd.crosstab(s1, s2), expected)
Example #27
Source File: test_pivot.py    From predictive-maintenance-using-machine-learning with Apache License 2.0 5 votes vote down vote up
def test_crosstab_dup_index_names(self):
        """Crosstab of a Series against itself, both axes named 'foo' (GH 13279)."""
        series = pd.Series(range(3), name='foo')
        axis = pd.Index(range(3), name='foo')

        # Each value only ever pairs with itself, so the expected counts
        # form an identity matrix.
        expected = pd.DataFrame(np.identity(3, dtype=np.int64),
                                index=axis,
                                columns=axis)

        tm.assert_frame_equal(pd.crosstab(series, series), expected)
Example #28
Source File: metrics.py    From reportgen with MIT License 5 votes vote down vote up
def info_value(X, y, bins='auto'):
    """Compute the information value (IV) between ``X`` and the labels ``y``.

    ``X`` is optionally discretised with ``pd.cut(X, bins)`` and then
    cross-tabulated against ``y``.

    * Exactly two label classes: each label column of the table is
      normalised to sum to 1, giving shares ``p_k`` (first column) and
      ``q_k`` (second column) for every bin ``k``, and
      ``IV = sum_k (p_k - q_k) * log2(p_k / q_k)``.
    * Otherwise: the same quantity is computed one-vs-rest for every
      class, and the per-class values are summed, each weighted by that
      class's share of all observations.

    Parameters
    ----------
    X : array-like
        The variable to evaluate.
    y : array-like
        The class labels, aligned with ``X``.
    bins : optional
        Bin specification handed to ``pd.cut``; ``None`` skips binning
        and uses the values of ``X`` as categories.
        NOTE(review): the default ``'auto'`` is not a bin specification
        documented for ``pd.cut`` - confirm callers always pass one.

    Returns
    -------
    float
        The information value.
    """
    # An unused block of per-class quantile thresholds used to be computed
    # here; it never affected the result and failed on non-numeric ``X``,
    # so it was removed.
    if bins is not None:
        X = pd.cut(X, bins)
    ctable = pd.crosstab(X, y)
    # share of all observations falling in each label class
    p = ctable.sum() / ctable.sum().sum()

    if ctable.shape[1] == 2:
        ctable = ctable / ctable.sum()
        first, second = ctable.iloc[:, 0], ctable.iloc[:, 1]
        return ((first - second) * np.log2(first / second)).sum()

    IV = 0
    for cc in ctable.columns:
        # one-vs-rest table: this class against the sum of all the others
        rest = ctable.loc[:, ~(ctable.columns == cc)].sum(axis=1)
        ctable_bin = pd.concat([ctable[cc], rest], axis=1)
        ctable_bin = ctable_bin / ctable_bin.sum()
        first, second = ctable_bin.iloc[:, 0], ctable_bin.iloc[:, 1]
        IV_bin = ((first - second) * np.log2(first / second)).sum()
        IV += IV_bin * p[cc]
    return IV



# Compute the entropy of a discrete random variable
Example #29
Source File: test_pivot.py    From predictive-maintenance-using-machine-learning with Apache License 2.0 5 votes vote down vote up
def test_crosstab_no_overlap(self):
        """Two Series whose indexes share no labels give an empty crosstab."""
        # GS 10291

        left = pd.Series([1, 2, 3], index=[1, 2, 3])
        right = pd.Series([4, 5, 6], index=[4, 5, 6])

        # No index label is common to both inputs, so the expected
        # result is an empty frame.
        tm.assert_frame_equal(crosstab(left, right), pd.DataFrame())
Example #30
Source File: test_pivot.py    From predictive-maintenance-using-machine-learning with Apache License 2.0 5 votes vote down vote up
def test_crosstab_dropna(self):
        """``dropna=False`` keeps column combinations that never occur (GH 3820)."""
        a = np.array(['foo', 'foo', 'foo', 'bar',
                      'bar', 'foo', 'foo'], dtype=object)
        b = np.array(['one', 'one', 'two', 'one',
                      'two', 'two', 'two'], dtype=object)
        c = np.array(['dull', 'dull', 'dull', 'dull',
                      'dull', 'shiny', 'shiny'], dtype=object)

        # ('one', 'shiny') never occurs in the data but must still be
        # present among the columns.
        expected_columns = MultiIndex.from_tuples(
            [('one', 'dull'), ('one', 'shiny'),
             ('two', 'dull'), ('two', 'shiny')],
            names=['b', 'c'])

        table = pd.crosstab(a, [b, c], rownames=['a'],
                            colnames=['b', 'c'], dropna=False)
        tm.assert_index_equal(table.columns, expected_columns)