Python pandas.api.types.is_categorical_dtype() Examples

The following are 17 code examples of pandas.api.types.is_categorical_dtype(), drawn from open-source projects. Each example is shown with its original project, source file, and license. You may also want to check out all available functions and classes of the pandas.api.types module.
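Before the project examples, here is a minimal sketch of the function's behavior: it accepts a Series, an Index, or a plain dtype, and returns whether it is categorical. Note that is_categorical_dtype is deprecated as of pandas 2.1 in favor of an isinstance check against pd.CategoricalDtype, shown at the end of the snippet.

import pandas as pd
from pandas.api.types import is_categorical_dtype

s = pd.Series(["a", "b", "a"], dtype="category")
print(is_categorical_dtype(s))         # True: works on a Series
print(is_categorical_dtype(s.dtype))   # True: also works on a dtype
print(is_categorical_dtype(pd.Series([1, 2, 3])))  # False: plain int64

# Deprecated since pandas 2.1; the recommended replacement is:
print(isinstance(s.dtype, pd.CategoricalDtype))    # True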
Example #1
Source File: data.py    From dask-ml with BSD 3-Clause "New" or "Revised" License
def _fit(self, X: DataFrameType):
    if self.columns is None:
        columns = X.select_dtypes(include=["object", "category"]).columns
    else:
        columns = self.columns
    categories = {}
    for name in columns:
        col = X[name]
        if not is_categorical_dtype(col):
            # This shouldn't ever be hit on a dask.array, since
            # the object columns would have been converted to known cats
            # already
            col = pd.Series(col, index=X.index).astype("category")

        if _HAS_CTD:
            categories[name] = col.dtype
        else:
            categories[name] = (col.cat.categories, col.cat.ordered)

    return columns, categories
Example #2
Source File: zarr.py    From anndata with BSD 3-Clause "New" or "Revised" License
def write_series(group, key, series, dataset_kwargs=MappingProxyType({})):
    if series.dtype == object:
        group.create_dataset(
            key,
            shape=series.shape,
            dtype=object,
            object_codec=numcodecs.VLenUTF8(),
            **dataset_kwargs,
        )
        group[key][:] = series.values
    elif is_categorical_dtype(series):
        # This should work for categorical Index and Series
        categorical: pd.Categorical = series.values
        categories: np.ndarray = categorical.categories.values
        codes: np.ndarray = categorical.codes
        category_key = f"__categories/{key}"

        write_array(group, category_key, categories, dataset_kwargs=dataset_kwargs)
        write_array(group, key, codes, dataset_kwargs=dataset_kwargs)

        group[key].attrs["categories"] = category_key
        # Must coerce np.bool_ to bool for json writing
        group[category_key].attrs["ordered"] = bool(categorical.ordered)
    else:
        group[key] = series.values 
Example #3
Source File: h5ad.py    From anndata with BSD 3-Clause "New" or "Revised" License
def write_series(group, key, series, dataset_kwargs=MappingProxyType({})):
    # group here is an h5py type, otherwise categoricals won’t write
    if series.dtype == object:  # Assuming it’s string
        group.create_dataset(
            key,
            data=series.values,
            dtype=h5py.special_dtype(vlen=str),
            **dataset_kwargs,
        )
    elif is_categorical_dtype(series):
        # This should work for categorical Index and Series
        categorical: pd.Categorical = series.values
        categories: np.ndarray = categorical.categories.values
        codes: np.ndarray = categorical.codes
        category_key = f"__categories/{key}"

        write_array(group, category_key, categories, dataset_kwargs=dataset_kwargs)
        write_array(group, key, codes, dataset_kwargs=dataset_kwargs)

        group[key].attrs["categories"] = group[category_key].ref
        group[category_key].attrs["ordered"] = categorical.ordered
    else:
        group[key] = series.values 
Example #4
Source File: test_concatenate.py    From anndata with BSD 3-Clause "New" or "Revised" License
def fix_known_differences(orig, result):
    """
    Helper function for reducing AnnData objects to only the elements we
    expect to be equivalent after concatenation.

    Only for the case where orig is the ground truth result of what concatenation should be.
    """
    orig = orig.copy()
    result = result.copy()

    result.obs.drop(columns=["batch"], inplace=True)
    result.strings_to_categoricals()  # Should this be implicit in concatenation?

    # TODO
    # * merge varm, varp similar to uns
    # * merge obsp, but some information should be lost
    del orig.varm
    del orig.varp
    del orig.obsp  # TODO

    # Possibly need to fix this, ordered categoricals lose orderedness
    for k, dtype in orig.obs.dtypes.items():
        if is_categorical_dtype(dtype) and dtype.ordered:
            result.obs[k] = result.obs[k].astype(dtype)

    return orig, result 
Example #5
Source File: test_data.py    From dask-ml with BSD 3-Clause "New" or "Revised" License
def test_dask(self):
    a = dd.from_pandas(raw, npartitions=2)
    ce = dpp.Categorizer()
    trn = ce.fit_transform(a)
    assert is_categorical_dtype(trn["A"])
    assert is_categorical_dtype(trn["B"])
    assert is_categorical_dtype(trn["C"])
    assert trn["D"].dtype == np.dtype("int64")
    tm.assert_index_equal(ce.columns_, pd.Index(["A", "B", "C"]))
Example #6
Source File: test_data.py    From dask-ml with BSD 3-Clause "New" or "Revised" License
def test_ce(self):
    ce = dpp.Categorizer()
    original = raw.copy()
    trn = ce.fit_transform(raw)
    assert is_categorical_dtype(trn["A"])
    assert is_categorical_dtype(trn["B"])
    assert is_categorical_dtype(trn["C"])
    assert trn["D"].dtype == np.dtype("int64")
    tm.assert_index_equal(ce.columns_, pd.Index(["A", "B", "C"]))
    tm.assert_frame_equal(raw, original)
Example #7
Source File: data.py    From dask-ml with BSD 3-Clause "New" or "Revised" License
def fit(
    self, X: DataFrameType, y: Optional[Union[ArrayLike, SeriesType]] = None
) -> "OrdinalEncoder":
    """Determine the categorical columns to be encoded.

    Parameters
    ----------
    X : pandas.DataFrame or dask.dataframe.DataFrame
    y : ignored

    Returns
    -------
    self
    """
    self.columns_ = X.columns
    columns = self.columns
    if columns is None:
        columns = X.select_dtypes(include=["category"]).columns
    else:
        for column in columns:
            assert is_categorical_dtype(X[column]), "Must be categorical"

    self.categorical_columns_ = columns
    self.non_categorical_columns_ = X.columns.drop(self.categorical_columns_)

    if _HAS_CTD:
        self.dtypes_ = {col: X[col].dtype for col in self.categorical_columns_}
    else:
        self.dtypes_ = {
            col: (X[col].cat.categories, X[col].cat.ordered)
            for col in self.categorical_columns_
        }

    return self
Example #8
Source File: analysis.py    From reportgen with MIT License
def describe(data):
    '''
    Generate summary statistics for each variable.
    For every variable, the following fields are produced:
        dtype:
        max / most frequent value:
        min / least frequent value:
        mean / middle-frequency value:
        missing rate:
        std / number of unique values:
    '''

    data=pd.DataFrame(data)
    n_sample=len(data)
    var_type=type_of_var(data,copy=True)
    summary=pd.DataFrame(columns=data.columns,index=['dtype','max','min','mean','missing_pct','std/nunique'])
    for c in data.columns:
        missing_pct=1-data[c].count()/n_sample
        if var_type[c] == 'number':
            max_value,min_value,mean_value=data[c].max(),data[c].min(),data[c].mean()
            std_value=data[c].std()
            summary.loc[:,c]=[var_type[c],max_value,min_value,mean_value,missing_pct,std_value]
        elif var_type[c] == 'category' or is_categorical_dtype(data[c].dtype):
            tmp=data[c].value_counts()
            # idxmax/idxmin return the labels with the highest/lowest counts
            # (Series.argmax/argmin now return positions instead)
            max_value,min_value=tmp.idxmax(),tmp.idxmin()
            mean_value_index=tmp[tmp==tmp.median()].index
            mean_value=mean_value_index[0] if len(mean_value_index)>0 else np.nan
            summary.loc[:,c]=[var_type[c],max_value,min_value,mean_value,missing_pct,len(tmp)]
        elif var_type[c] == 'datetime':
            max_value,min_value=data[c].max(),data[c].min()
            summary.loc[:,c]=[var_type[c],max_value,min_value,np.nan,missing_pct,np.nan]
        else:
            summary.loc[:,c]=[var_type[c],np.nan,np.nan,np.nan,missing_pct,np.nan]
    return summary
Example #9
Source File: common.py    From plydata with BSD 3-Clause "New" or "Revised" License
def _add_group_columns(data, gdf):
    """
    Add group columns to data with a value from the grouped dataframe

    It is assumed that the grouped dataframe contains a single group

    >>> data = pd.DataFrame({
    ...     'x': [5, 6, 7]})
    >>> gdf = GroupedDataFrame({
    ...     'g': list('aaa'),
    ...     'x': range(3)}, groups=['g'])
    >>> _add_group_columns(data, gdf)
       g  x
    0  a  5
    1  a  6
    2  a  7
    """
    n = len(data)
    if isinstance(gdf, GroupedDataFrame):
        for i, col in enumerate(gdf.plydata_groups):
            if col not in data:
                group_values = [gdf[col].iloc[0]] * n
                # Need to be careful and maintain the dtypes
                # of the group columns
                if pdtypes.is_categorical_dtype(gdf[col]):
                    col_values = pd.Categorical(
                        group_values,
                        categories=gdf[col].cat.categories,
                        ordered=gdf[col].cat.ordered
                    )
                else:
                    col_values = pd.Series(
                        group_values,
                        index=data.index,
                        dtype=gdf[col].dtype
                    )
                # Group columns come first
                data.insert(i, col, col_values)
    return data 
Example #10
Source File: density.py    From plotnine with GNU General Public License v2.0
def get_var_type(col):
    """
    Return var_type (for KDEMultivariate) of the column

    Parameters
    ----------
    col : pandas.Series
        A dataframe column.

    Returns
    -------
    out : str
        One of ['c', 'o', 'u'].

    See Also
    --------
    The origin of the character codes is
    :class:`statsmodels.nonparametric.kernel_density.KDEMultivariate`.
    """
    if pdtypes.is_numeric_dtype(col):
        # continuous
        return 'c'
    elif pdtypes.is_categorical_dtype(col):
        # ordered or unordered
        return 'o' if col.cat.ordered else 'u'
    else:
        # unordered if unsure, e.g. string columns that
        # are not categorical
        return 'u' 
Example #11
Source File: utils.py    From plotnine with GNU General Public License v2.0
def _id_var(x, drop=False):
    """
    Assign ids to items in x. If two items
    are the same, they get the same id.

    Parameters
    ----------
    x : array-like
        items to associate ids with
    drop : bool
        Whether to drop unused factor levels
    """
    if len(x) == 0:
        return []

    categorical = pdtypes.is_categorical_dtype(x)

    if categorical:
        if drop:
            x = x.cat.remove_unused_categories()
            lst = list(x.cat.codes + 1)
        else:
            has_nan = any(np.isnan(i) for i in x if isinstance(i, float))
            if has_nan:
                # NaNs are -1, we give them the highest code
                nan_code = -1
                new_nan_code = np.max(x.cat.codes) + 1
                lst = [val if val != nan_code else new_nan_code for val in x]
            else:
                lst = list(x.cat.codes + 1)
    else:
        try:
            levels = np.sort(np.unique(x))
        except TypeError:
            # x probably has NaNs
            levels = multitype_sort(set(x))

        lst = match(x, levels)
        lst = [item + 1 for item in lst]

    return lst 
Example #12
Source File: validation.py    From PandasSchema with GNU General Public License v3.0
def get_errors(self, series: pd.Series, column: 'column.Column'):

    errors = []

    # Calculate which rows are valid using the child class's validate
    # function, skipping empty entries if the column specifies to do so
    simple_validation = ~self.validate(series)
    if column.allow_empty:
        # Failing results are those that are not empty, and fail the validation
        # explicitly check to make sure the series isn't a category because
        # issubdtype will FAIL if it is
        if is_categorical_dtype(series) or is_numeric_dtype(series):
            validated = ~series.isnull() & simple_validation
        else:
            validated = (series.str.len() > 0) & simple_validation
    else:
        validated = simple_validation

    # Cut down the original series to only ones that failed the validation
    indices = series.index[validated]

    # Use these indices to find the failing items. Also print the index,
    # which is probably a row number
    for i in indices:
        element = series[i]
        errors.append(ValidationWarning(
            message=self.message,
            value=element,
            row=i,
            column=series.name
        ))

    return errors
Example #13
Source File: test_dataframe.py    From plydata with BSD 3-Clause "New" or "Revised" License
def test_group_by_all():
    df = pd.DataFrame({
        'alpha': list('aaabbb'),
        'beta': list('babruq'),
        'theta': list('cdecde'),
        'x': [1, 2, 3, 4, 5, 6],
        'y': [6, 5, 4, 3, 2, 1],
        'z': [7, 9, 11, 8, 10, 12]
    })

    result = df >> group_by_all()
    assert len(df.columns) == len(result.columns)
    assert len(df.columns) == len(result.plydata_groups)

    result = df >> group_by_all(pd.Categorical)
    assert len(df.columns) == len(result.columns)
    assert len(df.columns) == len(result.plydata_groups)

    result = df >> group_by_all(dict(cat=pd.Categorical))
    assert len(df.columns)*2 == len(result.columns)
    for col in df.columns:
        col_cat = '{}_cat'.format(col)
        assert not pdtypes.is_categorical_dtype(result[col])
        assert pdtypes.is_categorical_dtype(result[col_cat])

    result = (df
              >> group_by('x')
              >> group_by_all(dict(cat=pd.Categorical)))
    assert result.plydata_groups == [
        '{}_cat'.format(col) for col in df.columns if col != 'x']
    assert len(df.columns)*2-1 == len(result.columns)
    assert 'x_cat' not in result 
Example #14
Source File: test_dataframe.py    From plydata with BSD 3-Clause "New" or "Revised" License
def test_summarize():
    df = pd.DataFrame({'x': [1, 5, 2, 2, 4, 0, 4],
                       'y': [1, 2, 3, 4, 5, 6, 5],
                       'z': [1, 3, 3, 4, 5, 5, 5]})

    result = df >> summarize('np.sum(x)', max='np.max(x)')
    assert result.loc[0, 'max'] == np.max(df['x'])
    assert result.loc[0, 'np.sum(x)'] == np.sum(df['x'])

    result = df >> group_by('y', 'z') >> summarize(mean_x='np.mean(x)')
    assert 'y' in result
    assert 'z' in result
    assert all(result['mean_x'] == [1, 5, 2, 2, 4, 0])

    # (Name, Expression) tuples
    result = df >> summarize(('sum', 'np.sum(x)'), ('max', 'np.max(x)'))
    assert 'sum' in result
    assert 'max' in result

    # Branches
    result = df >> group_by('y') >> summarize('np.sum(z)', constant=1)
    assert 'y' in result
    assert result.loc[0, 'constant'] == 1

    # Category stays category
    df1 = df.copy()
    df1['z'] = pd.Categorical(df1['z'])
    result = df1 >> group_by('y', 'z') >> summarize(mean_x='np.mean(x)')
    assert result['y'].dtype == np.dtype(int)  # np.int was removed in NumPy 1.24
    assert pdtypes.is_categorical_dtype(result['z']) 
Example #15
Source File: utils.py    From plotnine with GNU General Public License v2.0
def add_margins(df, vars, margins=True):
    """
    Add margins to a data frame.

    All margining variables will be converted to factors.

    Parameters
    ----------
    df : dataframe
        input data frame

    vars : list
        a list of two lists or tuples giving the
        variables in each dimension

    margins : bool | list
        variable names to compute margins for.
        True will compute all possible margins.
    """
    margin_vars = _margins(vars, margins)
    if not margin_vars:
        return df

    # create margin dataframes
    margin_dfs = [df]
    for vlst in margin_vars[1:]:
        dfx = df.copy()
        for v in vlst:
            dfx.loc[0:, v] = '(all)'
        margin_dfs.append(dfx)

    merged = pd.concat(margin_dfs, axis=0)
    merged.reset_index(drop=True, inplace=True)

    # All margin columns become categoricals. The margin indicator
    # (all) needs to be added as the last level of the categories.
    categories = {}
    for v in itertools.chain(*vars):
        col = df[v]
        if not pdtypes.is_categorical_dtype(df[v].dtype):
            col = pd.Categorical(df[v])
        categories[v] = col.categories
        if '(all)' not in categories[v]:
            categories[v] = categories[v].insert(
                len(categories[v]), '(all)')

    for v in merged.columns.intersection(set(categories)):
        merged[v] = merged[v].astype(
            pdtypes.CategoricalDtype(categories[v]))

    return merged 
Example #16
Source File: test_integration.py    From pymapd with Apache License 2.0
def test_upload_pandas_categorical_ipc(self, con):

    con.execute("DROP TABLE IF EXISTS test_categorical;")

    df = pd.DataFrame({"A": ["a", "b", "c", "a"]})
    df["B"] = df["A"].astype('category')

    # test that table created correctly when it doesn't exist on server
    con.load_table("test_categorical", df)
    ans = con.execute("select * from test_categorical").fetchall()

    assert ans == [('a', 'a'), ('b', 'b'), ('c', 'c'), ('a', 'a')]

    assert con.get_table_details("test_categorical") == [
        ColumnDetails(
            name='A',
            type='STR',
            nullable=True,
            precision=0,
            scale=0,
            comp_param=32,
            encoding='DICT',
            is_array=False,
        ),
        ColumnDetails(
            name='B',
            type='STR',
            nullable=True,
            precision=0,
            scale=0,
            comp_param=32,
            encoding='DICT',
            is_array=False,
        ),
    ]

    # load row-wise
    con.load_table("test_categorical", df, method="rows")

    # load columnar
    con.load_table("test_categorical", df, method="columnar")

    # load arrow
    con.load_table("test_categorical", df, method="arrow")

    # test end result
    df_ipc = con.select_ipc("select * from test_categorical")
    assert df_ipc.shape == (16, 2)

    # DataFrame.append was removed in pandas 2.0; concat four copies instead
    res = pd.concat([df, df, df, df]).reset_index(drop=True)
    res["A"] = res["A"].astype('category')
    res["B"] = res["B"].astype('category')
    assert pd.DataFrame.equals(df_ipc, res)

    # test that input df wasn't mutated
    # original input is object, categorical
    # to load via Arrow, converted internally to object, object
    assert is_object_dtype(df["A"])
    assert is_categorical_dtype(df["B"])
    con.execute("DROP TABLE IF EXISTS test_categorical;")
Example #17
Source File: data.py    From dask-ml with BSD 3-Clause "New" or "Revised" License
def fit(
    self, X: DataFrameType, y: Optional[Union[ArrayLike, SeriesType]] = None
) -> "DummyEncoder":
    """Determine the categorical columns to be dummy encoded.

    Parameters
    ----------
    X : pandas.DataFrame or dask.dataframe.DataFrame
    y : ignored

    Returns
    -------
    self
    """
    self.columns_ = X.columns
    columns = self.columns
    if columns is None:
        columns = X.select_dtypes(include=["category"]).columns
    else:
        for column in columns:
            assert is_categorical_dtype(X[column]), "Must be categorical"

    self.categorical_columns_ = columns
    self.non_categorical_columns_ = X.columns.drop(self.categorical_columns_)

    if _HAS_CTD:
        self.dtypes_ = {col: X[col].dtype for col in self.categorical_columns_}
    else:
        self.dtypes_ = {
            col: (X[col].cat.categories, X[col].cat.ordered)
            for col in self.categorical_columns_
        }

    left = len(self.non_categorical_columns_)
    self.categorical_blocks_ = {}
    for col in self.categorical_columns_:
        right = left + len(X[col].cat.categories)
        if self.drop_first:
            right -= 1
        self.categorical_blocks_[col], left = slice(left, right), right

    if isinstance(X, pd.DataFrame):
        sample = X.iloc[:1]
    else:
        sample = X._meta_nonempty

    self.transformed_columns_ = pd.get_dummies(
        sample, drop_first=self.drop_first
    ).columns
    return self