Python pandas.api.types.is_categorical() Examples

The following are 21 code examples of pandas.api.types.is_categorical(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pandas.api.types , or try the search function .
Example #1
Source File: test_readwrite.py    From anndata with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def test_readwrite_zarr(typ, tmp_path):
    X = typ(X_list)
    adata_src = ad.AnnData(X, obs=obs_dict, var=var_dict, uns=uns_dict)
    adata_src.raw = adata_src
    assert not is_categorical(adata_src.obs["oanno1"])
    adata_src.write_zarr(tmp_path / "test_zarr_dir", chunks=True)

    adata = ad.read_zarr(tmp_path / "test_zarr_dir")
    assert is_categorical(adata.obs["oanno1"])
    assert not is_categorical(adata.obs["oanno2"])
    assert adata.obs.index.tolist() == ["name1", "name2", "name3"]
    assert adata.obs["oanno1"].cat.categories.tolist() == ["cat1", "cat2"]
    assert is_categorical(adata.raw.var["vanno2"])
    assert np.all(adata.obs == adata_src.obs)
    assert np.all(adata.var == adata_src.var)
    assert np.all(adata.var.index == adata_src.var.index)
    assert adata.var.index.dtype == adata_src.var.index.dtype
    assert type(adata.raw.X) is type(adata_src.raw.X)
    assert np.allclose(asarray(adata.raw.X), asarray(adata_src.raw.X))
    assert np.all(adata.raw.var == adata_src.raw.var)
    assert isinstance(adata.uns["uns4"]["a"], (int, np.integer))
    assert isinstance(adata_src.uns["uns4"]["a"], (int, np.integer))
    assert type(adata.uns["uns4"]["c"]) is type(adata_src.uns["uns4"]["c"])
    assert_equal(adata, adata_src) 
Example #2
Source File: anndata.py    From anndata with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def _remove_unused_categories(self, df_full, df_sub, uns):
        from pandas.api.types import is_categorical

        for k in df_full:
            if not is_categorical(df_full[k]):
                continue
            all_categories = df_full[k].cat.categories
            df_sub[k].cat.remove_unused_categories(inplace=True)
            # also correct the colors...
            color_key = f"{k}_colors"
            if color_key not in uns:
                continue
            color_vec = uns[color_key]
            if np.array(color_vec).ndim == 0:
                # Make 0D arrays into 1D ones
                uns[color_key] = np.array(color_vec)[(None,)]
            elif len(color_vec) != len(all_categories):
                # Reset colors
                del uns[color_key]
            else:
                idx = np.where(np.in1d(all_categories, df_sub[k].cat.categories))[0]
                uns[color_key] = np.array(color_vec)[(idx,)] 
Example #3
Source File: utils.py    From plotnine with GNU General Public License v2.0 6 votes vote down vote up
def ordinal(arr):
        """
        Return True if array is an ordered categorical

        Parameters
        ----------
        arr : numpy.array
            Must have a dtype

        Returns
        -------
        out : bool
            Whether array `arr` is an ordered categorical
        """
        if pdtypes.is_categorical(arr):
            return arr.cat.ordered
        return False 
Example #4
Source File: utils.py    From scvelo with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def get_colors(adata, c):
    if is_color_like(c):
        return c
    else:
        if f"{c}_colors" not in adata.uns.keys():
            palette = default_palette(None)
            palette = adjust_palette(palette, length=len(adata.obs[c].cat.categories))
            n_cats = len(adata.obs[c].cat.categories)
            adata.uns[f"{c}_colors"] = palette[:n_cats].by_key()["color"]
        if isinstance(adata.uns[f"{c}_colors"], dict):
            cluster_ix = adata.obs[c].values
        else:
            cluster_ix = adata.obs[c].cat.codes.values
        return np.array(
            [
                adata.uns[f"{c}_colors"][cluster_ix[i]]
                if cluster_ix[i] != -1
                else "lightgrey"
                for i in range(adata.n_obs)
            ]
        ) 
Example #5
Source File: utils.py    From scvelo with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def get_groups(adata, groups, groupby=None):
    if not isinstance(groupby, str) or groupby not in adata.obs.keys():
        groupby = (
            "clusters"
            if "clusters" in adata.obs.keys()
            else "louvain"
            if "louvain" in adata.obs.keys()
            else None
        )
    if groups is True:
        return None, groupby
    if groups is not None and not isinstance(groups, str) and len(groups) == 1:
        groups = groups[0]
    if isinstance(groups, str):
        cats = [""]
        if is_categorical(adata, groupby):
            cats = adata.obs[groupby].cat.categories
        if ":" in groups and not np.any([":" in cat for cat in cats]):
            groupby, groups = groups.split(":")
            groups = groups.strip()
        if "," in groups and not np.any(["," in cat for cat in cats]):
            groups = [a.strip() for a in groups.split(",")]
    if isinstance(groups, str):
        groups = [groups]
    return groups, groupby 
Example #6
Source File: _create.py    From cooler with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def _rename_chroms(grp, rename_dict, h5opts):
    chroms = get(grp["chroms"]).set_index("name")
    n_chroms = len(chroms)
    new_names = np.array(
        chroms.rename(rename_dict).index.values, dtype=CHROM_DTYPE
    )  # auto-adjusts char length

    del grp["chroms/name"]
    grp["chroms"].create_dataset(
        "name", shape=(n_chroms,), dtype=new_names.dtype, data=new_names, **h5opts
    )

    bins = get(grp["bins"])
    n_bins = len(bins)
    idmap = dict(zip(new_names, range(n_chroms)))
    if is_categorical(bins["chrom"]) or is_integer(bins["chrom"]):
        chrom_ids = bins["chrom"].cat.codes
        chrom_dtype = h5py.special_dtype(enum=(CHROMID_DTYPE, idmap))
        del grp["bins/chrom"]
        try:
            grp["bins"].create_dataset(
                "chrom", shape=(n_bins,), dtype=chrom_dtype, data=chrom_ids, **h5opts
            )
        except ValueError:
            # If HDF5 enum header would be too large,
            # try storing chrom IDs as raw int instead
            chrom_dtype = CHROMID_DTYPE
            grp["bins"].create_dataset(
                "chrom", shape=(n_bins,), dtype=chrom_dtype, data=chrom_ids, **h5opts
            ) 
Example #7
Source File: test_readwrite.py    From anndata with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def test_readwrite_backed(typ, backing_h5ad):
    X = typ(X_list)
    adata_src = ad.AnnData(X, obs=obs_dict, var=var_dict, uns=uns_dict)
    adata_src.filename = backing_h5ad  # change to backed mode
    adata_src.write()

    adata = ad.read(backing_h5ad)
    assert is_categorical(adata.obs["oanno1"])
    assert not is_categorical(adata.obs["oanno2"])
    assert adata.obs.index.tolist() == ["name1", "name2", "name3"]
    assert adata.obs["oanno1"].cat.categories.tolist() == ["cat1", "cat2"]
    assert_equal(adata, adata_src) 
Example #8
Source File: test_readwrite.py    From anndata with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def test_readwrite_h5ad(typ, dataset_kwargs, backing_h5ad):
    tmpdir = tempfile.TemporaryDirectory()
    tmpdirpth = Path(tmpdir.name)
    mid_pth = tmpdirpth / "mid.h5ad"

    X = typ(X_list)
    adata_src = ad.AnnData(X, obs=obs_dict, var=var_dict, uns=uns_dict)
    assert not is_categorical(adata_src.obs["oanno1"])
    adata_src.raw = adata_src
    adata_src.write(backing_h5ad, **dataset_kwargs)

    adata_mid = ad.read(backing_h5ad)
    adata_mid.write(mid_pth, **dataset_kwargs)

    adata = ad.read_h5ad(mid_pth)
    assert is_categorical(adata.obs["oanno1"])
    assert not is_categorical(adata.obs["oanno2"])
    assert adata.obs.index.tolist() == ["name1", "name2", "name3"]
    assert adata.obs["oanno1"].cat.categories.tolist() == ["cat1", "cat2"]
    assert is_categorical(adata.raw.var["vanno2"])
    assert np.all(adata.obs == adata_src.obs)
    assert np.all(adata.var == adata_src.var)
    assert np.all(adata.var.index == adata_src.var.index)
    assert adata.var.index.dtype == adata_src.var.index.dtype
    assert type(adata.raw.X) is type(adata_src.raw.X)
    assert type(adata.raw.varm) is type(adata_src.raw.varm)
    assert np.allclose(asarray(adata.raw.X), asarray(adata_src.raw.X))
    assert np.all(adata.raw.var == adata_src.raw.var)
    assert isinstance(adata.uns["uns4"]["a"], (int, np.integer))
    assert isinstance(adata_src.uns["uns4"]["a"], (int, np.integer))
    assert type(adata.uns["uns4"]["c"]) is type(adata_src.uns["uns4"]["c"])
    assert_equal(adata, adata_src) 
Example #9
Source File: stat_boxplot.py    From plotnine with GNU General Public License v2.0 5 votes vote down vote up
def compute_group(cls, data, scales, **params):
        labels = ['x', 'y']
        X = np.array(data[labels])
        res = boxplot_stats(X, whis=params['coef'], labels=labels)[1]
        try:
            n = data['weight'].sum()
        except KeyError:
            n = len(data['y'])

        if len(np.unique(data['x'])) > 1:
            width = np.ptp(data['x']) * 0.9
        else:
            width = params['width']

        if pdtypes.is_categorical(data['x']):
            x = data['x'].iloc[0]
        else:
            x = np.mean([data['x'].min(), data['x'].max()])

        d = {'ymin': res['whislo'],
             'lower': res['q1'],
             'middle': [res['med']],
             'upper': res['q3'],
             'ymax': res['whishi'],
             'outliers': [res['fliers']],
             'notchupper': res['med']+1.58*res['iqr']/np.sqrt(n),
             'notchlower': res['med']-1.58*res['iqr']/np.sqrt(n),
             'x': x,
             'width': width,
             'relvarwidth': np.sqrt(n)}
        return pd.DataFrame(d) 
Example #10
Source File: utils.py    From scvelo with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def default_legend_loc(adata, color, legend_loc):
    n_categories = 0
    if is_categorical(adata, color):
        n_categories = len(adata.obs[color].cat.categories)
    if legend_loc is False:
        legend_loc = "none"
    elif legend_loc is None:
        legend_loc = "upper right" if n_categories <= 4 else "on data"
    return legend_loc 
Example #11
Source File: utils.py    From scvelo with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def get_value_counts(adata, color):
    value_counts = adata.obs[color].value_counts()
    probs = np.array(adata.obs[color])
    for cat in value_counts.index:
        probs[probs == cat] = value_counts[cat]
    return np.array(probs, dtype=np.float32) 
Example #12
Source File: utils.py    From scvelo with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def is_categorical(data, c=None):
    from pandas.api.types import is_categorical as cat

    if c is None:
        return cat(data)  # if data is categorical/array
    if not is_view(data):  # if data is anndata view
        strings_to_categoricals(data)
    return isinstance(c, str) and c in data.obs.keys() and cat(data.obs[c]) 
Example #13
Source File: auto_prep.py    From nyaggle with MIT License 5 votes vote down vote up
def _fill_na_by_unique_value(strain: pd.Series, stest: Optional[pd.Series]) -> Tuple[pd.Series, pd.Series]:
    if is_categorical(strain):
        return strain.cat.codes, stest.cat.codes
    elif is_integer_dtype(strain.dtype):
        fillval = min(strain.min(), stest.min()) - 1
        return strain.fillna(fillval), stest.fillna(fillval)
    else:
        return strain.astype(str), stest.astype(str) 
Example #14
Source File: auto_prep.py    From nyaggle with MIT License 5 votes vote down vote up
def autoprep_gbdt(algorithm_type: str, X_train: pd.DataFrame, X_test: Optional[pd.DataFrame],
                  categorical_feature_to_treat: Optional[List[str]] = None) -> Tuple[pd.DataFrame, pd.DataFrame]:
    if categorical_feature_to_treat is None:
        categorical_feature_to_treat = [c for c in X_train.columns if X_train[c].dtype.name in ['object', 'category']]

    # LightGBM:
    # Can handle categorical dtype. Otherwise, int, float or bool is acceptable for categorical columns.
    # https://lightgbm.readthedocs.io/en/latest/Advanced-Topics.html#categorical-feature-support
    #
    # CatBoost:
    # int, float, bool or str is acceptable for categorical columns. NaN should be filled.
    # https://catboost.ai/docs/concepts/faq.html#why-float-and-nan-values-are-forbidden-for-cat-features
    #
    # XGBoost:
    # All categorical column should be encoded beforehand.

    if algorithm_type == 'lgbm':
        # LightGBM can handle categorical dtype natively
        categorical_feature_to_treat = [c for c in categorical_feature_to_treat if not is_categorical(X_train[c])]

    if algorithm_type == 'cat' and len(categorical_feature_to_treat) > 0:
        X_train = X_train.copy()
        X_test = X_test.copy() if X_test is not None else X_train.iloc[:1, :].copy()  # dummy
        for c in categorical_feature_to_treat:
            X_train[c], X_test[c] = _fill_na_by_unique_value(X_train[c], X_test[c])

    if algorithm_type in ('lgbm', 'xgb') and len(categorical_feature_to_treat) > 0:
        assert X_test is not None, "X_test is required for XGBoost with categorical variables"
        X_train = X_train.copy()
        X_test = X_test.copy()

        for c in categorical_feature_to_treat:
            X_train[c], X_test[c] = _fill_na_by_unique_value(X_train[c], X_test[c])
            le = LabelEncoder()
            concat = np.concatenate([X_train[c].values, X_test[c].values])
            concat = le.fit_transform(concat)
            X_train[c] = concat[:len(X_train)]
            X_test[c] = concat[len(X_train):]

    return X_train, X_test 
Example #15
Source File: builder.py    From dgl with Apache License 2.0 5 votes vote down vote up
def _series_to_tensor(series):
    if is_categorical(series):
        return torch.LongTensor(series.cat.codes.values.astype('int64'))
    else:       # numeric
        return torch.FloatTensor(series.values) 
Example #16
Source File: cat_tools.py    From plydata with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def as_categorical(c, copy=True):
    """
    Convert input to a categorical

    Parameters
    ----------
    c : categorical_like
        Sequence of objects
    copy : bool
        If `True` and c is alread a categorical, return
        a copy of `c` otherwise return `c`.

    Returns
    -------
    out : categorical
        Categorical made out of `c` or copy of `c`
        if it was a categorical

    """
    if not pdtypes.is_categorical(c):
        c = pd.Categorical(c)
    elif copy:
        c = c.copy()
    return c


# Temporary functions 
Example #17
Source File: cat_tools.py    From plydata with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def cat_concat(*args):
    """
    Concatenate categoricals and combine the categories

    Parameters
    ----------
    *args : tuple
        Categoricals to be concatenated

    Examples
    --------
    >>> c1 = pd.Categorical(['a', 'b'], categories=['b', 'a'])
    >>> c2 = pd.Categorical(['d', 'a', 'c'])
    >>> cat_concat(c1, c2)
    [a, b, d, a, c]
    Categories (4, object): [b, a, c, d]

    Notes
    -----
    The resulting category is not ordered.
    """
    categories = pd.unique(list(chain(*(
        c.categories if pdtypes.is_categorical(c) else c
        for c in args
    ))))
    cs = pd.Categorical(
        list(chain(*(c for c in args))),
        categories=categories
    )
    return cs 
Example #18
Source File: anndata.py    From anndata with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
def strings_to_categoricals(self, df: Optional[pd.DataFrame] = None):
        """\
        Transform string annotations to categoricals.

        Only affects string annotations that lead to less categories than the
        total number of observations.

        Params
        ------
        df
            If `df` is `None`, modifies both :attr:`obs` and :attr:`var`,
            otherwise modifies `df` inplace.

        Notes
        -----
        Turns the view of an :class:`~anndata.AnnData` into an actual
        :class:`~anndata.AnnData`.
        """
        dont_modify = False  # only necessary for backed views
        if df is None:
            dfs = [self.obs, self.var]
            if self.is_view and self.isbacked:
                dont_modify = True
        else:
            dfs = [df]
        for df in dfs:
            string_cols = [
                key
                for key in df.columns
                if is_string_dtype(df[key]) and not is_categorical(df[key])
            ]
            for key in string_cols:
                # make sure we only have strings
                # (could be that there are np.nans (float), -666, "-666", for instance)
                c = df[key].astype("U")
                # make a categorical
                c = pd.Categorical(c, categories=natsorted(np.unique(c)))
                if len(c.categories) >= len(c):
                    continue
                if dont_modify:
                    raise RuntimeError(
                        "Please call `.strings_to_categoricals()` on full "
                        "AnnData, not on this view. You might encounter this"
                        "error message while copying or writing to disk."
                    )
                if self.is_view:
                    warnings.warn(
                        "Initializing view as actual.", ImplicitModificationWarning
                    )
                # If `self` is a view, it will be actualized in the next line,
                # therefore the previous warning
                df[key] = c
                logger.info(f"... storing {key!r} as categorical") 
Example #19
Source File: cat_tools.py    From plydata with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
def cat_zip(*args, sep=':', keep_empty=False):
    """
    Create a new categorical (zip style) combined from two or more

    Parameters
    ----------
    *args : tuple
        Categoricals to be concatenated.
    sep : str (default: ':')
        Separator for the combined categories.
    keep_empty : bool (default: False)
        If ``True``, include all combinations of categories
        even those without observations.

    Examples
    --------
    >>> c1 = pd.Categorical(list('aba'))
    >>> c2 = pd.Categorical(list('122'))
    >>> cat_zip(c1, c2)
    [a:1, b:2, a:2]
    Categories (3, object): [a:1, a:2, b:2]
    >>> cat_zip(c1, c2, keep_empty=True)
    [a:1, b:2, a:2]
    Categories (4, object): [a:1, a:2, b:1, b:2]
    """
    values = [sep.join(items) for items in zip(*args)]
    cs = [
        c if pdtypes.is_categorical(c) else pd.Categorical(c)
        for c in args
    ]
    categories = [
        sep.join(items)
        for items in product(*(c.categories for c in cs))
    ]

    c = pd.Categorical(values, categories=categories)

    if not keep_empty:
        c.remove_unused_categories(inplace=True)

    return c


# helpers 
Example #20
Source File: cat_tools.py    From plydata with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
def cat_remove_unused(c, only=None):
    """
    Remove unused categories

    Parameters
    ----------
    c : list-like
        Values that will make up the categorical.
    only : list-like (optional)
        The categories to remove *if* they are empty. If not given,
        all unused categories are dropped.

    Examples
    --------
    >>> c = pd.Categorical(list('abcdd'), categories=list('bacdefg'))
    >>> c
    [a, b, c, d, d]
    Categories (7, object): [b, a, c, d, e, f, g]
    >>> cat_remove_unused(c)
    [a, b, c, d, d]
    Categories (4, object): [b, a, c, d]
    >>> cat_remove_unused(c, only=['a', 'e', 'g'])
    [a, b, c, d, d]
    Categories (5, object): [b, a, c, d, f]
    """
    if not pdtypes.is_categorical(c):
        # All categories are used
        c = pd.Categorical(c)
        return c
    else:
        c = c.copy()

    if only is None:
        only = c.categories

    used_idx = pd.unique(c.codes)
    used_categories = c.categories[used_idx]
    c = c.remove_categories(
        c.categories
        .difference(used_categories)
        .intersection(only)
    )
    return c 
Example #21
Source File: cat_tools.py    From plydata with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
def cat_infreq(c, ordered=None):
    """
    Reorder categorical by frequency of the values

    Parameters
    ----------
    c : list-like
        Values that will make up the categorical.
    ordered : bool
        If ``True``, the categorical is ordered.

    Returns
    -------
    out : categorical
        Values

    Examples
    --------
    >>> x = ['d', 'a', 'b', 'b', 'c', 'c', 'c']
    >>> cat_infreq(x)
    [d, a, b, b, c, c, c]
    Categories (4, object): [c, b, d, a]
    >>> cat_infreq(x, ordered=True)
    [d, a, b, b, c, c, c]
    Categories (4, object): [c < b < d < a]

    When two or more values occur the same number of times, if the
    categorical is ordered, the order is preserved. If it is not
    not ordered, the order depends on that of the values. Above 'd'
    comes before 'a', and below 'a' comes before 'a'.

    >>> c = pd.Categorical(
    ...     x, categories=['a', 'c', 'b', 'd']
    ... )
    >>> cat_infreq(c)
    [d, a, b, b, c, c, c]
    Categories (4, object): [c, b, a, d]
    >>> cat_infreq(c.set_ordered(True))
    [d, a, b, b, c, c, c]
    Categories (4, object): [c < b < a < d]
    """
    kwargs = {} if ordered is None else {'ordered': ordered}
    counts = value_counts(c)
    if pdtypes.is_categorical(c):
        original_cat_order = c.categories
    else:
        original_cat_order = pd.unique(c)
    counts = counts.reindex(index=original_cat_order)
    cats = (_stable_series_sort(counts, ascending=False)
            .index
            .to_list())
    return pd.Categorical(c, categories=cats, **kwargs)