Python pandas.api.types.is_categorical_dtype() Examples
The following are 17 code examples of pandas.api.types.is_categorical_dtype().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions and classes of the module pandas.api.types, or try the search function.
Example #1
Source File: data.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 6 votes |
def _fit(self, X: DataFrameType):
    """Collect categorical dtype information for each target column.

    Returns a ``(columns, categories)`` pair where ``categories`` maps a
    column name to its CategoricalDtype (or to a ``(categories, ordered)``
    tuple on pandas versions without CategoricalDtype).
    """
    if self.columns is None:
        columns = X.select_dtypes(include=["object", "category"]).columns
    else:
        columns = self.columns

    categories = {}
    for name in columns:
        series = X[name]
        if not is_categorical_dtype(series):
            # Not expected for a dask.array input: the object columns would
            # already have been converted to known categoricals there.
            series = pd.Series(series, index=X.index).astype("category")
        if _HAS_CTD:
            categories[name] = series.dtype
        else:
            categories[name] = (series.cat.categories, series.cat.ordered)
    return columns, categories
Example #2
Source File: zarr.py From anndata with BSD 3-Clause "New" or "Revised" License | 6 votes |
def write_series(group, key, series, dataset_kwargs=MappingProxyType({})):
    """Write a pandas Series (or Index) into a zarr group under ``key``.

    Object dtype is stored as variable-length UTF-8 strings; categoricals
    are stored as integer codes plus a separate categories array; anything
    else is written as a plain array.
    """
    if series.dtype == object:
        # Variable-length strings need an explicit object codec in zarr.
        group.create_dataset(
            key,
            shape=series.shape,
            dtype=object,
            object_codec=numcodecs.VLenUTF8(),
            **dataset_kwargs,
        )
        group[key][:] = series.values
    elif is_categorical_dtype(series):
        # Works for both a categorical Series and a categorical Index.
        cat: pd.Categorical = series.values
        level_values: np.ndarray = cat.categories.values
        level_codes: np.ndarray = cat.codes
        category_key = f"__categories/{key}"

        write_array(group, category_key, level_values, dataset_kwargs=dataset_kwargs)
        write_array(group, key, level_codes, dataset_kwargs=dataset_kwargs)

        group[key].attrs["categories"] = category_key
        # np.bool_ is not json-serializable, so coerce to a plain bool.
        group[category_key].attrs["ordered"] = bool(cat.ordered)
    else:
        group[key] = series.values
Example #3
Source File: h5ad.py From anndata with BSD 3-Clause "New" or "Revised" License | 6 votes |
def write_series(group, key, series, dataset_kwargs=MappingProxyType({})):
    """Write a pandas Series (or Index) into an h5py group under ``key``.

    ``group`` here is an h5py type, otherwise categoricals won't write.
    Object dtype is assumed to be strings; categoricals are stored as
    integer codes plus a referenced categories array.
    """
    if series.dtype == object:
        # Assuming object dtype means string data.
        group.create_dataset(
            key,
            data=series.values,
            dtype=h5py.special_dtype(vlen=str),
            **dataset_kwargs,
        )
    elif is_categorical_dtype(series):
        # Works for both a categorical Series and a categorical Index.
        cat: pd.Categorical = series.values
        level_values: np.ndarray = cat.categories.values
        level_codes: np.ndarray = cat.codes
        category_key = f"__categories/{key}"

        write_array(group, category_key, level_values, dataset_kwargs=dataset_kwargs)
        write_array(group, key, level_codes, dataset_kwargs=dataset_kwargs)

        # Link codes to their categories via an h5py object reference.
        group[key].attrs["categories"] = group[category_key].ref
        group[category_key].attrs["ordered"] = cat.ordered
    else:
        group[key] = series.values
Example #4
Source File: test_concatenate.py From anndata with BSD 3-Clause "New" or "Revised" License | 5 votes |
def fix_known_differences(orig, result):
    """
    Helper function for reducing anndata's to only the elements we expect
    to be equivalent after concatenation.

    Only for the case where ``orig`` is the ground truth result of what
    concatenation should be.
    """
    orig = orig.copy()
    result = result.copy()

    result.obs.drop(columns=["batch"], inplace=True)
    result.strings_to_categoricals()  # Should this be implicit in concatenation?

    # TODO
    # * merge varm, varp similar to uns
    # * merge obsp, but some information should be lost
    del orig.varm
    del orig.varp
    del orig.obsp

    # TODO: ordered categoricals currently lose their orderedness during
    # concatenation, so restore the original dtype here.
    for name, col_dtype in orig.obs.dtypes.items():
        if is_categorical_dtype(col_dtype) and col_dtype.ordered:
            result.obs[name] = result.obs[name].astype(col_dtype)

    return orig, result
Example #5
Source File: test_data.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_dask(self):
    """Categorizer should convert object columns of a dask frame to category."""
    ddf = dd.from_pandas(raw, npartitions=2)
    categorizer = dpp.Categorizer()
    transformed = categorizer.fit_transform(ddf)

    # The three object columns become categorical ...
    for name in ("A", "B", "C"):
        assert is_categorical_dtype(transformed[name])
    # ... while the numeric column keeps its dtype.
    assert transformed["D"].dtype == np.dtype("int64")

    tm.assert_index_equal(categorizer.columns_, pd.Index(["A", "B", "C"]))
Example #6
Source File: test_data.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_ce(self):
    """Categorizer should convert object columns without mutating the input."""
    categorizer = dpp.Categorizer()
    original = raw.copy()
    transformed = categorizer.fit_transform(raw)

    for name in ("A", "B", "C"):
        assert is_categorical_dtype(transformed[name])
    assert transformed["D"].dtype == np.dtype("int64")

    tm.assert_index_equal(categorizer.columns_, pd.Index(["A", "B", "C"]))
    # fit_transform must not modify the input frame in place.
    tm.assert_frame_equal(raw, original)
Example #7
Source File: data.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def fit(
    self, X: DataFrameType, y: Optional[Union[ArrayLike, SeriesType]] = None
) -> "OrdinalEncoder":
    """Determine the categorical columns to be encoded.

    Parameters
    ----------
    X : pandas.DataFrame or dask.dataframe.DataFrame
    y : ignored

    Returns
    -------
    self
    """
    self.columns_ = X.columns
    if self.columns is None:
        columns = X.select_dtypes(include=["category"]).columns
    else:
        columns = self.columns
        # User-supplied columns must already be categorical.
        for column in columns:
            assert is_categorical_dtype(X[column]), "Must be categorical"

    self.categorical_columns_ = columns
    self.non_categorical_columns_ = X.columns.drop(self.categorical_columns_)

    # Remember each column's dtype so transform/inverse_transform can
    # reproduce it; older pandas lacks CategoricalDtype, so fall back to
    # a (categories, ordered) tuple there.
    if _HAS_CTD:
        self.dtypes_ = {col: X[col].dtype for col in self.categorical_columns_}
    else:
        self.dtypes_ = {
            col: (X[col].cat.categories, X[col].cat.ordered)
            for col in self.categorical_columns_
        }
    return self
Example #8
Source File: analysis.py From reportgen with MIT License | 5 votes |
def describe(data):
    """Generate per-column summary statistics.

    For each column the summary contains:
        dtype       : inferred variable type
        max         : maximum value, or the most frequent level
        min         : minimum value, or the least frequent level
        mean        : mean, or the level whose frequency equals the median
        missing_pct : fraction of missing entries
        std/nuniue  : standard deviation, or number of unique levels
                      (NOTE(review): label typo kept for backward
                      compatibility with existing consumers)

    Parameters
    ----------
    data : DataFrame or anything accepted by ``pd.DataFrame``

    Returns
    -------
    pd.DataFrame
        One column per input variable, indexed by the statistic names.
    """
    data = pd.DataFrame(data)
    n_sample = len(data)
    var_type = type_of_var(data, copy=True)
    summary = pd.DataFrame(
        columns=data.columns,
        index=['dtype', 'max', 'min', 'mean', 'missing_pct', 'std/nuniue'],
    )
    for c in data.columns:
        missing_pct = 1 - data[c].count() / n_sample
        if var_type[c] == 'number':
            max_value, min_value, mean_value = (
                data[c].max(), data[c].min(), data[c].mean(),
            )
            std_value = data[c].std()
            summary.loc[:, c] = [
                var_type[c], max_value, min_value, mean_value,
                missing_pct, std_value,
            ]
        elif var_type[c] == 'category' or is_categorical_dtype(data[c].dtype):
            tmp = data[c].value_counts()
            # BUGFIX: use idxmax/idxmin to get the *labels* of the most and
            # least frequent levels; Series.argmax/argmin return integer
            # positions on modern pandas, not labels.
            max_value, min_value = tmp.idxmax(), tmp.idxmin()
            mean_value_index = tmp[tmp == tmp.median()].index
            mean_value = (
                mean_value_index[0] if len(mean_value_index) > 0 else np.nan
            )
            summary.loc[:, c] = [
                var_type[c], max_value, min_value, mean_value,
                missing_pct, len(tmp),
            ]
        elif var_type[c] == 'datetime':
            max_value, min_value = data[c].max(), data[c].min()
            summary.loc[:, c] = [
                var_type[c], max_value, min_value, np.nan,
                missing_pct, np.nan,
            ]
        else:
            summary.loc[:, c] = [
                var_type[c], np.nan, np.nan, np.nan, missing_pct, np.nan,
            ]
    return summary
Example #9
Source File: common.py From plydata with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _add_group_columns(data, gdf):
    """
    Add group columns to data with a value from the grouped dataframe

    It is assumed that the grouped dataframe contains a single group

    >>> data = pd.DataFrame({
    ...     'x': [5, 6, 7]})
    >>> gdf = GroupedDataFrame({
    ...     'g': list('aaa'),
    ...     'x': range(3)}, groups=['g'])
    >>> _add_group_columns(data, gdf)
       g  x
    0  a  5
    1  a  6
    2  a  7
    """
    n = len(data)
    if not isinstance(gdf, GroupedDataFrame):
        return data

    for position, name in enumerate(gdf.plydata_groups):
        if name in data:
            continue
        repeated = [gdf[name].iloc[0]] * n
        # Preserve the dtype of the group column: categoricals keep
        # their categories and orderedness, everything else keeps its
        # exact dtype.
        if pdtypes.is_categorical_dtype(gdf[name]):
            values = pd.Categorical(
                repeated,
                categories=gdf[name].cat.categories,
                ordered=gdf[name].cat.ordered,
            )
        else:
            values = pd.Series(
                repeated, index=data.index, dtype=gdf[name].dtype
            )
        # Group columns come first
        data.insert(position, name, values)
    return data
Example #10
Source File: density.py From plotnine with GNU General Public License v2.0 | 5 votes |
def get_var_type(col):
    """
    Return var_type (for KDEMultivariate) of the column

    Parameters
    ----------
    col : pandas.Series
        A dataframe column.

    Returns
    -------
    out : str
        One of ['c', 'o', 'u'].

    See Also
    --------
    The origin of the character codes is
    :class:`statsmodels.nonparametric.kernel_density.KDEMultivariate`.
    """
    if pdtypes.is_numeric_dtype(col):
        return 'c'  # continuous
    if pdtypes.is_categorical_dtype(col):
        # ordered categoricals are 'o', unordered ones are 'u'
        return 'o' if col.cat.ordered else 'u'
    # unordered if unsure, e.g string columns that are not categorical
    return 'u'
Example #11
Source File: utils.py From plotnine with GNU General Public License v2.0 | 5 votes |
def _id_var(x, drop=False): """ Assign ids to items in x. If two items are the same, they get the same id. Parameters ---------- x : array-like items to associate ids with drop : bool Whether to drop unused factor levels """ if len(x) == 0: return [] categorical = pdtypes.is_categorical_dtype(x) if categorical: if drop: x = x.cat.remove_unused_categories() lst = list(x.cat.codes + 1) else: has_nan = any(np.isnan(i) for i in x if isinstance(i, float)) if has_nan: # NaNs are -1, we give them the highest code nan_code = -1 new_nan_code = np.max(x.cat.codes) + 1 lst = [val if val != nan_code else new_nan_code for val in x] else: lst = list(x.cat.codes + 1) else: try: levels = np.sort(np.unique(x)) except TypeError: # x probably has NANs levels = multitype_sort(set(x)) lst = match(x, levels) lst = [item + 1 for item in lst] return lst
Example #12
Source File: validation.py From PandasSchema with GNU General Public License v3.0 | 5 votes |
def get_errors(self, series: pd.Series, column: 'column.Column'):
    """Return a ValidationWarning for every entry of ``series`` that fails
    this validation, honouring the column's allow_empty setting."""
    errors = []

    # True where the child class's validate() marks the value as invalid.
    failing = ~self.validate(series)

    if column.allow_empty:
        # Only flag entries that are both non-empty and invalid.
        # Explicitly special-case categorical (and numeric) series because
        # issubdtype would FAIL on a category dtype.
        if is_categorical_dtype(series) or is_numeric_dtype(series):
            validated = ~series.isnull() & failing
        else:
            validated = (series.str.len() > 0) & failing
    else:
        validated = failing

    # Walk the failing indices (usually row numbers) and report each one.
    for i in series.index[validated]:
        errors.append(
            ValidationWarning(
                message=self.message,
                value=series[i],
                row=i,
                column=series.name,
            )
        )
    return errors
Example #13
Source File: test_dataframe.py From plydata with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_group_by_all():
    """group_by_all groups every (non-grouped) column, optionally via a
    converter function or a dict of suffix -> converter."""
    df = pd.DataFrame({
        'alpha': list('aaabbb'),
        'beta': list('babruq'),
        'theta': list('cdecde'),
        'x': [1, 2, 3, 4, 5, 6],
        'y': [6, 5, 4, 3, 2, 1],
        'z': [7, 9, 11, 8, 10, 12]
    })

    # No arguments: every column becomes a group column.
    result = df >> group_by_all()
    assert len(df.columns) == len(result.columns)
    assert len(df.columns) == len(result.plydata_groups)

    # A bare converter keeps the column count unchanged.
    result = df >> group_by_all(pd.Categorical)
    assert len(df.columns) == len(result.columns)
    assert len(df.columns) == len(result.plydata_groups)

    # A dict converter adds a suffixed, converted copy of each column.
    result = df >> group_by_all(dict(cat=pd.Categorical))
    assert len(df.columns) * 2 == len(result.columns)
    for name in df.columns:
        suffixed = '{}_cat'.format(name)
        assert not pdtypes.is_categorical_dtype(result[name])
        assert pdtypes.is_categorical_dtype(result[suffixed])

    # Columns already grouped on are excluded from "all".
    result = (df
              >> group_by('x')
              >> group_by_all(dict(cat=pd.Categorical)))
    assert result.plydata_groups == [
        '{}_cat'.format(name) for name in df.columns if name != 'x']
    assert len(df.columns) * 2 - 1 == len(result.columns)
    assert 'x_cat' not in result
Example #14
Source File: test_dataframe.py From plydata with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_summarize():
    """summarize() reduces a (possibly grouped) frame to one row per group,
    accepting expression strings, keyword names, and (name, expr) tuples."""
    df = pd.DataFrame({'x': [1, 5, 2, 2, 4, 0, 4],
                       'y': [1, 2, 3, 4, 5, 6, 5],
                       'z': [1, 3, 3, 4, 5, 5, 5]})

    result = df >> summarize('np.sum(x)', max='np.max(x)')
    assert result.loc[0, 'max'] == np.max(df['x'])
    assert result.loc[0, 'np.sum(x)'] == np.sum(df['x'])

    result = df >> group_by('y', 'z') >> summarize(mean_x='np.mean(x)')
    assert 'y' in result
    assert 'z' in result
    assert all(result['mean_x'] == [1, 5, 2, 2, 4, 0])

    # (Name, Expression) tuples
    result = df >> summarize(('sum', 'np.sum(x)'), ('max', 'np.max(x)'))
    assert 'sum' in result
    assert 'max' in result

    # Branches
    result = df >> group_by('y') >> summarize('np.sum(z)', constant=1)
    assert 'y' in result
    assert result.loc[0, 'constant'] == 1

    # Category stays category
    df1 = df.copy()
    df1['z'] = pd.Categorical(df1['z'])
    result = df1 >> group_by('y', 'z') >> summarize(mean_x='np.mean(x)')
    # BUGFIX: np.int was removed in NumPy 1.24; it was a plain alias for
    # the builtin int, so compare against that directly.
    assert result['y'].dtype == int
    assert pdtypes.is_categorical_dtype(result['z'])
Example #15
Source File: utils.py From plotnine with GNU General Public License v2.0 | 4 votes |
def add_margins(df, vars, margins=True):
    """
    Add margins to a data frame.

    All margining variables will be converted to factors.

    Parameters
    ----------
    df : dataframe
        input data frame
    vars : list
        a list of 2 lists | tuples vectors giving the variables in each
        dimension
    margins : bool | list
        variable names to compute margins for. True will compute all
        possible margins.
    """
    margin_vars = _margins(vars, margins)
    if not margin_vars:
        return df

    # One copy of df per margin specification, with the margined
    # variables replaced by the '(all)' indicator.
    margin_dfs = [df]
    for vlst in margin_vars[1:]:
        dfx = df.copy()
        for v in vlst:
            dfx.loc[0:, v] = '(all)'
        margin_dfs.append(dfx)

    merged = pd.concat(margin_dfs, axis=0)
    merged.reset_index(drop=True, inplace=True)

    # All margin columns become categoricals. The margin indicator
    # (all) needs to be added as the last level of the categories.
    categories = {}
    for v in itertools.chain(*vars):
        col = df[v]
        if not pdtypes.is_categorical_dtype(df[v].dtype):
            col = pd.Categorical(df[v])
        levels = col.categories
        if '(all)' not in levels:
            levels = levels.insert(len(levels), '(all)')
        categories[v] = levels

    for v in merged.columns.intersection(set(categories)):
        merged[v] = merged[v].astype(pdtypes.CategoricalDtype(categories[v]))

    return merged
Example #16
Source File: test_integration.py From pymapd with Apache License 2.0 | 4 votes |
def test_upload_pandas_categorical_ipc(self, con):
    """Round-trip a frame with object and categorical string columns
    through every load method, and verify the input is not mutated."""
    con.execute("DROP TABLE IF EXISTS test_categorical;")
    df = pd.DataFrame({"A": ["a", "b", "c", "a"]})
    df["B"] = df["A"].astype('category')

    # test that table created correctly when it doesn't exist on server
    con.load_table("test_categorical", df)
    ans = con.execute("select * from test_categorical").fetchall()

    assert ans == [('a', 'a'), ('b', 'b'), ('c', 'c'), ('a', 'a')]

    # Both columns land as dictionary-encoded strings.
    assert con.get_table_details("test_categorical") == [
        ColumnDetails(
            name='A',
            type='STR',
            nullable=True,
            precision=0,
            scale=0,
            comp_param=32,
            encoding='DICT',
            is_array=False,
        ),
        ColumnDetails(
            name='B',
            type='STR',
            nullable=True,
            precision=0,
            scale=0,
            comp_param=32,
            encoding='DICT',
            is_array=False,
        ),
    ]

    # load row-wise
    con.load_table("test_categorical", df, method="rows")

    # load columnar
    con.load_table("test_categorical", df, method="columnar")

    # load arrow
    con.load_table("test_categorical", df, method="arrow")

    # test end result: the original rows plus three more loads.
    df_ipc = con.select_ipc("select * from test_categorical")
    assert df_ipc.shape == (16, 2)

    # BUGFIX: DataFrame.append was removed in pandas 2.0; build the
    # four-fold repetition with pd.concat instead.
    res = pd.concat([df, df, df, df]).reset_index(drop=True)
    res["A"] = res["A"].astype('category')
    res["B"] = res["B"].astype('category')
    assert pd.DataFrame.equals(df_ipc, res)

    # test that input df wasn't mutated
    # original input is object, categorical
    # to load via Arrow, converted internally to object, object
    assert is_object_dtype(df["A"])
    assert is_categorical_dtype(df["B"])
    con.execute("DROP TABLE IF EXISTS test_categorical;")
Example #17
Source File: data.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 4 votes |
def fit(
    self, X: DataFrameType, y: Optional[Union[ArrayLike, SeriesType]] = None
) -> "DummyEncoder":
    """Determine the categorical columns to be dummy encoded.

    Parameters
    ----------
    X : pandas.DataFrame or dask.dataframe.DataFrame
    y : ignored

    Returns
    -------
    self
    """
    self.columns_ = X.columns
    if self.columns is None:
        columns = X.select_dtypes(include=["category"]).columns
    else:
        columns = self.columns
        # User-supplied columns must already be categorical.
        for column in columns:
            assert is_categorical_dtype(X[column]), "Must be categorical"

    self.categorical_columns_ = columns
    self.non_categorical_columns_ = X.columns.drop(self.categorical_columns_)

    # Record each column's dtype so it can be restored on inverse transform.
    if _HAS_CTD:
        self.dtypes_ = {col: X[col].dtype for col in self.categorical_columns_}
    else:
        self.dtypes_ = {
            col: (X[col].cat.categories, X[col].cat.ordered)
            for col in self.categorical_columns_
        }

    # Slice of output columns each categorical occupies in the dummy
    # encoding (one level is dropped when drop_first is set).
    start = len(self.non_categorical_columns_)
    self.categorical_blocks_ = {}
    for col in self.categorical_columns_:
        stop = start + len(X[col].cat.categories)
        if self.drop_first:
            stop -= 1
        self.categorical_blocks_[col] = slice(start, stop)
        start = stop

    # Derive the full set of output column names from a one-row sample.
    if isinstance(X, pd.DataFrame):
        sample = X.iloc[:1]
    else:
        sample = X._meta_nonempty
    self.transformed_columns_ = pd.get_dummies(
        sample, drop_first=self.drop_first
    ).columns
    return self