Python pandas.api.types.is_numeric_dtype() Examples
The following are 30 code examples of pandas.api.types.is_numeric_dtype().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module pandas.api.types, or try the search function.
Example #1
Source File: rfpimp.py From malss with MIT License | 6 votes |
def oob_dependences(rf, X_train, n_samples=5000):
    """
    Measure how well each numeric column of X_train is predicted by the rest.

    For every numeric column, rf is refit with that column as the target and
    all remaining columns as predictors, and the resulting rf.oob_score_ is
    stored as that column's dependence. Before fitting, X_train is passed
    through sample_rows(X_train, n_samples) (n_samples defaults to 5000).

    :return: DataFrame indexed by Feature with one Dependence column, sorted
             from the highest to the lowest dependence.
    """
    # Numeric columns are picked before sampling, exactly once.
    numeric_features = [name for name in X_train
                        if is_numeric_dtype(X_train[name])]
    X_train = sample_rows(X_train, n_samples)

    df_dep = pd.DataFrame(columns=['Feature', 'Dependence']).set_index('Feature')
    for target in numeric_features:
        predictors = X_train.drop(target, axis=1)
        response = X_train[target]
        rf.fit(predictors, response)
        df_dep.loc[target] = rf.oob_score_
    return df_dep.sort_values('Dependence', ascending=False)
Example #2
Source File: visual_utils.py From ray with Apache License 2.0 | 6 votes |
def generate_plotly_dim_dict(df, field):
    """Build the dimension dict for column ``field`` of ``df``.

    Numeric columns are passed through as ``values``. String columns are
    encoded as the position of each entry within the column's unique values,
    with ``tickvals``/``ticktext`` describing that encoding.

    Raises:
        Exception: if the column is neither numeric nor string typed.
    """
    column = df[field]
    if is_numeric_dtype(column):
        return {"label": field, "values": column}
    if not is_string_dtype(column):
        raise Exception("Unidentifiable Type")

    texts = column.unique()
    # Position of every entry inside the array of unique values.
    codes = [np.argwhere(texts == x).flatten()[0] for x in column]
    return {
        "label": field,
        "values": codes,
        "tickvals": list(range(len(texts))),
        "ticktext": texts,
    }
Example #3
Source File: evaluate.py From toad with MIT License | 6 votes |
def merger_data(data, var, unique_num, is_merge_high=True):
    """Bin a numeric column with many distinct values using toad.

    If ``data[var]`` is numeric and has more than ``unique_num`` distinct
    values, rows equal to -9999999 are set aside (presumably a missing-value
    sentinel -- confirm against callers), the remaining values are optionally
    clipped at the 0.99 quantile and merged into ``unique_num`` step bins,
    and the set-aside rows are appended after the binned ones.

    :param data: DataFrame containing ``var`` and a ``'target'`` column.
    :param var: name of the column to bin.
    :param unique_num: distinct-value threshold and number of bins.
    :param is_merge_high: clip values above the 0.99 quantile before binning.
    :return: ``(binned_series, target_series, bins)`` when binning happens,
             otherwise ``(data[var], None, None)``.
    """
    if not (is_numeric_dtype(data[var]) and data[var].nunique() > unique_num):
        return data[var], None, None

    data_miss = data[data[var] == -9999999]
    data_nomiss = data[data[var] != -9999999]

    merge_high_data = data_nomiss[var]
    if is_merge_high:
        merge_high_data = toad.utils.clip(data_nomiss[var], quantile=(None, .99))
    data_index, bins = toad.merge(merge_high_data, method='step',
                                  return_splits=True, n_bins=unique_num)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # equivalent that works on old and new pandas alike.
    temp = pd.DataFrame(data_index, columns=[var])
    temp = pd.concat([temp, data_miss[[var]]], ignore_index=True)[var]
    target = pd.concat([data_nomiss, data_miss], ignore_index=True)['target']
    return temp, target, bins

# Build pivot tables for pairs of indicators
Example #4
Source File: condition_fun.py From scorecardpy with MIT License | 6 votes |
def check_cateCols_uniqueValues(dat, var_skip = None):
    """Warn interactively about non-numeric columns with many distinct values.

    Non-numeric columns of ``dat`` (minus those named in ``var_skip``) that
    have 50 or more unique values are listed on stdout and the user is asked
    whether to continue. Answering 2 raises ``SystemExit(0)``; otherwise the
    function returns ``None``.
    """
    # character columns with too many unique values
    non_numeric = [name for name in list(dat) if not is_numeric_dtype(dat[name])]
    if var_skip is not None:
        non_numeric = list(set(non_numeric) - set(str_to_list(var_skip)))
    offenders = [name for name in non_numeric if len(dat[name].unique()) >= 50]

    if len(offenders) == 0:
        return None

    print('>>> There are {} variables have too many unique non-numberic values, which might cause the binning process slow. Please double check the following variables: \n{}'.format(len(offenders), ', '.join(offenders)))
    print('>>> Continue the binning process?')
    print('1: yes \n2: no')
    # Keep prompting until one of the two offered options is entered.
    choice = int(input("Selection: "))
    while choice not in [1, 2]:
        choice = int(input("Selection: "))
    if choice == 2:
        raise SystemExit(0)
    return None

# replace blank by NA
#' @import data.table
#'
Example #5
Source File: __init__.py From pandas-summary with MIT License | 6 votes |
def _get_columns_info(self, stats):
    """Partition the columns of ``self.df`` into type buckets.

    Buckets are assigned in order, each one drawing only from the columns
    left over by the previous ones: constant (1 unique value), bool
    (2 unique values), numeric, date (datetime64), unique (as many unique
    values as counted values) and categorical (everything else).

    :param stats: mapping/frame exposing ``'uniques'`` and ``'counts'``
                  per column.
    :return: dict mapping each ``self.TYPE_*`` key to a pandas Index of
             column names.
    """
    uniques = stats['uniques']
    column_info = {}
    column_info[self.TYPE_CONSTANT] = uniques[uniques == 1].index
    column_info[self.TYPE_BOOL] = uniques[uniques == 2].index

    # Read the buckets back through the same TYPE_* constants they were
    # written with, instead of hard-coded string literals.
    rest_columns = self.get_columns(
        self.df,
        self.EXCLUDE,
        column_info[self.TYPE_CONSTANT].union(column_info[self.TYPE_BOOL]))
    column_info[self.TYPE_NUMERIC] = pd.Index(
        [c for c in rest_columns if types.is_numeric_dtype(self.df[c])])

    rest_columns = self.get_columns(
        self.df[rest_columns], self.EXCLUDE, column_info[self.TYPE_NUMERIC])
    column_info[self.TYPE_DATE] = pd.Index(
        [c for c in rest_columns if types.is_datetime64_dtype(self.df[c])])

    rest_columns = self.get_columns(
        self.df[rest_columns], self.EXCLUDE, column_info[self.TYPE_DATE])

    # A column is "unique" when every counted value is distinct.
    rest_uniques = stats['uniques'][rest_columns]
    unique_columns = rest_uniques == stats['counts'][rest_columns]
    column_info[self.TYPE_UNIQUE] = rest_uniques[unique_columns].index
    column_info[self.TYPE_CATEGORICAL] = rest_uniques[~unique_columns].index
    return column_info
Example #6
Source File: simple_imputer.py From datawig with Apache License 2.0 | 6 votes |
def check_data_types(self, data_frame: pd.DataFrame) -> None:
    """
    Classify the input columns and the output column as numeric or string.

    Sets ``self.numeric_columns``, ``self.string_columns`` (every input
    column that is not numeric) and ``self.output_type`` (``'numeric'`` or
    ``'string'``), then logs both column lists at debug level.

    :param data_frame: frame holding the input and output columns
    :return: None
    """
    numeric = [col for col in self.input_columns
               if is_numeric_dtype(data_frame[col])]
    self.numeric_columns = numeric
    self.string_columns = list(set(self.input_columns) - set(numeric))

    if is_numeric_dtype(data_frame[self.output_column]):
        self.output_type = 'numeric'
    else:
        self.output_type = 'string'

    numeric_names = ", ".join(self.numeric_columns)
    logger.debug(
        "Assuming {} numeric input columns: {}".format(len(self.numeric_columns), numeric_names))
    string_names = ", ".join(self.string_columns)
    logger.debug(
        "Assuming {} string input columns: {}".format(len(self.string_columns), string_names))
Example #7
Source File: rfpimp.py From malss with MIT License | 6 votes |
def oob_dependences(rf, X_train, n_samples=5000):
    """
    Measure how well each numeric column of X_train is predicted by the rest.

    For every numeric column, rf is refit with that column as the target and
    all remaining columns as predictors, and the resulting rf.oob_score_ is
    stored as that column's dependence. Before fitting, X_train is passed
    through sample_rows(X_train, n_samples) (n_samples defaults to 5000).

    :return: DataFrame indexed by Feature with one Dependence column, sorted
             from the highest to the lowest dependence.
    """
    # Numeric columns are picked before sampling, exactly once.
    numeric_features = [name for name in X_train
                        if is_numeric_dtype(X_train[name])]
    X_train = sample_rows(X_train, n_samples)

    df_dep = pd.DataFrame(columns=['Feature', 'Dependence']).set_index('Feature')
    for target in numeric_features:
        predictors = X_train.drop(target, axis=1)
        response = X_train[target]
        rf.fit(predictors, response)
        df_dep.loc[target] = rf.oob_score_
    return df_dep.sort_values('Dependence', ascending=False)
Example #8
Source File: iterators.py From datawig with Apache License 2.0 | 5 votes |
def __init__(self,
             data_frame: pd.DataFrame,
             data_columns: List[ColumnEncoder],
             label_columns: List[ColumnEncoder],
             batch_size: int = 512) -> None:
    """Prepare ``data_frame`` and build the underlying mxnet iterator.

    Missing values are filled, the original index is remembered in
    ``self.indices``, padding rows (copies of the last row) are appended,
    any unfitted column encoder is fitted on the padded frame, and the
    iterator returned by ``self.mxnet_iterator_from_df`` is stored and reset.

    :raises ValueError: if ``data_frame`` is not a pandas DataFrame.
    """
    super(ImputerIterDf, self).__init__(data_columns, label_columns, batch_size)

    if not isinstance(data_frame, pd.core.frame.DataFrame):
        raise ValueError("Only pandas data frames are supported")

    # fill string nan with empty string, numerical nan with np.nan
    numeric_names = [name for name in data_frame.columns
                     if is_numeric_dtype(data_frame[name])]
    text_names = list(set(data_frame.columns) - set(numeric_names))
    data_frame = data_frame.fillna(value=dict.fromkeys(text_names, ""))
    data_frame = data_frame.fillna(value=dict.fromkeys(numeric_names, np.nan))

    # Remember the caller's index, then switch to a 0..n-1 range index.
    self.indices = data_frame.index.tolist()
    data_frame = data_frame.reset_index(drop=True)

    # custom padding for having to discard the last batch in mxnet for sparse data
    padding_n_rows = self._n_rows_padding(data_frame)
    self.start_padding_idx = int(data_frame.index.max() + 1)
    last_real_row = self.start_padding_idx - 1
    padding_stop = self.start_padding_idx + padding_n_rows
    for idx in range(self.start_padding_idx, padding_stop):
        data_frame.loc[idx, :] = data_frame.loc[last_real_row, :]

    # ensure that column encoder is fitted to data before applying it
    for column_encoder in self.data_columns + self.label_columns:
        if column_encoder.is_fitted():
            continue
        column_encoder.fit(data_frame)

    self.df_iterator = self.mxnet_iterator_from_df(data_frame)
    self.df_iterator.reset()
    self._provide_data = self.df_iterator.provide_data
    self._provide_label = self.df_iterator.provide_label
Example #9
Source File: woebin.py From scorecardpy with MIT License | 5 votes |
def woebin_adj_break_plot(dt, y, x_i, breaks, stop_limit, sv_i, method):
    '''
    Re-bin one variable with the given breaks, print them and show the plot.

    Params
    ------
    dt: data frame holding columns x_i and y
    y: name of the response column
    x_i: name of the variable being adjusted
    breaks: comma separated break points as a string; '' or None keeps
      the breaks chosen by woebin
    stop_limit, method: forwarded unchanged to woebin
    sv_i: special values for x_i, or None

    Returns
    ------
    the breaks passed in, or the breaks string produced by the binning
    when none were passed
    '''
    if breaks == '':
        breaks = None

    if breaks is None:
        breaks_list = None
    else:
        # NOTE(review): eval() executes whatever expression the string holds.
        # If `breaks` can come from an untrusted source this is a code
        # execution risk; consider ast.literal_eval after confirming no
        # caller relies on non-literal expressions.
        breaks_list = {x_i: eval('['+breaks+']')}
    special_values = {x_i: sv_i} if sv_i is not None else None

    # binx update
    bins_adj = woebin(dt[[x_i,y]], y, breaks_list=breaks_list, special_values=special_values, stop_limit = stop_limit, method=method)

    ## print adjust breaks
    current = set(bins_adj[x_i]['breaks']) - set(["-inf","inf","missing"])
    if is_numeric_dtype(dt[x_i]):
        breaks_bin = ', '.join(current)
    else:
        # Non-numeric breaks are wrapped in single quotes.
        breaks_bin = ', '.join(['\''+ i+'\'' for i in current])
    print(">>> Current breaks:")
    print(breaks_bin, '\n')

    # print bin_adj
    plt.show(woebin_plot(bins_adj))

    # return breaks
    if breaks == '' or breaks is None:
        breaks = breaks_bin
    return breaks
Example #10
Source File: woebin.py From scorecardpy with MIT License | 5 votes |
def bins_to_breaks(bins, dt, to_string=False, save_string=None):
    """Collect the break points of every variable in ``bins``.

    Rows whose break is "-inf", "inf" or "missing", and rows flagged as
    special values, are dropped. Breaks of variables that are not numeric in
    ``dt`` are wrapped in single quotes. The remaining breaks are joined with
    commas per variable.

    :param bins: binning DataFrame (or dict of DataFrames, which are
                 concatenated) with 'variable', 'breaks' and
                 'is_special_values' columns.
    :param dt: DataFrame used to look up each variable's dtype.
    :param to_string: return a ``breaks_list={...}`` source string instead of
                      a Series.
    :param save_string: when given together with ``to_string``, the string is
                        written to ``<save_string>_<timestamp>.py`` and
                        ``None`` is returned.
    :return: Series of comma-joined breaks indexed by variable, the formatted
             string, or ``None`` after saving to a file.
    """
    if isinstance(bins, dict):
        bins = pd.concat(bins, ignore_index=True)

    # x variables
    xs_all = bins['variable'].unique()
    # dtypes of variables
    vars_class = pd.DataFrame({
        'variable': xs_all,
        'not_numeric': [not is_numeric_dtype(dt[i]) for i in xs_all]
    })

    # breakslist of bins
    keep = ~bins['breaks'].isin(["-inf", "inf", "missing"]) & ~bins['is_special_values']
    bins_breakslist = bins[keep]
    bins_breakslist = pd.merge(bins_breakslist[['variable', 'breaks']], vars_class,
                               how='left', on='variable')
    quoted = bins_breakslist['not_numeric']
    bins_breakslist.loc[quoted, 'breaks'] = "'" + bins_breakslist.loc[quoted, 'breaks'] + "'"
    bins_breakslist = bins_breakslist.groupby('variable')['breaks'].agg(lambda x: ','.join(x))

    if to_string:
        # Iterate label/value pairs directly: integer `series[i]` on a
        # label-indexed Series relies on the deprecated positional fallback.
        entries = ("'" + name + "': [" + brk + "]"
                   for name, brk in bins_breakslist.items())
        bins_breakslist = "breaks_list={\n" + ', \n'.join(entries) + "}"
        if save_string is not None:
            brk_lst_name = '{}_{}.py'.format(
                save_string, time.strftime('%Y%m%d_%H%M%S', time.localtime(time.time())))
            with open(brk_lst_name, 'w') as f:
                f.write(bins_breakslist)
            print('[INFO] The breaks_list is saved as {}'.format(brk_lst_name))
            return None
    return bins_breakslist
Example #11
Source File: candlestick_finder.py From candlestick-patterns with MIT License | 5 votes |
def prepare_data(self, candles_df, ohlc):
    """Validate the candles frame and store a numeric copy on the instance.

    On success the four column names from ``ohlc`` (open, high, low, close,
    in that order) are stored on ``self``, ``self.data`` becomes a copy of
    ``candles_df`` with those four columns converted to numeric where
    needed, and ``self.is_data_prepared`` is set to True. ``candles_df``
    itself is not modified.

    :param candles_df: pandas DataFrame with at least
                       ``self.required_count`` rows.
    :param ohlc: sequence of four column names: open, high, low, close.
    :raises Exception: if ``candles_df`` is not a DataFrame, is too short,
                       ``ohlc`` does not have four elements, or any named
                       column is missing.
    """
    # Guard clauses, checked in the same order as the original nesting.
    if not isinstance(candles_df, pd.DataFrame):
        raise Exception('Candles must be in Panda data frame type')
    if len(candles_df) < self.required_count:
        raise Exception('{0} requires at least {1} data'.format(self.name,
                                                                 self.required_count))
    if not ohlc or len(ohlc) != 4:
        raise Exception('Provide list of four elements indicating columns in strings. '
                        'Default: [open, high, low, close]')
    if not set(ohlc).issubset(candles_df.columns):
        raise Exception('Provided columns does not exist in given data frame')

    self.open_column = ohlc[0]
    self.high_column = ohlc[1]
    self.low_column = ohlc[2]
    self.close_column = ohlc[3]

    self.data = candles_df.copy()
    # Convert every price column from the working copy; previously the high
    # column alone was read from the caller's frame.
    for column in (self.close_column, self.open_column,
                   self.low_column, self.high_column):
        if not is_numeric_dtype(self.data[column]):
            self.data[column] = pd.to_numeric(self.data[column])

    self.is_data_prepared = True
Example #12
Source File: _common.py From cooler with BSD 3-Clause "New" or "Revised" License | 5 votes |
def cooler_cmp(uri1, uri2):
    """Assert that two coolers hold matching core datasets.

    For each of the chroms/bins/pixels datasets listed below, the dtypes must
    agree (byte-string datasets only need to share kind "S") and the contents
    must match: ``np.allclose`` for numeric data, exact equality otherwise.
    """
    paths = (
        "chroms/name",
        "chroms/length",
        "bins/chrom",
        "bins/start",
        "bins/end",
        "pixels/bin1_id",
        "pixels/bin2_id",
        "pixels/count",
    )
    c1 = cooler.Cooler(uri1)
    c2 = cooler.Cooler(uri2)
    with c1.open("r") as f1, c2.open("r") as f2:
        for path in paths:
            dset1, dset2 = f1[path], f2[path]
            dtype1, dtype2 = dset1.dtype, dset2.dtype

            if dtype1.kind != "S":
                assert dtype1 == dtype2
            else:
                # Null padding of ascii arrays is not guaranteed to be
                # preserved so we only check the kind.
                assert dtype2.kind == "S"

            if is_numeric_dtype(dtype1):
                assert np.allclose(dset1[:], dset2[:])
            else:
                assert np.all(dset1[:] == dset2[:])
Example #13
Source File: test_datetime_features.py From timeserio with MIT License | 5 votes |
def test_featurizer_callable(df, featurizer):
    """Calling the featurizer directly adds every attribute as a numeric column."""
    transformed = featurizer(df)
    # The featurizer must expose at least one attribute.
    assert len(featurizer.attributes)
    for name in featurizer.attributes:
        assert name in transformed
        assert is_numeric_dtype(transformed[name])
Example #14
Source File: expedia_dataset_reader.py From cs-ranking with Apache License 2.0 | 5 votes |
def _filter_dataset(self): train = pd.read_csv(self.RAW_DATASET_FILE) for col in train.columns.values: if "id" in col and col != "srch_id": del train[col] elif is_numeric_dtype(train[col]): arr = np.array(train[col]) fraction = np.isnan(arr).sum() / len(arr) if fraction > 0.0: self.logger.info( "########################################################################" ) self.logger.info( "Missing values {}: {}".format( col, np.isnan(arr).sum() / len(arr) ) ) self.logger.info( "Min {}: Max {}".format(np.nanmin(arr), np.nanmax(arr)) ) if fraction > 0.50: del train[col] else: train.loc[train[col].isnull(), col] = np.nanmin(arr) - 1 else: del train[col] self.train_df = train
Example #15
Source File: test_datetime_features.py From timeserio with MIT License | 5 votes |
def test_featurizer(df, featurizer):
    """``transform`` adds every featurizer attribute as a numeric column."""
    transformed = featurizer.transform(df)
    # The featurizer must expose at least one attribute.
    assert len(featurizer.attributes)
    for name in featurizer.attributes:
        assert name in transformed
        assert is_numeric_dtype(transformed[name])
Example #16
Source File: filter.py From cjworkbench with GNU Affero General Public License v3.0 | 5 votes |
def series_to_datetime(series):
    """
    Return ``series`` parsed as UTC datetimes, or raise UserVisibleError.

    Numeric series are rejected outright; any ValueError from parsing is
    reported to the user as UserVisibleError.

    TODO [adamhooper, 2018-12-19] nix this and quick-fix coltypes
    """
    try:
        if not is_numeric_dtype(series):
            return pd.to_datetime(series, utc=True)
        # numeric columns, just... no. Never really want to interpret as
        # seconds since 1970
        raise ValueError("Refusing to convert numbers to dates")
    except ValueError:
        raise UserVisibleError("Column is not dates. Please convert to dates.")
Example #17
Source File: types.py From cjworkbench with GNU Affero General Public License v3.0 | 5 votes |
def format_series(self, series: pd.Series) -> pd.Series:
    """Format every non-NA value of ``series`` with ``self._formatter.format``.

    NA entries are left untouched. A result that pandas still reports as
    numeric is cast to ``object`` dtype.
    """
    formatted = series.map(self._formatter.format, na_action="ignore")
    # Pandas will still think all-NA is number.
    return formatted.astype(object) if is_numeric_dtype(formatted) else formatted

# override
Example #18
Source File: types.py From cjworkbench with GNU Affero General Public License v3.0 | 5 votes |
def class_from_dtype(dtype) -> type:
    """
    Map a pandas/numpy `dtype` to its ColumnType member.

    Numeric dtypes map to NUMBER, datetime64 to DATETIME, and object or
    "category" to TEXT. Any other dtype raises ValueError.
    """
    if is_numeric_dtype(dtype):
        return ColumnType.NUMBER
    if is_datetime64_dtype(dtype):
        return ColumnType.DATETIME
    if dtype == object or dtype == "category":
        return ColumnType.TEXT
    raise ValueError(f"Unknown dtype: {dtype}")
Example #19
Source File: errors.py From autoimpute with MIT License | 5 votes |
def _not_num_series(m, s): """Private method to detect columns of Matrix that are not categorical.""" if not is_numeric_dtype(s): t = s.dtype err = f"{m} not appropriate for Series {s.name} of type {t}." raise TypeError(err)
Example #20
Source File: rfpimp.py From malss with MIT License | 5 votes |
def feature_dependence_matrix(rf, X_train, n_samples=5000):
    """
    Build a matrix of how much each numeric column depends on the others.

    For every numeric column of X_train, rf is refit with that column as the
    target and all remaining columns as predictors. The row labelled with
    that column then holds rf.oob_score_ under 'Dependence', followed by the
    permutation importances of the predictors, with 1.0 in the column's own
    slot. Rows of non-numeric columns are left as NaN. X_train is first
    passed through sample_rows(X_train, n_samples).

    :return: a non-symmetric data frame, indexed by the columns of X_train,
             with columns 'Dependence' plus every column of X_train.
    """
    numcols = [col for col in X_train if is_numeric_dtype(X_train[col])]
    X_train = sample_rows(X_train, n_samples)
    all_columns = X_train.columns.tolist()
    df_dep = pd.DataFrame(index=X_train.columns,
                          columns=['Dependence'] + all_columns)
    for col in numcols:
        X, y = X_train.drop(col, axis=1), X_train[col]
        rf.fit(X, y)
        imp = permutation_importances_raw(rf, X, y, oob_regression_r2_score, n_samples)
        # Place the self-importance and the row by the column's position in
        # X_train, not by its position among the numeric columns; the two
        # differ as soon as a non-numeric column precedes a numeric one.
        imp = np.insert(imp, all_columns.index(col), 1.0)
        df_dep.loc[col] = np.insert(imp, 0, rf.oob_score_)  # add overall dependence
    return df_dep
Example #21
Source File: rfpimp.py From malss with MIT License | 5 votes |
def feature_dependence_matrix(rf, X_train, n_samples=5000):
    """
    Build a matrix of how much each numeric column depends on the others.

    For every numeric column of X_train, rf is refit with that column as the
    target and all remaining columns as predictors. The row labelled with
    that column then holds rf.oob_score_ under 'Dependence', followed by the
    permutation importances of the predictors, with 1.0 in the column's own
    slot. Rows of non-numeric columns are left as NaN. X_train is first
    passed through sample_rows(X_train, n_samples).

    :return: a non-symmetric data frame, indexed by the columns of X_train,
             with columns 'Dependence' plus every column of X_train.
    """
    numcols = [col for col in X_train if is_numeric_dtype(X_train[col])]
    X_train = sample_rows(X_train, n_samples)
    all_columns = X_train.columns.tolist()
    df_dep = pd.DataFrame(index=X_train.columns,
                          columns=['Dependence'] + all_columns)
    for col in numcols:
        X, y = X_train.drop(col, axis=1), X_train[col]
        rf.fit(X, y)
        imp = permutation_importances_raw(rf, X, y, oob_regression_r2_score, n_samples)
        # Place the self-importance and the row by the column's position in
        # X_train, not by its position among the numeric columns; the two
        # differ as soon as a non-numeric column precedes a numeric one.
        imp = np.insert(imp, all_columns.index(col), 1.0)
        df_dep.loc[col] = np.insert(imp, 0, rf.oob_score_)  # add overall dependence
    return df_dep
Example #22
Source File: pandas.py From vnpy_crypto with MIT License | 5 votes |
def is_numeric_dtype(arr_or_dtype):
    """Return True when the argument's ``dtype.type`` is a numeric or bool type.

    Objects without a ``.dtype.type`` attribute are reported as non-numeric,
    as are datetime64 and timedelta64 types.
    """
    # Crude implementation only suitable for array-like types
    try:
        scalar_type = arr_or_dtype.dtype.type
    except AttributeError:
        return False
    if not issubclass(scalar_type, (np.number, np.bool_)):
        return False
    return not issubclass(scalar_type, (np.datetime64, np.timedelta64))
Example #23
Source File: validation.py From PandasSchema with GNU General Public License v3.0 | 5 votes |
def get_errors(self, series: pd.Series, column: 'column.Column'):
    """Return a ValidationWarning for every entry of ``series`` that fails.

    An entry fails when ``self.validate`` rejects it. If the column allows
    empty values, empty entries (nulls for categorical/numeric series,
    zero-length or null strings otherwise) are never reported.

    :param series: the data to validate.
    :param column: column definition; only ``allow_empty`` is read here.
    :return: list of ValidationWarning, one per failing entry, carrying the
             value, its index label as ``row`` and the series name.
    """
    # Calculate which entries are invalid using the child class's validate
    # function.
    failed = ~self.validate(series)

    if column.allow_empty:
        # Failing results are those that are not empty, and fail the validation
        # explicitly check to make sure the series isn't a category because issubdtype will FAIL if it is
        # (isinstance on the dtype replaces the deprecated is_categorical_dtype)
        if isinstance(series.dtype, pd.CategoricalDtype) or is_numeric_dtype(series):
            not_empty = ~series.isnull()
        else:
            not_empty = series.str.len() > 0
        failed = not_empty & failed

    # Cut down the original series to only ones that failed the validation
    indices = series.index[failed]

    # Use these indices to find the failing items. Also report the index
    # which is probably a row number
    errors = []
    for i in indices:
        errors.append(ValidationWarning(
            message=self.message,
            value=series[i],
            row=i,
            column=series.name
        ))
    return errors
Example #24
Source File: default.py From autoimpute with MIT License | 5 votes |
def fit(self, X, y):
    """Fit the Imputer to the dataset and determine the right approach.

    The dtype of ``X`` (when ``y`` is None) or of ``y`` (otherwise) selects
    the imputer: numeric data uses ``self.num_imputer``, string data uses
    ``self.cat_imputer``. The chosen imputer's fit result and strategy are
    stored in ``self.statistics_``; both stay None if neither dtype matches.

    Args:
        X (pd.Series): Dataset to fit the imputer, or predictors
        y (pd.Series): None, or dataset to fit predictors

    Returns:
        self. Instance of the class.
    """
    # Univariate fitting inspects X itself; predictive fitting inspects y.
    reference = X if y is None else y

    # start off with stats blank
    stats = {"param": None, "strategy": None}
    if is_numeric_dtype(reference):
        stats = {"param": self.num_imputer.fit(X, y),
                 "strategy": self.num_imputer.strategy}
    if is_string_dtype(reference):
        stats = {"param": self.cat_imputer.fit(X, y),
                 "strategy": self.cat_imputer.strategy}

    # return final stats
    self.statistics_ = stats
    return self
Example #25
Source File: converter.py From hvplot with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _process_chart_y(self, data, x, y, single_y): """This should happen after _process_chart_x""" y = y or self.y if y is None: ys = [c for c in data.columns if c not in [x]+self.by+self.groupby+self.grid] if len(ys) > 1: # if columns have different dtypes, only include numeric columns from pandas.api.types import is_numeric_dtype as isnum num_ys = [dim for dim in ys if isnum(data[dim])] if len(num_ys) >= 1: ys = num_ys y = ys[0] if len(ys) == 1 or single_y else ys return y
Example #26
Source File: density.py From plotnine with GNU General Public License v2.0 | 5 votes |
def get_var_type(col):
    """
    Return var_type (for KDEMultivariate) of the column

    Parameters
    ----------
    col : pandas.Series
        A dataframe column.

    Returns
    -------
    out : str
        One of ['c', 'o', 'u']: 'c' for numeric columns, 'o' for ordered
        categoricals and 'u' for unordered categoricals and anything else.

    See Also
    --------
    The origin of the character codes is
    :class:`statsmodels.nonparametric.kernel_density.KDEMultivariate`.
    """
    if pdtypes.is_numeric_dtype(col):
        # continuous
        return 'c'
    # isinstance on the dtype replaces the deprecated is_categorical_dtype
    if isinstance(col.dtype, pdtypes.CategoricalDtype):
        # ordered or unordered
        return 'o' if col.cat.ordered else 'u'
    # unordered if unsure, e.g string columns that
    # are not categorical
    return 'u'
Example #27
Source File: helpers.py From anndata with BSD 3-Clause "New" or "Revised" License | 5 votes |
def assert_equal_ndarray(a, b, exact=False, elem_name=None):
    """Assert that ``a`` and ``asarray(b)`` are equal.

    With ``exact=False``, numeric inputs must share a shape and are compared
    with ``np.allclose`` (NaNs compare equal), and inputs whose dtypes have
    fields are compared as DataFrames via ``assert_equal``. Everything else,
    including all ``exact=True`` calls, is compared elementwise with ``==``.
    """
    b = asarray(b)

    if exact:
        assert np.all(a == b), format_msg(elem_name)
        return

    if is_numeric_dtype(a) and is_numeric_dtype(b):
        assert a.shape == b.shape, format_msg(elem_name)
        assert np.allclose(a, b, equal_nan=True), format_msg(elem_name)
        return

    # Structured dtype
    is_structured = (
        hasattr(a, "dtype")
        and hasattr(b, "dtype")
        and len(a.dtype) > 1
        and len(b.dtype) > 0
    )
    if is_structured:
        assert_equal(pd.DataFrame(a), pd.DataFrame(b), exact, elem_name)
    else:
        assert np.all(a == b), format_msg(elem_name)
Example #28
Source File: _sklearn.py From qlik-py-tools with MIT License | 5 votes |
def _add_lags(self, X, y=None, extrapolate=1, update_features_df=False): """ Add lag observations to X. If y is available and self.model.lag_target is True, the previous target will become an additional feature in X. Feature definitions for the model will be updated accordingly. """ # Add lag target to the features if required # This will create an additional feature for each sample i.e. the previous value of y if y is not None and self.model.lag_target: X["previous_y"] = y.shift(1) if update_features_df: # Check the target's data type dt = 'float' if is_numeric_dtype(y.iloc[:,0]) else 'str' # Set the preprocessing feature strategy for the lag targets if self.model.estimator_type == 'classifier': fs = 'one hot encoding' elif self.model.scale_lag_target and not self.model.scale_target: fs = 'scaling' else: fs = 'none' self.model.scale_lag_target # Update feature definitions for the model self.model.features_df.loc['previous_y'] = [self.model.name, 'previous_y', 'feature', dt, fs, ''] if self.model.lags: # Add the lag observations X = utils.add_lags(X, lag=self.model.lags, extrapolate=extrapolate, dropna=True, suffix="t") if update_features_df: # Duplicate the feature definitions by the number of lags self.model.features_df = pd.concat([self.model.features_df] * (self.model.lags+extrapolate)) # Set the new feature names as the index of the feature definitions data frame self.model.features_df['name'] = X.columns self.model.features_df = self.model.features_df.set_index('name', drop=True) if self.model.debug: self._print_log(11, data=X) return X
Example #29
Source File: environment.py From hyperparameter_hunter with MIT License | 5 votes |
def save_transformed_metrics(self, value):
    """Store ``value`` in ``self._save_transformed_metrics``, inferring it when None.

    When ``value`` is None it becomes False if every column named in
    ``self.target_column`` is numeric in ``self.train_dataset``, else True.
    """
    if value is None:
        # If all target columns are numeric, assume metric evaluation should use the
        # original/inverted targets and predictions (False). If any target column is
        # non-numeric, assume evaluation should use the transformed targets and
        # predictions because most metrics require numeric inputs (True).
        targets_numeric = all(
            is_numeric_dtype(self.train_dataset[col]) for col in self.target_column
        )
        value = not targets_numeric
    self._save_transformed_metrics = value

##################################################
# Core Methods
##################################################
Example #30
Source File: pandas.py From Splunking-Crime with GNU Affero General Public License v3.0 | 5 votes |
def is_numeric_dtype(arr_or_dtype):
    """Return True when the argument's ``dtype.type`` is a numeric or bool type.

    Objects without a ``.dtype.type`` attribute are reported as non-numeric,
    as are datetime64 and timedelta64 types.
    """
    # Crude implementation only suitable for array-like types
    try:
        scalar_type = arr_or_dtype.dtype.type
    except AttributeError:
        return False
    if not issubclass(scalar_type, (np.number, np.bool_)):
        return False
    return not issubclass(scalar_type, (np.datetime64, np.timedelta64))