Python pandas.api.types.is_numeric_dtype() Examples

The following are 30 code examples of pandas.api.types.is_numeric_dtype(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pandas.api.types, or try the search function.
Example #1
Source File: rfpimp.py    From malss with MIT License 6 votes vote down vote up
def oob_dependences(rf, X_train, n_samples=5000):
    """
    Score how well every numeric column of X_train can be predicted from
    the remaining columns.

    The numeric columns are identified on the full frame, the frame is then
    passed through sample_rows(X_train, n_samples), and rf is refitted once
    per numeric column with that column as the target.  The value of
    rf.oob_score_ after each fit is recorded as the column's dependence.

    :param rf: model exposing fit(X, y) and oob_score_.
    :param X_train: dataframe of independent variables.
    :param n_samples: forwarded to sample_rows (default 5000).
    :return: DataFrame indexed by Feature with a single Dependence column,
             sorted from highest to lowest dependence.
    """
    numeric_features = [name for name in X_train if is_numeric_dtype(X_train[name])]

    X_train = sample_rows(X_train, n_samples)

    df_dep = pd.DataFrame(columns=['Feature', 'Dependence']).set_index('Feature')
    for target in numeric_features:
        # Every other column (numeric or not) is used as a predictor.
        predictors = X_train.drop(target, axis=1)
        response = X_train[target]
        rf.fit(predictors, response)
        df_dep.loc[target] = rf.oob_score_
    return df_dep.sort_values('Dependence', ascending=False)
Example #2
Source File: visual_utils.py    From ray with Apache License 2.0 6 votes vote down vote up
def generate_plotly_dim_dict(df, field):
    """Build the dimension dict describing column `field` of `df`.

    Numeric columns are passed through unchanged under "values".  String
    columns are encoded as integer codes (position of each value in
    ``column.unique()``), with "tickvals"/"ticktext" mapping the codes back
    to the original texts.

    Args:
        df: dataframe holding the column.
        field: column name; also used as the dimension "label".

    Returns:
        dict with "label" and "values", plus "tickvals" and "ticktext" for
        string columns.

    Raises:
        Exception: if the column is neither numeric nor string typed.
    """
    column = df[field]
    dim_dict = {"label": field}
    if is_numeric_dtype(column):
        dim_dict["values"] = column
    elif is_string_dtype(column):
        texts = column.unique()
        # One dict lookup per row instead of scanning `texts` for every row;
        # codes follow first-occurrence order, exactly like the scan did.
        code_of = {text: code for code, text in enumerate(texts)}
        dim_dict["values"] = [code_of[x] for x in column]
        dim_dict["tickvals"] = list(range(len(texts)))
        dim_dict["ticktext"] = texts
    else:
        raise Exception("Unidentifiable Type")

    return dim_dict
Example #3
Source File: evaluate.py    From toad with MIT License 6 votes vote down vote up
def merger_data(data, var, unique_num,is_merge_high=True):
    """Bin a high-cardinality numeric column, keeping sentinel rows apart.

    If `data[var]` is numeric and has more than `unique_num` distinct
    values, rows equal to the sentinel -9999999 are set aside, the remaining
    values are optionally clipped with ``toad.utils.clip`` (upper quantile
    0.99) and binned with ``toad.merge(method='step', n_bins=unique_num)``.
    The sentinel rows are then appended after the binned rows.

    :param data: dataframe containing `var` and a 'target' column.
    :param var: name of the column to bin.
    :param unique_num: cardinality threshold, also used as the bin count.
    :param is_merge_high: clip the non-sentinel values before binning.
    :return: (binned series, reordered target series, split points), or
             (data[var], None, None) when the column is left untouched.
    """
    missing_marker = -9999999  # sentinel value kept out of the binning

    if not (is_numeric_dtype(data[var]) and data[var].nunique() > unique_num):
        return data[var], None, None

    data_miss = data[data[var] == missing_marker]
    data_nomiss = data[data[var] != missing_marker]
    merge_high_data = data_nomiss[var]
    if is_merge_high:
        merge_high_data = toad.utils.clip(data_nomiss[var], quantile=(None, .99))
    data_index, bins = toad.merge(merge_high_data, method='step', return_splits=True, n_bins=unique_num)
    temp = pd.DataFrame(data_index, columns=[var])
    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # equivalent and also works on older versions.
    temp = pd.concat([temp, data_miss[[var]]], ignore_index=True)[var]
    target = pd.concat([data_nomiss, data_miss], ignore_index=True)['target']
    return temp, target, bins


# Build a pivot table for each pair of indicators
Example #4
Source File: condition_fun.py    From scorecardpy with MIT License 6 votes vote down vote up
def check_cateCols_uniqueValues(dat, var_skip = None):
    """Warn about non-numeric columns that have 50 or more unique values.

    When such columns exist, their names are printed and the user is asked
    on stdin whether to continue.  Answering 2 stops the program.

    :param dat: dataframe to inspect.
    :param var_skip: column name(s) to leave out of the check; converted
                     with str_to_list.  None means no column is skipped.
    :return: None.
    :raises SystemExit: with code 0 if the user chooses not to continue.
    """
    # character columns with too many unique values
    char_cols = [i for i in list(dat) if not is_numeric_dtype(dat[i])]
    if var_skip is not None:
        skipped = set(str_to_list(var_skip))
        # filter in place of a set difference so the column order is kept
        char_cols = [i for i in char_cols if i not in skipped]
    char_cols_too_many_unique = [i for i in char_cols if len(dat[i].unique()) >= 50]
    if char_cols_too_many_unique:
        print('>>> There are {} variables have too many unique non-numberic values, which might cause the binning process slow. Please double check the following variables: \n{}'.format(len(char_cols_too_many_unique), ', '.join(map(str, char_cols_too_many_unique))))
        print('>>> Continue the binning process?')
        print('1: yes \n2: no')
        cont = None
        while cont not in (1, 2):
            try:
                cont = int(input("Selection: "))
            except ValueError:
                # non-integer answer: ask again instead of crashing
                cont = None
        if cont == 2:
            raise SystemExit(0)
    return None


# replace blank by NA
#' @import data.table
#' 
Example #5
Source File: __init__.py    From pandas-summary with MIT License 6 votes vote down vote up
def _get_columns_info(self, stats):
        """Group the columns of self.df by detected column type.

        Columns are assigned in this order: constant (1 unique value),
        bool (2 unique values), numeric, date (datetime64), then the rest
        is split into unique (unique count equals value count) and
        categorical.  After each step the already assigned columns are
        handed to self.get_columns with self.EXCLUDE.
        NOTE(review): presumably that returns the remaining column names;
        confirm against get_columns.

        :param stats: mapping/frame providing 'uniques' and 'counts',
                      both indexed by column name.
        :return: dict mapping each self.TYPE_* key to an Index of columns.
        """
        uniques = stats['uniques']
        column_info = {}
        column_info[self.TYPE_CONSTANT] = uniques[uniques == 1].index
        column_info[self.TYPE_BOOL] = uniques[uniques == 2].index
        # Read back with the same TYPE_* constants used for storing, rather
        # than repeating their values as string literals.
        rest_columns = self.get_columns(
            self.df,
            self.EXCLUDE,
            column_info[self.TYPE_CONSTANT].union(column_info[self.TYPE_BOOL]))
        column_info[self.TYPE_NUMERIC] = pd.Index([c for c in rest_columns
                                                   if types.is_numeric_dtype(self.df[c])])
        rest_columns = self.get_columns(
            self.df[rest_columns], self.EXCLUDE, column_info[self.TYPE_NUMERIC])
        column_info[self.TYPE_DATE] = pd.Index([c for c in rest_columns
                                                if types.is_datetime64_dtype(self.df[c])])
        rest_columns = self.get_columns(
            self.df[rest_columns], self.EXCLUDE, column_info[self.TYPE_DATE])
        rest_uniques = uniques[rest_columns]
        unique_columns = rest_uniques == stats['counts'][rest_columns]
        column_info[self.TYPE_UNIQUE] = rest_uniques[unique_columns].index
        column_info[self.TYPE_CATEGORICAL] = rest_uniques[~unique_columns].index
        return column_info
Example #6
Source File: simple_imputer.py    From datawig with Apache License 2.0 6 votes vote down vote up
def check_data_types(self, data_frame: pd.DataFrame) -> None:
        """
        Classify the input columns and the output column as numeric or string.

        Sets self.numeric_columns, self.string_columns and self.output_type
        ('numeric' or 'string') from the dtypes found in `data_frame`.
        Both column lists follow the order of self.input_columns.

        :param data_frame: frame containing self.input_columns and
                           self.output_column.
        :return: None
        """
        self.numeric_columns = [c for c in self.input_columns if is_numeric_dtype(data_frame[c])]
        numeric = set(self.numeric_columns)
        # dict.fromkeys drops duplicates but keeps input order; the former
        # set difference returned the names in arbitrary order.
        self.string_columns = [c for c in dict.fromkeys(self.input_columns) if c not in numeric]
        self.output_type = 'numeric' if is_numeric_dtype(data_frame[self.output_column]) else 'string'

        logger.debug("Assuming %d numeric input columns: %s",
                     len(self.numeric_columns),
                     ", ".join(map(str, self.numeric_columns)))
        logger.debug("Assuming %d string input columns: %s",
                     len(self.string_columns),
                     ", ".join(map(str, self.string_columns)))
Example #7
Source File: rfpimp.py    From malss with MIT License 6 votes vote down vote up
def oob_dependences(rf, X_train, n_samples=5000):
    """
    Estimate, for each numeric column of X_train, how predictable it is
    from all the other columns.

    Numeric columns are picked from the full frame first; the frame is then
    reduced via sample_rows(X_train, n_samples).  For each numeric column
    rf is refitted with that column as target, and rf.oob_score_ is stored
    as its dependence value.

    :param rf: model exposing fit(X, y) and oob_score_.
    :param X_train: dataframe of independent variables.
    :param n_samples: forwarded to sample_rows (default 5000).
    :return: DataFrame with Feature as index and a Dependence column,
             ordered by decreasing dependence.
    """
    targets = [column for column in X_train if is_numeric_dtype(X_train[column])]

    X_train = sample_rows(X_train, n_samples)

    scores = pd.DataFrame(columns=['Feature', 'Dependence'])
    scores = scores.set_index('Feature')
    for target in targets:
        others = X_train.drop(target, axis=1)
        rf.fit(others, X_train[target])
        scores.loc[target] = rf.oob_score_
    ranked = scores.sort_values('Dependence', ascending=False)
    return ranked
Example #8
Source File: iterators.py    From datawig with Apache License 2.0 5 votes vote down vote up
def __init__(self,
                 data_frame: pd.DataFrame,
                 data_columns: List[ColumnEncoder],
                 label_columns: List[ColumnEncoder],
                 batch_size: int = 512) -> None:
        """
        Prepare `data_frame` and wrap it in the iterator produced by
        self.mxnet_iterator_from_df.

        Missing values are filled, the original index is remembered in
        self.indices and replaced by a fresh one, padding rows are appended,
        and every column encoder that is not fitted yet is fitted on the
        padded frame.

        :param data_frame: pandas data frame to iterate over.
        :param data_columns: encoders for the input columns.
        :param label_columns: encoders for the label columns.
        :param batch_size: forwarded to the parent constructor.
        :raises ValueError: if `data_frame` is not a pandas DataFrame.
        """
        super(ImputerIterDf, self).__init__(data_columns, label_columns, batch_size)

        if not isinstance(data_frame, pd.core.frame.DataFrame):
            raise ValueError("Only pandas data frames are supported")

        # missing strings become "", missing numbers become np.nan
        numeric_names = [name for name in data_frame.columns if is_numeric_dtype(data_frame[name])]
        text_names = list(set(data_frame.columns) - set(numeric_names))
        data_frame = data_frame.fillna(value=dict.fromkeys(text_names, ""))
        data_frame = data_frame.fillna(value=dict.fromkeys(numeric_names, np.nan))

        self.indices = data_frame.index.tolist()
        data_frame = data_frame.reset_index(drop=True)

        # custom padding for having to discard the last batch in mxnet for sparse data
        padding_n_rows = self._n_rows_padding(data_frame)
        self.start_padding_idx = int(data_frame.index.max() + 1)
        padding_end = self.start_padding_idx + padding_n_rows
        for pad_idx in range(self.start_padding_idx, padding_end):
            # each padding row repeats the last real row
            data_frame.loc[pad_idx, :] = data_frame.loc[self.start_padding_idx - 1, :]

        # make sure every encoder has been fitted before it is applied
        for encoder in self.data_columns + self.label_columns:
            if encoder.is_fitted():
                continue
            encoder.fit(data_frame)

        self.df_iterator = self.mxnet_iterator_from_df(data_frame)
        self.df_iterator.reset()
        self._provide_data = self.df_iterator.provide_data
        self._provide_label = self.df_iterator.provide_label
Example #9
Source File: woebin.py    From scorecardpy with MIT License 5 votes vote down vote up
def woebin_adj_break_plot(dt, y, x_i, breaks, stop_limit, sv_i, method):
    '''
    Re-bin one variable with the given breaks, print the resulting breaks
    and show the binning plot.

    Params
    ------
    dt: dataframe holding columns x_i and y
    y: name of the target column
    x_i: name of the variable being adjusted
    breaks: comma separated break points as a string; '' or None lets
      woebin choose the breaks
    stop_limit, method: forwarded to woebin
    sv_i: special values for x_i, or None

    Returns
    ------
    The `breaks` argument if it was given, otherwise the breaks produced by
    woebin joined into a string (quoted when x_i is not numeric).
    '''
    if breaks == '':
        breaks = None

    if breaks is None:
        breaks_list = None
    else:
        # NOTE(review): eval runs the break string as Python code; only
        # safe while `breaks` comes from a trusted, interactive user.
        breaks_list = {x_i: eval('['+breaks+']')}
    special_values = {x_i: sv_i} if sv_i is not None else None

    # binx update
    bins_adj = woebin(dt[[x_i,y]], y, breaks_list=breaks_list, special_values=special_values, stop_limit = stop_limit, method=method)

    ## print adjust breaks
    breaks_bin = set(bins_adj[x_i]['breaks']) - set(["-inf","inf","missing"])
    if is_numeric_dtype(dt[x_i]):
        breaks_bin = ', '.join(breaks_bin)
    else:
        breaks_bin = ', '.join(['\''+ i+'\'' for i in breaks_bin])
    print(">>> Current breaks:")
    print(breaks_bin, '\n')
    # print bin_adj
    plt.show(woebin_plot(bins_adj))
    # an empty `breaks` was already normalised to None above
    if breaks is None:
        return breaks_bin
    return breaks
Example #10
Source File: woebin.py    From scorecardpy with MIT License 5 votes vote down vote up
def bins_to_breaks(bins, dt, to_string=False, save_string=None):
    """Collect the break points of each variable from a binning result.

    Rows whose break is "-inf", "inf" or "missing", and rows flagged as
    special values, are ignored.  Breaks of non-numeric variables (judged
    by the dtype of the matching column in `dt`) are wrapped in single
    quotes.

    :param bins: binning dataframe with 'variable', 'breaks' and
                 'is_special_values' columns, or a dict of such frames.
    :param dt: dataframe holding the original variables.
    :param to_string: return a ``breaks_list={...}`` source string instead
                      of a Series.
    :param save_string: when given together with to_string, write the
                        string to '<save_string>_<timestamp>.py'.
    :return: Series of comma joined breaks indexed by variable; the
             formatted string when to_string is set; None when the string
             was written to a file.
    """
    if isinstance(bins, dict):
        bins = pd.concat(bins, ignore_index=True)

    # x variables
    xs_all = bins['variable'].unique()
    # dtypes of  variables
    vars_class = pd.DataFrame({
      'variable': xs_all,
      'not_numeric': [not is_numeric_dtype(dt[i]) for i in xs_all]
    })

    # breakslist of bins
    bins_breakslist = bins[~bins['breaks'].isin(["-inf","inf","missing"]) & ~bins['is_special_values']]
    bins_breakslist = pd.merge(bins_breakslist[['variable', 'breaks']], vars_class, how='left', on='variable')
    quoted = bins_breakslist['not_numeric']
    bins_breakslist.loc[quoted, 'breaks'] = '\''+bins_breakslist.loc[quoted, 'breaks']+'\''
    bins_breakslist = bins_breakslist.groupby('variable')['breaks'].agg(lambda x: ','.join(x))

    if to_string:
        # Iterate (variable, breaks) pairs directly: integer lookups on this
        # string-indexed Series rely on a deprecated positional fallback.
        entries = ["'{}': [{}]".format(variable, breaks)
                   for variable, breaks in bins_breakslist.items()]
        bins_breakslist = "breaks_list={\n" + ', \n'.join(entries) + "}"
        if save_string is not None:
            brk_lst_name = '{}_{}.py'.format(save_string, time.strftime('%Y%m%d_%H%M%S'))
            with open(brk_lst_name, 'w') as f:
                f.write(bins_breakslist)
            print('[INFO] The breaks_list is saved as {}'.format(brk_lst_name))
            return
    return bins_breakslist
Example #11
Source File: candlestick_finder.py    From candlestick-patterns with MIT License 5 votes vote down vote up
def prepare_data(self, candles_df, ohlc):
        """Validate the candle data and store a numeric copy on the instance.

        On success self.open_column, self.high_column, self.low_column and
        self.close_column hold the names from `ohlc`, self.data holds a
        copy of `candles_df` whose four price columns are numeric, and
        self.is_data_prepared is True.  `candles_df` itself is not changed.

        :param candles_df: pandas DataFrame with at least
                           self.required_count rows.
        :param ohlc: four column names in open, high, low, close order.
        :raises Exception: if `candles_df` is not a DataFrame, has too few
                           rows, or `ohlc` is not four existing columns.
        """
        if not isinstance(candles_df, pd.DataFrame):
            raise Exception('Candles must be in Panda data frame type')

        if len(candles_df) < self.required_count:
            raise Exception('{0} requires at least {1} data'.format(self.name,
                                                                    self.required_count))

        if not ohlc or len(ohlc) != 4:
            raise Exception('Provide list of four elements indicating columns in strings. '
                            'Default: [open, high, low, close]')

        if not set(ohlc).issubset(candles_df.columns):
            raise Exception('Provided columns does not exist in given data frame')

        self.open_column = ohlc[0]
        self.high_column = ohlc[1]
        self.low_column = ohlc[2]
        self.close_column = ohlc[3]

        self.data = candles_df.copy()

        # Always convert from the working copy (the high column used to be
        # read from the caller's frame instead).
        for column in (self.close_column, self.open_column,
                       self.low_column, self.high_column):
            if not is_numeric_dtype(self.data[column]):
                self.data[column] = pd.to_numeric(self.data[column])

        self.is_data_prepared = True
Example #12
Source File: _common.py    From cooler with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def cooler_cmp(uri1, uri2):
    """Assert that two cooler files hold the same chroms, bins and pixels.

    For each compared dataset the dtypes must match (byte-string datasets
    only need to agree on kind "S"); numeric data is compared with
    np.allclose, everything else element by element.
    """
    compared_paths = (
        "chroms/name",
        "chroms/length",
        "bins/chrom",
        "bins/start",
        "bins/end",
        "pixels/bin1_id",
        "pixels/bin2_id",
        "pixels/count",
    )
    first, second = cooler.Cooler(uri1), cooler.Cooler(uri2)
    with first.open("r") as f1, second.open("r") as f2:
        for path in compared_paths:
            dset1 = f1[path]
            dset2 = f2[path]
            dtype1, dtype2 = dset1.dtype, dset2.dtype

            if dtype1.kind != "S":
                assert dtype1 == dtype2
            else:
                # Null padding of ascii arrays is not guaranteed to be
                # preserved so we only check the kind.
                assert dtype2.kind == "S"

            if not is_numeric_dtype(dtype1):
                assert np.all(dset1[:] == dset2[:])
            else:
                assert np.allclose(dset1[:], dset2[:])
Example #13
Source File: test_datetime_features.py    From timeserio with MIT License 5 votes vote down vote up
def test_featurizer_callable(df, featurizer):
    """Calling the featurizer adds every listed attribute as a numeric column."""
    transformed = featurizer(df)
    assert len(featurizer.attributes)
    for name in featurizer.attributes:
        assert name in transformed
        assert is_numeric_dtype(transformed[name])
Example #14
Source File: expedia_dataset_reader.py    From cs-ranking with Apache License 2.0 5 votes vote down vote up
def _filter_dataset(self):
        """Load the raw CSV and keep only usable numeric columns.

        Dropped: columns whose name contains "id" (except "srch_id"),
        non-numeric columns, and numeric columns with more than half of
        their values missing.  In the remaining columns, missing values are
        replaced by (column minimum - 1).  The result is stored in
        self.train_df.
        """
        train = pd.read_csv(self.RAW_DATASET_FILE)
        for col in train.columns.values:
            is_extra_id = "id" in col and col != "srch_id"
            if is_extra_id or not is_numeric_dtype(train[col]):
                del train[col]
                continue

            arr = np.array(train[col])
            fraction = np.isnan(arr).sum() / len(arr)
            if fraction > 0.0:
                self.logger.info(
                    "########################################################################"
                )
                self.logger.info("Missing values {}: {}".format(col, fraction))
                self.logger.info(
                    "Min {}: Max {}".format(np.nanmin(arr), np.nanmax(arr))
                )
                if fraction > 0.50:
                    del train[col]
                else:
                    # fill gaps with a value just below the observed minimum
                    train.loc[train[col].isnull(), col] = np.nanmin(arr) - 1
        self.train_df = train
Example #15
Source File: test_datetime_features.py    From timeserio with MIT License 5 votes vote down vote up
def test_featurizer(df, featurizer):
    """transform() adds every listed attribute as a numeric column."""
    transformed = featurizer.transform(df)
    assert len(featurizer.attributes)
    for name in featurizer.attributes:
        assert name in transformed
        assert is_numeric_dtype(transformed[name])
Example #16
Source File: filter.py    From cjworkbench with GNU Affero General Public License v3.0 5 votes vote down vote up
def series_to_datetime(series):
    """
    Return `series` parsed as UTC datetimes.

    Numeric series are rejected on purpose, and any ValueError raised while
    parsing is reported as a UserVisibleError.

    TODO [adamhooper, 2018-12-19] nix this and quick-fix coltypes
    """
    try:
        if not is_numeric_dtype(series):
            return pd.to_datetime(series, utc=True)

        # numeric columns, just... no. Never really want to interpret as
        # seconds since 1970
        raise ValueError("Refusing to convert numbers to dates")
    except ValueError:
        raise UserVisibleError("Column is not dates. Please convert to dates.")
Example #17
Source File: types.py    From cjworkbench with GNU Affero General Public License v3.0 5 votes vote down vote up
def format_series(self, series: pd.Series) -> pd.Series:
        """Apply self._formatter.format to every non-missing value of `series`."""
        formatted = series.map(self._formatter.format, na_action="ignore")
        # Pandas will still think all-NA is number.
        return formatted.astype(object) if is_numeric_dtype(formatted) else formatted

    # override 
Example #18
Source File: types.py    From cjworkbench with GNU Affero General Public License v3.0 5 votes vote down vote up
def class_from_dtype(dtype) -> type:
        """
        Pick the ColumnType matching a pandas/numpy `dtype`.

        Numeric dtypes map to NUMBER, datetime64 to DATETIME, object and
        "category" to TEXT.  Any other dtype raises ValueError.
        """
        if is_numeric_dtype(dtype):
            return ColumnType.NUMBER
        if is_datetime64_dtype(dtype):
            return ColumnType.DATETIME
        if dtype == object or dtype == "category":
            return ColumnType.TEXT
        raise ValueError(f"Unknown dtype: {dtype}")
Example #19
Source File: errors.py    From autoimpute with MIT License 5 votes vote down vote up
def _not_num_series(m, s):
    """Private method to detect columns of Matrix that are not categorical."""
    if not is_numeric_dtype(s):
        t = s.dtype
        err = f"{m} not appropriate for Series {s.name} of type {t}."
        raise TypeError(err) 
Example #20
Source File: rfpimp.py    From malss with MIT License 5 votes vote down vote up
def feature_dependence_matrix(rf, X_train, n_samples=5000):
    """
    Build a feature-versus-feature dependence matrix for X_train.

    Numeric columns are identified on the full frame; the frame is then
    reduced with sample_rows(X_train, n_samples).  For every numeric column
    rf is refitted with that column as the target and all other columns as
    predictors.  The row written for it holds rf.oob_score_ under
    'Dependence', the values returned by permutation_importances_raw for
    the predictors, and 1.0 in the target's own position.

    Rows are written by position in the list of numeric columns.
    NOTE(review): that lines up with the row labels only if the numeric
    columns come first in X_train (e.g. all columns numeric) - confirm.

    :return: a non-symmetric data frame indexed by X_train's columns, with
             a 'Dependence' column followed by one column per feature.
    """
    numeric_features = [name for name in X_train if is_numeric_dtype(X_train[name])]

    X_train = sample_rows(X_train, n_samples)

    matrix_columns = ['Dependence'] + X_train.columns.tolist()
    df_dep = pd.DataFrame(index=X_train.columns, columns=matrix_columns)
    for pos, target in enumerate(numeric_features):
        X = X_train.drop(target, axis=1)
        y = X_train[target]
        rf.fit(X, y)
        imp = permutation_importances_raw(rf, X, y, oob_regression_r2_score, n_samples)
        # the target is fully dependent on itself
        with_self = np.insert(imp, pos, 1.0)
        # add overall dependence
        df_dep.iloc[pos] = np.insert(with_self, 0, rf.oob_score_)

    return df_dep
Example #21
Source File: rfpimp.py    From malss with MIT License 5 votes vote down vote up
def feature_dependence_matrix(rf, X_train, n_samples=5000):
    """
    Compute how strongly each numeric column of X_train depends on the
    other columns.

    After picking the numeric columns, X_train is passed through
    sample_rows(X_train, n_samples).  rf is then refitted once per numeric
    column (that column as target, the rest as predictors) and one row is
    filled in: rf.oob_score_ as the overall 'Dependence', followed by the
    output of permutation_importances_raw with 1.0 inserted at the target's
    own position.

    Rows are addressed by position in the list of numeric columns.
    NOTE(review): this matches the row labels only when the numeric columns
    are the leading columns of X_train - confirm.

    :return: a non-symmetric data frame indexed by X_train's columns, with
             a 'Dependence' column followed by one column per feature.
    """
    targets = [column for column in X_train if is_numeric_dtype(X_train[column])]

    X_train = sample_rows(X_train, n_samples)

    df_dep = pd.DataFrame(
        index=X_train.columns,
        columns=['Dependence'] + X_train.columns.tolist(),
    )
    for row, target in enumerate(targets):
        X, y = X_train.drop(target, axis=1), X_train[target]
        rf.fit(X, y)
        importances = permutation_importances_raw(rf, X, y, oob_regression_r2_score, n_samples)
        importances = np.insert(importances, row, 1.0)
        # add overall dependence
        df_dep.iloc[row] = np.insert(importances, 0, rf.oob_score_)

    return df_dep
Example #22
Source File: pandas.py    From vnpy_crypto with MIT License 5 votes vote down vote up
def is_numeric_dtype(arr_or_dtype):
            """Return True if the argument's dtype is a numpy number or bool.

            Crude implementation only suitable for array-like types: the
            scalar type is read from ``arr_or_dtype.dtype.type`` and objects
            without it are treated as non-numeric.  datetime64 and
            timedelta64 are never numeric.
            """
            no_type = type(None)
            tipo = getattr(getattr(arr_or_dtype, "dtype", None), "type", no_type)
            if issubclass(tipo, (np.datetime64, np.timedelta64)):
                return False
            return issubclass(tipo, (np.number, np.bool_))
Example #23
Source File: validation.py    From PandasSchema with GNU General Public License v3.0 5 votes vote down vote up
def get_errors(self, series: pd.Series, column: 'column.Column'):
        """Return a ValidationWarning for each entry that fails validation.

        :param series: the column data to check.
        :param column: column definition; when column.allow_empty is set,
                       empty entries are not reported.
        :return: list of ValidationWarning carrying self.message, the
                 offending value, its index label and the series name.
        """
        # Entries rejected by the child class's validate function
        simple_validation = ~self.validate(series)
        if column.allow_empty:
            # Failing results are those that are not empty, and fail the validation.
            # Categorical and numeric series have no .str accessor to measure
            # length with, so "empty" means null for them.  The isinstance
            # check replaces the deprecated is_categorical_dtype().
            if isinstance(series.dtype, pd.CategoricalDtype) or is_numeric_dtype(series):
                failing = ~series.isnull() & simple_validation
            else:
                failing = (series.str.len() > 0) & simple_validation
        else:
            failing = simple_validation

        # Cut down the original series to only ones that failed the validation
        indices = series.index[failing]

        # Use these indices to find the failing items. Also report the index which is probably a row number
        return [
            ValidationWarning(
                message=self.message,
                value=series[i],
                row=i,
                column=series.name
            )
            for i in indices
        ]
Example #24
Source File: default.py    From autoimpute with MIT License 5 votes vote down vote up
def fit(self, X, y):
        """Fit the Imputer to the dataset and determine the right approach.

        The dtype of `X` (when `y` is None, univariate case) or of `y`
        (predictive case) decides which imputer is fitted: num_imputer for
        numeric data, cat_imputer for string data.  The fit result and the
        imputer's strategy are stored in self.statistics_; both stay None
        when neither dtype check matches.

        Args:
            X (pd.Series): Dataset to fit the imputer, or predictors
            y (pd.Series): None, or dataset to fit predictors

        Returns:
            self. Instance of the class.
        """
        # the series whose dtype drives the choice of imputer
        reference = X if y is None else y

        # start off with stats blank
        stats = {"param": None, "strategy": None}
        if is_numeric_dtype(reference):
            stats = {"param": self.num_imputer.fit(X, y),
                     "strategy": self.num_imputer.strategy}
        if is_string_dtype(reference):
            stats = {"param": self.cat_imputer.fit(X, y),
                     "strategy": self.cat_imputer.strategy}

        # return final stats
        self.statistics_ = stats
        return self
Example #25
Source File: converter.py    From hvplot with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def _process_chart_y(self, data, x, y, single_y):
        """This should happen after _process_chart_x"""
        y = y or self.y
        if y is None:
            ys = [c for c in data.columns if c not in [x]+self.by+self.groupby+self.grid]
            if len(ys) > 1:
                # if columns have different dtypes, only include numeric columns
                from pandas.api.types import is_numeric_dtype as isnum
                num_ys = [dim for dim in ys if isnum(data[dim])]
                if len(num_ys) >= 1:
                    ys = num_ys
            y = ys[0] if len(ys) == 1 or single_y else ys
        return y 
Example #26
Source File: density.py    From plotnine with GNU General Public License v2.0 5 votes vote down vote up
def get_var_type(col):
    """
    Return var_type (for KDEMultivariate) of the column

    Parameters
    ----------
    col : pandas.Series
        A dataframe column.

    Returns
    -------
    out : str
        One of ['c', 'o', 'u']: 'c' for numeric columns, 'o' for ordered
        categoricals and 'u' for everything else.

    See Also
    --------
    The origin of the character codes is
    :class:`statsmodels.nonparametric.kernel_density.KDEMultivariate`.
    """
    if pdtypes.is_numeric_dtype(col):
        # continuous
        return 'c'
    # isinstance on the dtype replaces the deprecated
    # pdtypes.is_categorical_dtype()
    if isinstance(col.dtype, pdtypes.CategoricalDtype):
        # ordered or unordered
        return 'o' if col.cat.ordered else 'u'
    # unordered if unsure, e.g string columns that
    # are not categorical
    return 'u'
Example #27
Source File: helpers.py    From anndata with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def assert_equal_ndarray(a, b, exact=False, elem_name=None):
    """Assert that array `a` equals `b` (after converting `b` with asarray).

    Unless `exact` is set, numeric arrays are compared with np.allclose
    (NaNs equal) after a shape check, and structured arrays are compared
    as DataFrames via assert_equal.  Everything else must match element
    by element.  `elem_name` is only used to format the failure message.
    """
    b = asarray(b)
    if not exact:
        if is_numeric_dtype(a) and is_numeric_dtype(b):
            assert a.shape == b.shape, format_msg(elem_name)
            assert np.allclose(a, b, equal_nan=True), format_msg(elem_name)
            return
        both_have_dtype = hasattr(a, "dtype") and hasattr(b, "dtype")
        # Structured dtype
        if both_have_dtype and len(a.dtype) > 1 and len(b.dtype) > 0:
            assert_equal(pd.DataFrame(a), pd.DataFrame(b), exact, elem_name)
            return
    assert np.all(a == b), format_msg(elem_name)
Example #28
Source File: _sklearn.py    From qlik-py-tools with MIT License 5 votes vote down vote up
def _add_lags(self, X, y=None, extrapolate=1, update_features_df=False):
        """
        Add lag observations to X.
        If y is available and self.model.lag_target is True, the previous target will become an additional feature in X.
        Feature definitions for the model will be updated accordingly.
        """

        # Add lag target to the features if required
        # This will create an additional feature for each sample i.e. the previous value of y 
        if y is not None and self.model.lag_target:
            X["previous_y"] = y.shift(1)
            
            if update_features_df:
                # Check the target's data type
                dt = 'float' if is_numeric_dtype(y.iloc[:,0]) else 'str'
                # Set the preprocessing feature strategy for the lag targets
                if self.model.estimator_type == 'classifier':
                    fs = 'one hot encoding' 
                elif self.model.scale_lag_target and not self.model.scale_target:
                    fs = 'scaling'
                else:
                    fs = 'none'
                self.model.scale_lag_target
                # Update feature definitions for the model
                self.model.features_df.loc['previous_y'] = [self.model.name, 'previous_y', 'feature', dt, fs, '']

        if self.model.lags:
            # Add the lag observations
            X = utils.add_lags(X, lag=self.model.lags, extrapolate=extrapolate, dropna=True, suffix="t")
            
            if update_features_df:
                # Duplicate the feature definitions by the number of lags
                self.model.features_df = pd.concat([self.model.features_df] * (self.model.lags+extrapolate))
                # Set the new feature names as the index of the feature definitions data frame
                self.model.features_df['name'] = X.columns
                self.model.features_df = self.model.features_df.set_index('name', drop=True)

        if self.model.debug:
            self._print_log(11, data=X)

        return X 
Example #29
Source File: environment.py    From hyperparameter_hunter with MIT License 5 votes vote down vote up
def save_transformed_metrics(self, value):
        """Store `value`, inferring it from the target dtypes when it is None."""
        if value is None:
            # All target columns numeric -> False: assume metric evaluation
            #   should use the original/inverted targets and predictions.
            # Any non-numeric target column -> True: assume evaluation should
            #   use the transformed targets and predictions because most
            #   metrics require numeric inputs.
            value = not all(
                is_numeric_dtype(self.train_dataset[col]) for col in self.target_column
            )
        self._save_transformed_metrics = value

    ##################################################
    # Core Methods
    ################################################## 
Example #30
Source File: pandas.py    From Splunking-Crime with GNU Affero General Public License v3.0 5 votes vote down vote up
def is_numeric_dtype(arr_or_dtype):
            """Tell whether an array-like holds numpy numbers or booleans.

            Crude implementation only suitable for array-like types: it
            inspects ``arr_or_dtype.dtype.type`` and reports False for
            anything lacking that attribute, as well as for datetime64 and
            timedelta64 data.
            """
            try:
                scalar_type = arr_or_dtype.dtype.type
            except AttributeError:
                scalar_type = type(None)
            looks_numeric = issubclass(scalar_type, (np.number, np.bool_))
            is_temporal = issubclass(scalar_type, (np.datetime64, np.timedelta64))
            return looks_numeric and not is_temporal