Python sklearn.base.TransformerMixin Examples

The following are 26 code examples of sklearn.base.TransformerMixin, collected from open-source projects. Each example is taken from the project and source file named in its header. You may also want to check out the other available functions and classes of the sklearn.base module.
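TransformerMixin supplies fit_transform to any estimator that defines fit and transform, so most examples below only implement those two methods. A minimal sketch of the pattern (the DoubleIt class is illustrative, not taken from any project below):

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class DoubleIt(BaseEstimator, TransformerMixin):
    """Toy transformer: doubles its input."""

    def fit(self, X, y=None):
        return self  # stateless; nothing to learn

    def transform(self, X):
        return np.asarray(X) * 2

X = np.arange(6).reshape(3, 2)
print(DoubleIt().fit_transform(X))  # fit_transform is inherited from TransformerMixin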
Example #1
Source File: common_utils.py    From interpret-text with MIT License
def create_pandas_only_svm_classifier(X, y, probability=True):
    class PandasOnlyEstimator(TransformerMixin):
        def fit(self, X, y=None, **fitparams):
            return self

        def transform(self, X, **transformparams):
            dataset_is_df = isinstance(X, pd.DataFrame)
            if not dataset_is_df:
                raise Exception("Dataset must be a pandas dataframe!")
            return X

    pandas_only = PandasOnlyEstimator()

    clf = svm.SVC(gamma=0.001, C=100.0, probability=probability, random_state=777)
    pipeline = Pipeline([("pandas_only", pandas_only), ("clf", clf)])
    return pipeline.fit(X, y) 
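A hedged usage sketch for the helper above, assuming its module imports (pandas as pd, numpy as np, sklearn's svm and Pipeline) are in scope; the toy data is illustrative:

import numpy as np
import pandas as pd

X = pd.DataFrame(np.random.RandomState(0).randn(20, 3), columns=["a", "b", "c"])
y = np.array([0, 1] * 10)
model = create_pandas_only_svm_classifier(X, y)
print(model.predict_proba(X).shape)  # (20, 2); X passes through as a DataFrame
# model.predict(X.values) would raise, because the first pipeline stage
# rejects anything that is not a pandas DataFrame.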
Example #2
Source File: common_utils.py    From interpret-community with MIT License
def create_pandas_only_svm_classifier(X, y, probability=True):
    class PandasOnlyEstimator(TransformerMixin):
        def fit(self, X, y=None, **fitparams):
            return self

        def transform(self, X, **transformparams):
            dataset_is_df = isinstance(X, pd.DataFrame)
            if not dataset_is_df:
                raise Exception("Dataset must be a pandas dataframe!")
            return X

    pandas_only = PandasOnlyEstimator()

    clf = svm.SVC(gamma=0.001, C=100., probability=probability, random_state=777)
    pipeline = Pipeline([('pandas_only', pandas_only), ('clf', clf)])
    return pipeline.fit(X, y) 
Example #3
Source File: filters.py    From causallib with Apache License 2.0
def track_selected_features(pipeline_stages, num_features):
    """

    Args:
        pipeline_stages (list [tuple[str, TransformerMixin]]): list of steps. each step is a tuple of Name and
                                                               Transformer Object.
        num_features (int):

    Returns:
        np.ndarray:
    """
    selected_features = np.arange(num_features)
    for p_name, p in pipeline_stages:
        if not isinstance(p, BaseFeatureSelector):
            continue
        p_features = p.selected_features
        selected_features = selected_features[p_features]
    return selected_features 
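The composition step selected_features = selected_features[p_features] is the key idea: each selector reports indices relative to the features it received, so indexing one array with the next maps everything back to the original feature space. A standalone numpy illustration (not causallib code):

import numpy as np

selected = np.arange(6)           # original features 0..5
stage1 = np.array([0, 2, 3, 5])   # first selector keeps these originals
selected = selected[stage1]       # -> [0 2 3 5]
stage2 = np.array([1, 3])         # second selector, indices into stage1's output
selected = selected[stage2]       # -> [2 5], positions in the original space
print(selected)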
Example #4
Source File: tpot_tests.py    From tpot with GNU Lesser General Public License v3.0
def test_template_4():
    """Assert that TPOT template option generates pipeline when one of steps is a specific operator."""

    tpot_obj = TPOTClassifier(
        population_size=5,
        generations=2,
        random_state=42,
        verbosity=0,
        config_dict='TPOT light',
        template='SelectPercentile-Transformer-Classifier'
    )
    tpot_obj.fit(pretest_X, pretest_y)

    assert isinstance(tpot_obj._optimized_pipeline, creator.Individual)
    assert tpot_obj._start_datetime is not None

    sklearn_pipeline = tpot_obj.fitted_pipeline_
    operator_count = tpot_obj._operator_count(tpot_obj._optimized_pipeline)
    assert operator_count == 3
    assert sklearn_pipeline.steps[0][0] == 'SelectPercentile'.lower()
    assert issubclass(sklearn_pipeline.steps[0][1].__class__, SelectorMixin)
    assert issubclass(sklearn_pipeline.steps[1][1].__class__, TransformerMixin)
    assert issubclass(sklearn_pipeline.steps[2][1].__class__, ClassifierMixin) 
Example #5
Source File: tpot_tests.py    From tpot with GNU Lesser General Public License v3.0
def test_template_3():
    """Assert that TPOT template option generates pipeline when one of steps is a specific operator."""

    tpot_obj = TPOTClassifier(
        random_state=42,
        verbosity=0,
        template='SelectPercentile-Transformer-Classifier'
    )
    tpot_obj._fit_init()
    pop = tpot_obj._toolbox.population(n=10)
    for deap_pipeline in pop:
        operator_count = tpot_obj._operator_count(deap_pipeline)
        sklearn_pipeline = tpot_obj._toolbox.compile(expr=deap_pipeline)
        assert operator_count == 3
        assert sklearn_pipeline.steps[0][0] == 'SelectPercentile'.lower()
        assert issubclass(sklearn_pipeline.steps[0][1].__class__, SelectorMixin)
        assert issubclass(sklearn_pipeline.steps[1][1].__class__, TransformerMixin)
        assert issubclass(sklearn_pipeline.steps[2][1].__class__, ClassifierMixin) 
Example #6
Source File: tpot_tests.py    From tpot with GNU Lesser General Public License v3.0
def test_template_2():
    """Assert that TPOT template option generates pipeline when each step is operator type with a duplicate main type."""

    tpot_obj = TPOTClassifier(
        random_state=42,
        verbosity=0,
        template='Selector-Selector-Transformer-Classifier'
    )
    tpot_obj._fit_init()
    pop = tpot_obj._toolbox.population(n=10)
    for deap_pipeline in pop:
        operator_count = tpot_obj._operator_count(deap_pipeline)
        sklearn_pipeline = tpot_obj._toolbox.compile(expr=deap_pipeline)
        assert operator_count == 4
        assert issubclass(sklearn_pipeline.steps[0][1].__class__, SelectorMixin)
        assert issubclass(sklearn_pipeline.steps[1][1].__class__, SelectorMixin)
        assert issubclass(sklearn_pipeline.steps[2][1].__class__, TransformerMixin)
        assert issubclass(sklearn_pipeline.steps[3][1].__class__, ClassifierMixin) 
Example #7
Source File: tpot_tests.py    From tpot with GNU Lesser General Public License v3.0
def test_template_1():
    """Assert that TPOT template option generates pipeline when each step is a type of operator."""

    tpot_obj = TPOTClassifier(
        random_state=42,
        verbosity=0,
        template='Selector-Transformer-Classifier'
    )
    tpot_obj._fit_init()
    pop = tpot_obj._toolbox.population(n=10)
    for deap_pipeline in pop:
        operator_count = tpot_obj._operator_count(deap_pipeline)
        sklearn_pipeline = tpot_obj._toolbox.compile(expr=deap_pipeline)
        assert operator_count == 3
        assert issubclass(sklearn_pipeline.steps[0][1].__class__, SelectorMixin)
        assert issubclass(sklearn_pipeline.steps[1][1].__class__, TransformerMixin)
        assert issubclass(sklearn_pipeline.steps[2][1].__class__, ClassifierMixin) 
Example #8
Source File: test_base.py    From twitter-stock-recommendation with MIT License
def test_clone_pandas_dataframe():

    class DummyEstimator(BaseEstimator, TransformerMixin):
        """This is a dummy class for generating numerical features

        This feature extractor extracts numerical features from pandas data
        frame.

        Parameters
        ----------

        df: pandas data frame
            The pandas data frame parameter.

        Notes
        -----
        """
        def __init__(self, df=None, scalar_param=1):
            self.df = df
            self.scalar_param = scalar_param

        def fit(self, X, y=None):
            pass

        def transform(self, X):
            pass

    # build and clone estimator
    d = np.arange(10)
    df = MockDataFrame(d)
    e = DummyEstimator(df, scalar_param=1)
    cloned_e = clone(e)

    # the test
    assert_true((e.df == cloned_e.df).values.all())
    assert_equal(e.scalar_param, cloned_e.scalar_param) 
Example #9
Source File: normalization_strategy_selector.py    From Auto-PyTorch with Apache License 2.0
def add_normalization_strategy(self, name, normalization_type, is_default_normalization_strategy=False):
        """Add a normalization strategy.
        Will be called with {pipeline_config, X, Y}

        Arguments:
            name {string} -- name of the normalization strategy, used as its key in the config
            normalization_type {class} -- a BaseEstimator/TransformerMixin subclass implementing the strategy
            is_default_normalization_strategy {bool} -- whether the given normalization_type should be the default normalization strategy if none is specified in the config
        """

        if (not issubclass(normalization_type, BaseEstimator)
                and not issubclass(normalization_type, TransformerMixin)):
            raise ValueError("normalization_type must be a subclass of BaseEstimator or TransformerMixin")
        self.normalization_strategies[name] = normalization_type 
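A self-contained sketch of how this registration method behaves (StrategyRegistry is a hypothetical stand-in for the Auto-PyTorch selector, not its real class):

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class StrategyRegistry:
    """Hypothetical stand-in mirroring the check above."""

    def __init__(self):
        self.normalization_strategies = {}

    def add_normalization_strategy(self, name, normalization_type):
        if (not issubclass(normalization_type, BaseEstimator)
                and not issubclass(normalization_type, TransformerMixin)):
            raise ValueError(
                "normalization_type must be a subclass of BaseEstimator or TransformerMixin")
        self.normalization_strategies[name] = normalization_type

registry = StrategyRegistry()
registry.add_normalization_strategy("standardize", StandardScaler)  # accepted: subclass of both
# registry.add_normalization_strategy("bad", dict)  # would raise ValueError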
Example #10
Source File: data_splitters.py    From MAST-ML with MIT License
def split(self, X, y, groups):
        n_groups = self.get_n_splits(groups=groups)
        #print('n_groups', n_groups)
        lpgo = ms.LeavePGroupsOut(n_groups=n_groups-1)
        return lpgo.split(X, y, groups)

#class WithoutElement(BaseEstimator, TransformerMixin):
#    " Train the model without each element, then test on the rows with that element "
#    pass 
Example #11
Source File: ABuML.py    From abu with GNU General Public License v3.0
def fit_transform(self, **kwargs):
        """
        被装饰器@entry_wrapper()装饰,默认参数即支持有监督和无监督学习,
        内部通过检测isinstance(fiter, TransformerMixin) or hasattr(fiter, 'fit_transform')
        来判定是否可以fit_transform

        eg:
            input:  ttn_abu.x.shape
            output: (891, 14)

            input:  ttn_abu.fit_transform(fiter_type=ml.EMLFitType.E_FIT_PCA).shape
            output: (891, 4)

            input:  ttn_abu.fit_transform(fiter_type=ml.EMLFitType.E_FIT_KMEAN).shape
            output: (891, 2)

        :param kwargs: 外部可以传递x, y, 通过
                                x = kwargs.pop('x', self.x)
                                y = kwargs.pop('y', self.y)
                       以及装饰器使用的fiter_type,eg:ttn_abu.fit_transform(fiter_type=ml.EMLFitType.E_FIT_CLF)
        :return: fit_transform后的转换结果矩阵
        """
        fiter = self.get_fiter()
        if isinstance(fiter, TransformerMixin) or hasattr(fiter, 'fit_transform'):
            x = kwargs.pop('x', self.x)
            y = kwargs.pop('y', self.y)
            if self.is_supervised_learning():
                trans = fiter.fit_transform(x, y)
            else:
                trans = fiter.fit_transform(x)
            return trans
        else:
            self.log_func('{} does not support fit_transform'.format(fiter)) 
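The isinstance(fiter, TransformerMixin) or hasattr(fiter, 'fit_transform') guard is a common duck-typing pattern: it accepts real scikit-learn transformers as well as anything that merely provides a fit_transform method. A standalone illustration (not abu's code):

from sklearn.base import TransformerMixin
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

def supports_fit_transform(est):
    return isinstance(est, TransformerMixin) or hasattr(est, "fit_transform")

print(supports_fit_transform(PCA()))                 # True: PCA is a TransformerMixin
print(supports_fit_transform(LogisticRegression()))  # False: no fit_transform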
Example #12
Source File: base.py    From smrt with BSD 3-Clause "New" or "Revised" License
def transform(self, X):
        """Inherited from the ``TransformerMixin``. Pass the ``X`` array
        through the inferential MLP layers.

        Parameters
        ----------
        X : array-like, shape=(n_samples, n_features)
            The array of samples that will be encoded into the new
            hidden layer space.
        """
        return self.encode(X) 
Example #13
Source File: test_preprocessing.py    From skl-groups with BSD 3-Clause "New" or "Revised" License
def test_basic():
    bags = [np.random.normal(5, 3, size=(np.random.randint(10, 100), 20))
            for _ in range(50)]
    feats = Features(bags, stack=True)

    stder = BagStandardizer()
    stdized = stder.fit_transform(bags)
    stdized.make_stacked()

    assert np.allclose(np.mean(stdized.stacked_features), 0)
    assert np.allclose(np.std(stdized.stacked_features), 1)

    first_five = stder.transform(bags[:5])
    assert first_five == stdized[:5]

    minmaxer = BagMinMaxScaler([3, 7])
    minmaxed = minmaxer.fit_transform(feats)
    minmaxed.make_stacked()
    assert np.allclose(np.min(minmaxed.stacked_features, 0), 3)
    assert np.allclose(np.max(minmaxed.stacked_features, 0), 7)

    normer = BagNormalizer('l1')
    normed = normer.fit_transform(Features(bags))
    normed.make_stacked()
    assert np.allclose(np.sum(np.abs(normed.stacked_features), 1), 1)

    class GetMean(BaseEstimator, TransformerMixin):
        def fit(self, X, y=None):
            return self
        def transform(self, X):
            return X.mean(axis=1)[None, :]
    m = BagPreprocesser(GetMean())
    assert_raises(ValueError, lambda: m.transform(bags)) 
Example #14
Source File: test_step.py    From baikal with BSD 3-Clause "New" or "Revised" License
def test_get_params_without_init(self, teardown):
        """Test edge case where the base class does not define
        an __init__ method. get_params should resolve to object.__init__
        which results in an empty dict.
        """

        class TransformerWithoutInit(TransformerMixin, BaseEstimator):
            pass

        class TransformerWithoutInitStep(Step, TransformerWithoutInit):
            pass

        step = TransformerWithoutInitStep()
        assert step.get_params() == {} 
Example #15
Source File: _test.py    From ibex with BSD 3-Clause "New" or "Revised" License
def _generate_bases_test(est, pd_est):
    def test(self):
        self.assertTrue(isinstance(pd_est, FrameMixin), pd_est)
        self.assertFalse(isinstance(est, FrameMixin))
        self.assertTrue(isinstance(pd_est, base.BaseEstimator))
        try:
            mixins = [
                base.ClassifierMixin,
                base.ClusterMixin,
                base.BiclusterMixin,
                base.TransformerMixin,
                base.DensityMixin,
                base.MetaEstimatorMixin,
                base.ClassifierMixin,
                base.RegressorMixin]
        except:
            if _sklearn_ver > 17:
                raise
            mixins = [
                base.ClassifierMixin,
                base.ClusterMixin,
                base.BiclusterMixin,
                base.TransformerMixin,
                base.MetaEstimatorMixin,
                base.ClassifierMixin,
                base.RegressorMixin]
        for mixin in mixins:
            self.assertEqual(
                isinstance(pd_est, mixin),
                isinstance(est, mixin),
                mixin)

    return test 
Example #16
Source File: category_vector.py    From talkingdata-adtracking-fraud-detection with MIT License
def transformer_factory(self) -> TransformerMixin:
        return NMF(n_components=self.width, random_state=71) 
Example #17
Source File: category_vector.py    From talkingdata-adtracking-fraud-detection with MIT License
def transformer_factory(self) -> TransformerMixin:
        return TruncatedSVD(n_components=self.width, random_state=71) 
Example #18
Source File: category_vector.py    From talkingdata-adtracking-fraud-detection with MIT License
def transformer_factory(self) -> TransformerMixin:
        return LatentDirichletAllocation(n_components=self.width, learning_method='online', random_state=71) 
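Examples #16-#18 are factory methods that return a dimensionality-reduction transformer typed only as TransformerMixin, so the calling code can stay agnostic about which decomposition is used. A sketch of how such a factory might be consumed (CategoryVectorizer and its width attribute are illustrative, not the project's actual API):

import numpy as np
from sklearn.base import TransformerMixin
from sklearn.decomposition import TruncatedSVD

class CategoryVectorizer:
    width = 5  # hypothetical output dimensionality

    def transformer_factory(self) -> TransformerMixin:
        return TruncatedSVD(n_components=self.width, random_state=71)

    def vectorize(self, cooccurrence: np.ndarray) -> np.ndarray:
        # Any TransformerMixin provides fit_transform
        return self.transformer_factory().fit_transform(cooccurrence)

matrix = np.random.RandomState(0).poisson(1.0, size=(100, 50)).astype(float)
print(CategoryVectorizer().vectorize(matrix).shape)  # (100, 5)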
Example #19
Source File: category_vector.py    From talkingdata-adtracking-fraud-detection with MIT License
def vectorizer_factory(self) -> TransformerMixin:
        raise NotImplementedError 
Example #20
Source File: test_base.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_clone_pandas_dataframe():

    class DummyEstimator(BaseEstimator, TransformerMixin):
        """This is a dummy class for generating numerical features

        This feature extractor extracts numerical features from pandas data
        frame.

        Parameters
        ----------

        df: pandas data frame
            The pandas data frame parameter.

        Notes
        -----
        """
        def __init__(self, df=None, scalar_param=1):
            self.df = df
            self.scalar_param = scalar_param

        def fit(self, X, y=None):
            pass

        def transform(self, X):
            pass

    # build and clone estimator
    d = np.arange(10)
    df = MockDataFrame(d)
    e = DummyEstimator(df, scalar_param=1)
    cloned_e = clone(e)

    # the test
    assert (e.df == cloned_e.df).values.all()
    assert_equal(e.scalar_param, cloned_e.scalar_param) 
Example #21
Source File: diff.py    From gordo with GNU Affero General Public License v3.0
def __init__(
        self,
        base_estimator: BaseEstimator = KerasAutoEncoder(kind="feedforward_hourglass"),
        scaler: TransformerMixin = RobustScaler(),
        require_thresholds: bool = True,
        window=None,
    ):
        """
        Classifier which wraps a ``base_estimator`` and provides a diff error
        based approach to anomaly detection.

        It trains a ``scaler`` to the target **after** training, purely for
        error calculations. The underlying ``base_estimator`` is trained
        with the original, unscaled, ``y``.

        Parameters
        ----------
        base_estimator: sklearn.base.BaseEstimator
            The model to which the normal ``.fit`` and ``.predict`` methods will be applied.
            Defaults to :py:class:`gordo.machine.model.models.KerasAutoEncoder` with
            ``kind='feedforward_hourglass'``
        scaler: sklearn.base.TransformerMixin
            Defaults to ``sklearn.preprocessing.RobustScaler``
            Used for transforming model output and the original ``y`` to calculate
            the difference/error in model output vs expected.
        require_thresholds: bool
            Requires calculating ``thresholds_`` via a call to :func:`~DiffBasedAnomalyDetector.cross_validate`.
            If this is set (default True), but :func:`~DiffBasedAnomalyDetector.cross_validate`
            was not called before calling :func:`~DiffBasedAnomalyDetector.anomaly` an ``AttributeError``
            will be raised.
        window: int
            Window size for smoothed thresholds
        """
        self.base_estimator = base_estimator
        self.scaler = scaler
        self.require_thresholds = require_thresholds
        self.window = window 
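A minimal sketch of the diff-error idea the docstring describes: fit the scaler on the target after training, scale both the true y and the model output, and use their difference as the anomaly signal (standalone numpy/scikit-learn, not gordo's actual implementation):

import numpy as np
from sklearn.preprocessing import RobustScaler

rng = np.random.RandomState(0)
y_true = rng.randn(100, 2)
y_pred = y_true + rng.normal(scale=0.1, size=(100, 2))  # stand-in model output

scaler = RobustScaler().fit(y_true)  # fitted on the target, purely for error calculation
error = np.abs(scaler.transform(y_pred) - scaler.transform(y_true))
anomaly_score = error.mean(axis=1)   # one score per sample
print(anomaly_score[:5])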
Example #22
Source File: utils.py    From gordo with GNU Affero General Public License v3.0
def metric_wrapper(metric, scaler: Optional[TransformerMixin] = None):
    """
    Ensures that a given metric works properly when the model itself returns
    a y which is shorter than the target y, and allows scaling the data
    before applying the metric.

    Parameters
    ----------
    metric
        Metric which must accept y_true and y_pred of the same length
    scaler : Optional[TransformerMixin]
        Transformer which will be applied to y and y_pred before the metric is
        calculated. Must have a `transform` method, so for most scalers it must
        already be fitted on `y`.
    """

    @functools.wraps(metric)
    def _wrapper(y_true, y_pred, *args, **kwargs):
        if scaler:
            logger.debug(
                "Transformer provided to metrics wrapper, scaling y and y_pred before "
                "passing to metrics"
            )
            y_true = scaler.transform(y_true)
            y_pred = scaler.transform(y_pred)
        return metric(y_true[-len(y_pred) :], y_pred, *args, **kwargs)

    return _wrapper 
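A hedged usage example of the wrapper above, assuming its module-level imports (functools and a logger) are in scope; note the scaler is fitted on y beforehand, as the docstring requires:

import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

y_true = np.arange(10, dtype=float).reshape(-1, 1)
y_pred = y_true[2:] + 0.5                # model output shorter than the target
scaler = MinMaxScaler().fit(y_true)

mse = metric_wrapper(mean_squared_error, scaler=scaler)
print(mse(y_true, y_pred))               # compares only the trailing len(y_pred) rows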
Example #23
Source File: investigate.py    From sklearn-onnx with MIT License
def enumerate_pipeline_models(pipe, coor=None, vs=None):
    """
    Enumerates all the models within a pipeline.
    """
    if coor is None:
        coor = (0,)
    yield coor, pipe, vs
    if hasattr(pipe, 'transformer_and_mapper_list') and len(
            pipe.transformer_and_mapper_list):
        # azureml DataTransformer
        raise NotImplementedError("Unable to handle this specific case.")
    elif hasattr(pipe, 'mapper') and pipe.mapper:
        # azureml DataTransformer
        for couple in enumerate_pipeline_models(pipe.mapper, coor + (0,)):
            yield couple
    elif hasattr(pipe, 'built_features'):
        # sklearn_pandas.dataframe_mapper.DataFrameMapper
        for i, (columns, transformers, _) in enumerate(pipe.built_features):
            if isinstance(columns, str):
                columns = (columns,)
            if transformers is None:
                yield (coor + (i,)), None, columns
            else:
                for couple in enumerate_pipeline_models(transformers,
                                                        coor + (i,),
                                                        columns):
                    yield couple
    elif isinstance(pipe, Pipeline):
        for i, (_, model) in enumerate(pipe.steps):
            for couple in enumerate_pipeline_models(model, coor + (i,)):
                yield couple
    elif ColumnTransformer is not None and isinstance(pipe, ColumnTransformer):
        for i, (_, fitted_transformer, column) in enumerate(pipe.transformers):
            for couple in enumerate_pipeline_models(
                    fitted_transformer, coor + (i,), column):
                yield couple
    elif isinstance(pipe, FeatureUnion):
        for i, (_, model) in enumerate(pipe.transformer_list):
            for couple in enumerate_pipeline_models(model, coor + (i,)):
                yield couple
    elif TransformedTargetRegressor is not None and isinstance(
            pipe, TransformedTargetRegressor):
        raise NotImplementedError(
            "Not yet implemented for TransformedTargetRegressor.")
    elif isinstance(pipe, (TransformerMixin, ClassifierMixin, RegressorMixin)):
        pass
    elif isinstance(pipe, BaseEstimator):
        pass
    else:
        raise TypeError(
            "Parameter pipe is not a scikit-learn object: {}\n{}".format(
                type(pipe), pipe)) 
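A usage sketch for the enumerator above, assuming the module's own imports (Pipeline, the sklearn.base mixins, etc.) are available:

from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

pipe = Pipeline([("pca", PCA(n_components=2)), ("clf", LogisticRegression())])
for coor, model, vs in enumerate_pipeline_models(pipe):
    print(coor, type(model).__name__)
# (0,) Pipeline
# (0, 0) PCA
# (0, 1) LogisticRegression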
Example #24
Source File: test_algebra_onnx_operators.py    From sklearn-onnx with MIT License
def test_sub(self):

        class CustomOpTransformer(BaseEstimator, TransformerMixin):

            def __init__(self, op_version=None):
                self.op_version = op_version

            def fit(self, X, y=None):
                self.W = np.mean(X, axis=0)
                return self

            def transform(self, X):
                return X - self.W

        mat = np.array([[0., 1.], [1., 2.], [3., 4.]])
        tr = CustomOpTransformer(op_version=None)
        tr.fit(mat)
        z = tr.transform(mat)

        def conv(scope, operator, container):
            W = operator.raw_operator.W.astype(container.dtype)
            op = OnnxSub(
                operator.inputs[0], W, output_names=operator.outputs,
                op_version=TARGET_OPSET)
            op.add_to(scope, container)
            text = str(container)
            if 'name:"Su_Sub"' not in text:
                raise AssertionError(
                    "Unnamed operator: '{}'".format(text))
            nin = list(op.enumerate_initial_types())
            nno = list(op.enumerate_nodes())
            nva = list(op.enumerate_variables())
            assert len(nin) == 1
            assert nin[0][0] == 'input'
            assert nin[0][1].shape == [None, 2]
            assert len(nno) == 1
            assert nno[0].output_names == ['variable']
            assert len(nva) == 1
            assert isinstance(nva[0], tuple)
            assert nva[0][1] == 0

        def shape(operator):
            N = operator.inputs[0].type.shape[0]
            W = operator.raw_operator.W
            operator.outputs[0].type.shape = [N, W.shape[0]]

        model_onnx = convert_sklearn(
            tr, 'a-sub', [('input', FloatTensorType([None, 2]))],
            custom_shape_calculators={CustomOpTransformer: shape},
            custom_conversion_functions={CustomOpTransformer: conv})

        sess = InferenceSession(model_onnx.SerializeToString())
        z2 = sess.run(None, {'input': mat.astype(np.float32)})[0]
        assert_almost_equal(z, z2) 
Example #25
Source File: test_algebra_onnx_operators.py    From sklearn-onnx with MIT License
def test_sub_div(self):

        class CustomOpTransformer(BaseEstimator, TransformerMixin):

            def __init__(self):
                pass

            def fit(self, X, y=None):
                self.W = np.mean(X, axis=0)
                self.S = np.std(X, axis=0)
                return self

            def transform(self, X):
                return (X - self.W) / self.S

        mat = np.array([[0., 1.], [0., 1.], [2., 2.]])
        tr = CustomOpTransformer()
        tr.fit(mat)
        z = tr.transform(mat)

        def conv(scope, operator, container):
            W = operator.raw_operator.W.astype(np.float32)
            S = operator.raw_operator.S.astype(np.float32)
            X = operator.inputs[0]
            out = operator.outputs
            op = OnnxDiv(
                OnnxSub(X, W, op_version=container.target_opset),
                S, output_names=out,
                op_version=container.target_opset)
            op.add_to(scope, container)

        def shape(operator):
            N = operator.inputs[0].type.shape[0]
            W = operator.raw_operator.W
            operator.outputs[0].type.shape = [N, W.shape[0]]

        model_onnx = convert_sklearn(
            tr, 'a-sub-div', [('input', FloatTensorType([None, 2]))],
            custom_shape_calculators={CustomOpTransformer: shape},
            custom_conversion_functions={CustomOpTransformer: conv},
            target_opset=None)

        try:
            sess = InferenceSession(model_onnx.SerializeToString())
        except RuntimeError as e:
            raise AssertionError(
                "Cannot load model\n---\n{}\n---".format(model_onnx)) from e
        z2 = sess.run(None, {'input': mat.astype(np.float32)})[0]
        assert_almost_equal(z, z2) 
Example #26
Source File: common_tabular_tests.py    From interpret-community with MIT License
def verify_explain_model_categorical(self, pass_categoricals=False):
        headers = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration",
                   "num_doors", "body_style", "drive_wheels", "engine_location",
                   "wheel_base", "length", "width", "height", "curb_weight",
                   "engine_type", "num_cylinders", "engine_size", "fuel_system",
                   "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm",
                   "city_mpg", "highway_mpg", "price"]
        df = retrieve_dataset('imports-85.csv', header=None, names=headers, na_values="?")
        df_y = df['price']
        df_X = df.drop(columns='price')
        df_train_X, df_test_X, df_train_y, df_test_y = train_test_split(df_X, df_y, test_size=0.2, random_state=7)
        # Encode strings to ordinal values
        categorical_col_names = list(df_train_X.select_dtypes(include='object').columns)
        categorical_col_indices = [df_train_X.columns.get_loc(col_name) for col_name in categorical_col_names]
        kwargs = {'num_leaves': 31, 'num_trees': 100, 'objective': 'regression',
                  'categorical_feature': categorical_col_indices}
        lgbm_regressor = LGBMRegressor(**kwargs)
        # Impute the x and y values
        imp_X = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
        imp_y = SimpleImputer(missing_values=np.nan, strategy='mean')
        # reshape to 2D array since SimpleImputer can't work on 1D array
        df_train_y = df_train_y.values.reshape(df_train_y.shape[0], 1)
        imp_y.fit(df_train_y)
        imp_df_y = imp_y.transform(df_train_y)
        imp_X.fit(df_train_X)
        imp_train_X = pd.DataFrame(imp_X.transform(df_train_X))

        class CustomTextTransformer(BaseEstimator, TransformerMixin):
            def __init__(self):
                return

            def fit(self, X, y=None):
                return self

            def transform(self, X):
                return X.astype('U')

        custom_text = CustomTextTransformer()
        encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
        ct1 = ColumnTransformer([('cu', custom_text, categorical_col_indices)], remainder='passthrough')
        ct2 = ColumnTransformer([('ord', encoder, slice(0, len(categorical_col_indices)))], remainder='passthrough')
        pipeline = Pipeline([('cu', ct1), ('ct', ct2), ('lgbm', lgbm_regressor)])
        pipeline.fit(imp_train_X, imp_df_y[:, 0])
        if pass_categoricals:
            explainer = self.create_explainer(pipeline, imp_train_X, categorical_features=categorical_col_indices)
        else:
            explainer = self.create_explainer(pipeline, imp_train_X)
        explanation = explainer.explain_global(imp_X.transform(df_test_X))
        verify_serialization(explanation, exist_ok=True)