from itertools import chain, cycle, repeat

import numpy as np
import pandas as pd
import sklearn.base
from sklearn.cluster import DBSCAN, KMeans
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, LogisticRegression

from pyspark.ml.linalg import Vectors
from pyspark.sql.types import DoubleType, LongType, StructType

from spark_sklearn.keyed_models import KeyedEstimator, KeyedModel, SparkSklearnEstimator
from spark_sklearn.test_utils import fixtureReuseSparkSession, assertPandasAlmostEqual, RandomTest


def _sortByComponentWeight(pca):
    # Canonicalize a fitted PCA by ordering its components by explained variance ratio,
    # so comparisons do not depend on the order scikit-learn happened to emit them in.
    zipped = zip(pca.components_, pca.explained_variance_ratio_)
    ordered = sorted(zipped, key=lambda x: x[1])
    return tuple(np.array(unzipped) for unzipped in zip(*ordered))


def _assertPandasAlmostEqual(actual, expected, sortby):
    def convert_estimators(x):
        # Note: conversion makes estimators invariant to training order.
        if isinstance(x, SparkSklearnEstimator):
            x = x.estimator
        if isinstance(x, (LinearRegression, LogisticRegression)):
            return x.coef_, x.intercept_
        if isinstance(x, PCA):
            return _sortByComponentWeight(x)
        if isinstance(x, KMeans):
            return x.cluster_centers_, x.labels_
        return x
    assertPandasAlmostEqual(actual, expected, convert=convert_estimators, sortby=sortby)
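

# A minimal usage sketch (not itself a test) of the API exercised below. It assumes the
# KeyedEstimator defaults for column names ("features" for xCol, ["key"] for keyCols,
# "output" for outputCol), which the tests in this module also rely on: fit() trains one
# cloned scikit-learn estimator per distinct key, and the resulting KeyedModel applies
# each per-key model on transform().
def _usageSketch(spark):
    df = spark.createDataFrame(
        [(Vectors.dense(np.random.rand(5)), i % 2) for i in range(10)],
        ["features", "key"])
    km = KeyedEstimator(sklearnEstimator=PCA(n_components=2)).fit(df)
    # km.keyedModels holds one ("key", "estimator") row per distinct key.
    return km.transform(df)  # appends an "output" column of per-key projections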


@fixtureReuseSparkSession
class KeyedModelTests(RandomTest):

    NDIM = 5

    class _CustomClusterer(sklearn.base.BaseEstimator):
        def fit(self, X, y=None):
            pass

        def transform(self, X):
            return X

        def fit_predict(self, X):
            return np.zeros(len(X))

    class _CustomTransformer(sklearn.base.BaseEstimator):
        def fit(self, X):
            # Only one argument (besides self) expected - there is no y.
            pass

        def transform(self, X):
            return X

        def predict(self, X):
            # Dummy predict to throw us off - all sklearn clusterers have fit_predict.
            return np.zeros(len(X))

    class _CustomMissingFit(sklearn.base.BaseEstimator):
        def transform(self, X):
            return X

    # Makes sure that the parameter estimator and its generated model are of the given
    # type. Simultaneously makes sure that fitting on an empty DataFrame works.
    def checkEstimatorType(self, keyedEstimator, expectedType):
        self.assertEqual(keyedEstimator.sklearnEstimatorType, expectedType)
        schema = StructType().add("features", DoubleType()).add("key", LongType())
        yCol = keyedEstimator.getOrDefault("yCol")
        if yCol is not None:
            schema = schema.add(yCol, DoubleType())
        emptyDF = self.spark.createDataFrame([], schema=schema)
        keyedModel = keyedEstimator.fit(emptyDF)
        self.assertEqual(keyedModel.sklearnEstimatorType, expectedType)

    def test_correct_estimator_type(self):
        self.checkEstimatorType(KeyedEstimator(sklearnEstimator=PCA()), "transformer")
        self.checkEstimatorType(KeyedEstimator(sklearnEstimator=LinearRegression(), yCol="y"),
                                "predictor")
        self.checkEstimatorType(KeyedEstimator(sklearnEstimator=DBSCAN()), "clusterer")
        self.checkEstimatorType(KeyedEstimator(sklearnEstimator=KMeans()), "clusterer")
        ke = KeyedEstimator(sklearnEstimator=KMeans(), estimatorType="transformer")
        self.checkEstimatorType(ke, "transformer")
        custom = KeyedModelTests._CustomClusterer()
        ke = KeyedEstimator(sklearnEstimator=custom)
        self.checkEstimatorType(ke, "clusterer")
        ke = KeyedEstimator(sklearnEstimator=custom, estimatorType="transformer")
        self.checkEstimatorType(ke, "transformer")
        custom = KeyedModelTests._CustomTransformer()
        self.checkEstimatorType(KeyedEstimator(sklearnEstimator=custom), "transformer")

    def test_invalid_argument(self):
        # Need to specify sklearnEstimator.
        self.assertRaises(ValueError, KeyedEstimator)
        # sklearnEstimator must be a sklearn.base.BaseEstimator.
        create = lambda: KeyedEstimator(sklearnEstimator=5)
        self.assertRaises(ValueError, create)

        class SomeUDC(object):
            pass

        create = lambda: KeyedEstimator(sklearnEstimator=SomeUDC())
        self.assertRaises(ValueError, create)
        # Must have fit().
        create = lambda: KeyedEstimator(sklearnEstimator=KeyedModelTests._CustomMissingFit())
        self.assertRaises(AttributeError, create)
        # Must have key columns.
        create = lambda: KeyedEstimator(sklearnEstimator=PCA(), keyCols=[])
        self.assertRaises(ValueError, create)
        # Columns can't have the "estimator" name in them.
        create = lambda: KeyedEstimator(sklearnEstimator=PCA(), keyCols=["key", "estimator"])
        self.assertRaises(ValueError, create)
        create = lambda: KeyedEstimator(sklearnEstimator=PCA(), xCol="estimator")
        self.assertRaises(ValueError, create)
        create = lambda: KeyedEstimator(sklearnEstimator=LinearRegression(), yCol="estimator")
        self.assertRaises(ValueError, create)
        create = lambda: KeyedEstimator(sklearnEstimator=PCA(), yCol="estimator")
        self.assertRaises(ValueError, create)
        # Presence of yCol requires a predictor.
        create = lambda: KeyedEstimator(sklearnEstimator=LinearRegression(), yCol="y",
                                        estimatorType="transformer")
        self.assertRaises(ValueError, create)
        create = lambda: KeyedEstimator(sklearnEstimator=LinearRegression(), yCol="y",
                                        estimatorType="clusterer")
        self.assertRaises(ValueError, create)
        # estimatorType must be one of the three options.
        create = lambda: KeyedEstimator(sklearnEstimator=PCA(), estimatorType="regressor")
        self.assertRaises(ValueError, create)
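
    # The cases above pin down the (assumed) duck-typing rules by which KeyedEstimator
    # infers estimatorType when none is given: fit_predict() marks an estimator as a
    # "clusterer" even if transform() is also present (_CustomClusterer); otherwise
    # transform() marks it as a "transformer" even if a predict() exists
    # (_CustomTransformer); and supplying yCol requires a "predictor". An explicit
    # estimatorType always overrides the inference, as the KMeans-as-transformer case
    # shows.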

    # Checks that only the model throws an AttributeError at transform time.
    def checkPredictionAttrError(self, keyedEstimator):
        schema = StructType().add("features", DoubleType()).add("key", LongType())
        yCol = keyedEstimator.getOrDefault("yCol")
        if yCol is not None:
            schema = schema.add(yCol, DoubleType())
        emptyDF = self.spark.createDataFrame([], schema=schema)
        keyedModel = keyedEstimator.fit(emptyDF)
        self.assertRaises(AttributeError, keyedModel.transform, emptyDF)

    def test_attr_error(self):
        # Each estimator below is declared with an estimatorType whose transform-time
        # method (predict or transform) it does not actually implement.
        ke = KeyedEstimator(sklearnEstimator=PCA(), estimatorType="clusterer")
        self.checkPredictionAttrError(ke)
        ke = KeyedEstimator(sklearnEstimator=PCA(), yCol="y", estimatorType="predictor")
        self.checkPredictionAttrError(ke)
        ke = KeyedEstimator(sklearnEstimator=DBSCAN(), estimatorType="transformer")
        self.checkPredictionAttrError(ke)
        ke = KeyedEstimator(sklearnEstimator=DBSCAN(), yCol="y", estimatorType="predictor")
        self.checkPredictionAttrError(ke)
        # LinearRegression() or any other predictor would actually fail at fit time if we
        # used a non-empty DF with the wrong estimatorType, since no y value would be
        # passed and scikit-learn would complain.

    def test_type_error(self):
        # String features are not an accepted xCol type.
        df = self.spark.createDataFrame([("a", 0), ("b", 0)]).toDF("features", "key")
        keyedPCA = KeyedEstimator(sklearnEstimator=PCA())
        self.assertRaises(TypeError, keyedPCA.fit, df)
        # An array-valued label column is not an accepted yCol type.
        df = self.spark.createDataFrame([(Vectors.dense([i]), [i], 0) for i in range(10)])
        df = df.toDF("features", "y", "key")
        keyedLR = KeyedEstimator(sklearnEstimator=LinearRegression(), yCol="y")
        self.assertRaises(TypeError, keyedLR.fit, df)
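
    # Summary of the equivalence strategy used below: build the same per-user datasets
    # twice - once as a Spark DataFrame fit through KeyedEstimator and once as local
    # NumPy arrays fit with cloned scikit-learn estimators - then assert that both the
    # fitted per-key models and their transform/predict outputs agree.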

    def checkKeyedModelEquivalent(self, minExamples, featureGen, labelGen, **kwargs):
        NUSERS = 10
        # featureGen() should generate scalars or rank-1 ndarrays of a consistent
        # length; labelGen() should generate a scalar.
        assert (labelGen is not None) == ("yCol" in kwargs)
        isPredictor = labelGen is not None
        # sklearn's LinearRegression estimator is stable even if underdetermined.
        # User keys are just [0, NUSERS), repeated in each column if there are multiple
        # key columns. The i-th user has max(minExamples, i + 1) training examples.
        keyCols = kwargs.get("keyCols", KeyedEstimator._paramSpecs["keyCols"]["default"])
        outputCol = kwargs.get("outputCol", KeyedEstimator._paramSpecs["outputCol"]["default"])
        xCol = kwargs.get("xCol", KeyedEstimator._paramSpecs["xCol"]["default"])

        nExamplesPerUser = lambda i: max(minExamples, i + 1)
        userKeys = [[i for _ in keyCols] for i in range(NUSERS)]
        features = [[featureGen() for _ in range(nExamplesPerUser(i))] for i in range(NUSERS)]
        useless = [["useless col" for _ in range(nExamplesPerUser(i))] for i in range(NUSERS)]
        if isPredictor:
            labels = [[labelGen() for _ in range(nExamplesPerUser(i))] for i in range(NUSERS)]
        else:
            labels = None

        # Fit the reference models locally, one cloned estimator per user.
        Xs = [np.vstack(x) for x in features]
        ys = [np.array(y) for y in labels] if isPredictor else repeat(None)
        localEstimators = [sklearn.base.clone(kwargs["sklearnEstimator"]).fit(X, y)
                           for X, y in zip(Xs, ys)]
        expectedDF = pd.DataFrame(userKeys, columns=keyCols)
        expectedDF["estimator"] = localEstimators

        def flattenAndConvertNumpy(x):
            return [Vectors.dense(i) if isinstance(i, np.ndarray) else i
                    for i in chain.from_iterable(x)]

        inputDF = pd.DataFrame.from_dict(
            {k: [i for i in range(NUSERS) for _ in range(nExamplesPerUser(i))] for k in keyCols})
        inputDF[xCol] = flattenAndConvertNumpy(features)
        inputDF["useless"] = flattenAndConvertNumpy(useless)
        if isPredictor:
            inputDF[kwargs["yCol"]] = flattenAndConvertNumpy(labels)
        inputDF = self.spark.createDataFrame(inputDF)

        ke = KeyedEstimator(**kwargs)
        km = ke.fit(inputDF)

        actualDF = km.keyedModels.toPandas()
        _assertPandasAlmostEqual(actualDF, expectedDF, keyCols)

        # Test users with different amounts of points.
        nTestPerUser = lambda i: NUSERS // 4 if i < NUSERS // 2 else NUSERS * 3 // 4
        testFeatures = [[featureGen() for _ in range(nTestPerUser(i))] for i in range(NUSERS)]
        # The "useless" column has nothing to do with the computation, but is essential
        # for keeping row order the same between the Spark and non-Spark versions.
        useless = [range(nTestPerUser(i)) for i in range(NUSERS)]

        inputDF = pd.DataFrame.from_dict(
            {k: [i for i in range(NUSERS) for _ in range(nTestPerUser(i))] for k in keyCols})
        inputDF[xCol] = flattenAndConvertNumpy(testFeatures)
        inputDF["useless"] = flattenAndConvertNumpy(useless)

        estimatorType = km.sklearnEstimatorType  # tested to be correct elsewhere

        def makeOutput(estimator, X):
            if estimatorType == "transformer":
                return estimator.transform(X)
            else:
                assert estimatorType in ("predictor", "clusterer")
                return estimator.predict(X).tolist()

        Xs = [np.vstack(x) for x in testFeatures]
        expectedOutput = map(makeOutput, localEstimators, Xs)
        expectedDF = inputDF.copy(deep=True)
        expectedDF[outputCol] = flattenAndConvertNumpy(expectedOutput)

        inputDF = self.spark.createDataFrame(inputDF)
        actualDF = km.transform(inputDF).toPandas()
        _assertPandasAlmostEqual(actualDF, expectedDF, keyCols + ["useless"])

    def test_transformer(self):
        minExamples = 2
        featureGen = lambda: np.random.random(KeyedModelTests.NDIM)
        labelGen = None
        self.checkKeyedModelEquivalent(minExamples, featureGen, labelGen,
                                       sklearnEstimator=PCA())

    def test_clusterer(self):
        minExamples = 3
        featureGen = lambda: np.random.random(KeyedModelTests.NDIM)
        labelGen = None
        self.checkKeyedModelEquivalent(minExamples, featureGen, labelGen,
                                       sklearnEstimator=KMeans(random_state=0,
                                                               n_clusters=minExamples))

    def test_regression_predictor(self):
        minExamples = 1
        featureGen = lambda: np.random.random(KeyedModelTests.NDIM)
        labelGen = lambda: np.random.random()
        self.checkKeyedModelEquivalent(minExamples, featureGen, labelGen,
                                       sklearnEstimator=LinearRegression(), yCol="y")
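
    # minExamples varies with the estimator above: KMeans requires at least n_clusters
    # samples per key and PCA needs more than one sample for nontrivial components,
    # while LinearRegression fits even a single (underdetermined) example.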

    def test_classification_predictor(self):
        minExamples = 2
        featureGen = lambda: np.random.random(KeyedModelTests.NDIM)
        # Need to ensure each user has at least one of each label to train on.
        cyc = cycle([-1, 1])
        labelGen = lambda: next(cyc)
        lr = LogisticRegression(solver='lbfgs', random_state=0)
        self.checkKeyedModelEquivalent(minExamples, featureGen, labelGen,
                                       sklearnEstimator=lr, yCol="y")

    def test_diff_type_input(self):
        # Integer array features.
        minExamples = 1
        featureGen = lambda: np.random.randint(low=0, high=10, size=KeyedModelTests.NDIM)
        labelGen = lambda: np.random.random()
        self.checkKeyedModelEquivalent(minExamples, featureGen, labelGen,
                                       sklearnEstimator=LinearRegression(), yCol="y")
        # Scalar float features.
        featureGen = lambda: np.random.random()
        self.checkKeyedModelEquivalent(minExamples, featureGen, labelGen,
                                       sklearnEstimator=LinearRegression(), yCol="y")
        # Scalar integer features.
        featureGen = lambda: np.random.randint(100)
        self.checkKeyedModelEquivalent(minExamples, featureGen, labelGen,
                                       sklearnEstimator=LinearRegression(), yCol="y")

    def test_no_defaults(self):
        minExamples = 1
        featureGen = lambda: np.random.random(KeyedModelTests.NDIM)
        labelGen = lambda: np.random.random()
        self.checkKeyedModelEquivalent(minExamples, featureGen, labelGen,
                                       sklearnEstimator=LinearRegression(), yCol="myy",
                                       xCol="myfeatures", keyCols=["mykey1", "mykey2"])

    def test_surprise_key(self):
        ke = KeyedEstimator(sklearnEstimator=PCA())
        schema = StructType().add("features", LongType()).add("key", LongType())
        df = self.spark.createDataFrame([], schema)
        km = ke.fit(df)

        self.assertEqual(km.keyedModels.collect(), [])
        self.assertEqual(km.keyedModels.dtypes,
                         [("key", LongType().simpleString()),
                          ("estimator", "sklearn-estimator")])

        # A key unseen at fit time (a "surprise" key) yields a null output on transform.
        df = self.spark.createDataFrame([(1, 2)], schema)
        df = km.transform(df)
        self.assertEqual(df.collect(), [(1, 2, None)])
        self.assertEqual(df.dtypes,
                         [("features", "bigint"), ("key", "bigint"), ("output", "vector")])
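

# A minimal sketch of a standalone entry point, assuming this module may be run directly
# rather than through the project's test runner; RandomTest is assumed to be a
# unittest.TestCase subclass.
if __name__ == "__main__":
    import unittest
    unittest.main()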