"""Tests for splearn.pipeline: SparkFeatureUnion and SparkPipeline.

Each test builds the same model twice -- once with a local scikit-learn
estimator and once with its distributed splearn counterpart -- then checks
that both produce identical feature names, transforms and predictions.
"""
import numpy as np
import scipy.sparse as sp
from sklearn.base import clone
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.utils.testing import (assert_array_equal, assert_equal,
                                   assert_false, assert_raises, assert_true)
from splearn import ArrayRDD, DictRDD
from splearn.decomposition import SparkTruncatedSVD
from splearn.feature_extraction.text import SparkCountVectorizer
from splearn.feature_selection import SparkVarianceThreshold
from splearn.linear_model.logistic import SparkLogisticRegression
from splearn.pipeline import (SparkFeatureUnion, SparkPipeline,
                              make_sparkunion)
from splearn.utils.testing import SplearnTestCase


class IncorrectT(object):
    """Mock estimator *without* a ``fit`` method.

    Used to check that pipelines reject steps that do not implement the
    estimator interface, and as a base for parameter-dispatching mocks.
    """

    def __init__(self, a=None, b=None):
        self.a = a
        self.b = b


class T(IncorrectT):
    """Minimal fittable mock that records the fit params it receives."""

    def fit(self, Z, **fit_params):
        # Remember what was passed so tests can assert on dispatching.
        self.fit_params = fit_params
        return self

    def get_params(self, deep=False):
        return {'a': self.a, 'b': self.b}

    def set_params(self, **params):
        # Deliberately only honours 'a' -- mirrors the original mock.
        self.a = params['a']
        return self


class TransfT(T):
    """Identity transformer built on the ``T`` mock."""

    def transform(self, Z):
        return Z


class FitParamT(object):
    """Mock classifier whose prediction echoes the ``should_succeed``
    fit parameter it was trained with."""

    def __init__(self):
        self.successful = False

    def fit(self, Z, should_succeed=False):
        self.successful = should_succeed

    def predict(self, Z):
        return self.successful


class TestFeatureUnion(SplearnTestCase):

    def test_same_result_withdictrdd(self):
        """SparkFeatureUnion on a DictRDD matches the local FeatureUnion."""
        X, X_rdd = self.make_text_rdd(2)
        Y_rdd = ArrayRDD(self.sc.parallelize([None] * len(X), 4), bsize=2)
        Z = DictRDD([X_rdd, Y_rdd], columns=("X", "y"), bsize=2)

        loc_char = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
        dist_char = SparkCountVectorizer(analyzer="char_wb",
                                         ngram_range=(3, 3))
        loc_word = CountVectorizer(analyzer="word")
        loc_word_2 = CountVectorizer(analyzer="word")
        dist_word = SparkCountVectorizer(analyzer="word")
        dist_word_2 = SparkCountVectorizer(analyzer="word")

        loc_union = FeatureUnion([
            ("chars", loc_char),
            ("words", loc_word),
            ("words2", loc_word_2)
        ])
        dist_union = SparkFeatureUnion([
            ("chars", dist_char),
            ("words", dist_word),
            ("words2", dist_word_2)
        ])

        # test same feature names
        loc_union.fit(X)
        dist_union.fit(Z)
        converted_union = dist_union.to_scikit()
        # BUGFIX: assert_equal takes (actual, expected[, msg]); the original
        # three-argument call passed the converted union's feature names as
        # the failure *message*, so they were never compared. Compare each
        # pair explicitly instead.
        assert_equal(loc_union.get_feature_names(),
                     dist_union.get_feature_names())
        assert_equal(loc_union.get_feature_names(),
                     converted_union.get_feature_names())

        # test same results
        Z_transformed = sp.vstack(dist_union.transform(Z)[:, 'X'].collect())
        assert_array_equal(loc_union.transform(X).toarray(),
                           Z_transformed.toarray())
        assert_array_equal(loc_union.transform(X).toarray(),
                           converted_union.transform(X).toarray())

        # test same results with fit_transform
        X_transformed = loc_union.fit_transform(X)
        X_converted_transformed = converted_union.fit_transform(X)
        Z_transformed = sp.vstack(
            dist_union.fit_transform(Z)[:, 'X'].collect())
        assert_array_equal(X_transformed.toarray(),
                           Z_transformed.toarray())
        assert_array_equal(X_transformed.toarray(),
                           X_converted_transformed.toarray())

        # test same results in parallel
        loc_union_par = FeatureUnion([
            ("chars", loc_char),
            ("words", loc_word)
        ], n_jobs=2)
        dist_union_par = SparkFeatureUnion([
            ("chars", dist_char),
            ("words", dist_word)
        ], n_jobs=2)

        loc_union_par.fit(X)
        dist_union_par.fit(Z)
        converted_union = dist_union_par.to_scikit()
        X_transformed = loc_union_par.transform(X)
        Z_transformed = sp.vstack(
            dist_union_par.transform(Z)[:, 'X'].collect())
        assert_array_equal(X_transformed.toarray(),
                           Z_transformed.toarray())
        assert_array_equal(X_transformed.toarray(),
                           converted_union.transform(X).toarray())

    def test_same_result(self):
        """SparkFeatureUnion on an ArrayRDD matches the local FeatureUnion."""
        X, Z = self.make_text_rdd(2)

        loc_char = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
        dist_char = SparkCountVectorizer(analyzer="char_wb",
                                         ngram_range=(3, 3))
        loc_word = CountVectorizer(analyzer="word")
        dist_word = SparkCountVectorizer(analyzer="word")

        loc_union = FeatureUnion([
            ("chars", loc_char),
            ("words", loc_word)
        ])
        dist_union = SparkFeatureUnion([
            ("chars", dist_char),
            ("words", dist_word)
        ])

        # test same feature names
        loc_union.fit(X)
        dist_union.fit(Z)
        assert_equal(
            loc_union.get_feature_names(),
            dist_union.get_feature_names()
        )

        # test same results
        X_transformed = loc_union.transform(X)
        Z_transformed = sp.vstack(dist_union.transform(Z).collect())
        assert_array_equal(X_transformed.toarray(),
                           Z_transformed.toarray())

        # test same results with fit_transform
        X_transformed = loc_union.fit_transform(X)
        Z_transformed = sp.vstack(dist_union.fit_transform(Z).collect())
        assert_array_equal(X_transformed.toarray(),
                           Z_transformed.toarray())

        # test same results in parallel
        loc_union_par = FeatureUnion([
            ("chars", loc_char),
            ("words", loc_word)
        ], n_jobs=2)
        dist_union_par = SparkFeatureUnion([
            ("chars", dist_char),
            ("words", dist_word)
        ], n_jobs=2)

        loc_union_par.fit(X)
        dist_union_par.fit(Z)
        X_transformed = loc_union_par.transform(X)
        Z_transformed = sp.vstack(dist_union_par.transform(Z).collect())
        assert_array_equal(X_transformed.toarray(),
                           Z_transformed.toarray())

    def test_same_result_weight(self):
        """Transformer weights are applied identically in both unions."""
        X, Z = self.make_text_rdd(2)

        loc_char = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
        dist_char = SparkCountVectorizer(analyzer="char_wb",
                                         ngram_range=(3, 3))
        loc_word = CountVectorizer(analyzer="word")
        dist_word = SparkCountVectorizer(analyzer="word")

        loc_union = FeatureUnion([
            ("chars", loc_char),
            ("words", loc_word)
        ], transformer_weights={"words": 10})
        dist_union = SparkFeatureUnion([
            ("chars", dist_char),
            ("words", dist_word)
        ], transformer_weights={"words": 10})

        loc_union.fit(X)
        dist_union.fit(Z)

        X_transformed = loc_union.transform(X)
        Z_transformed = sp.vstack(dist_union.transform(Z).collect())
        assert_array_equal(X_transformed.toarray(),
                           Z_transformed.toarray())

    def test_make_union(self):
        """make_sparkunion auto-names steps after the estimator classes."""
        svd = SparkTruncatedSVD()
        mock = TransfT()
        fu = make_sparkunion(svd, mock)
        names, transformers = list(zip(*fu.transformer_list))
        assert_equal(names, ("sparktruncatedsvd", "transft"))
        assert_equal(transformers, (svd, mock))


class TestPipeline(SplearnTestCase):

    def test_pipeline_init(self):
        """Exercise the various init parameters of SparkPipeline."""
        # A pipeline cannot be constructed without steps.
        assert_raises(TypeError, SparkPipeline)

        # Steps without a fit method are rejected.
        # BUGFIX: the original assigned the (None) return value of
        # assert_raises to `pipe`; the assignment was dead code.
        assert_raises(TypeError, SparkPipeline, [('svc', IncorrectT)])

        # Smoke test with only an estimator.
        clf = T()
        pipe = SparkPipeline([('svc', clf)])
        assert_equal(pipe.get_params(deep=True),
                     dict(svc__a=None, svc__b=None, svc=clf,
                          **pipe.get_params(deep=False)))

        # Check that params are set.
        pipe.set_params(svc__a=0.1)
        assert_equal(clf.a, 0.1)
        assert_equal(clf.b, None)
        # Smoke test the repr:
        repr(pipe)

        # Test with two objects.
        vect = SparkCountVectorizer()
        vt_filter = SparkVarianceThreshold()  # renamed: don't shadow filter()
        pipe = SparkPipeline([('vect', vect), ('filter', vt_filter)])

        # Check that we can't use the same stage name twice.
        assert_raises(ValueError, SparkPipeline,
                      [('vect', vect), ('vect', vect)])

        # Check that params are set.
        pipe.set_params(vect__min_df=0.1)
        assert_equal(vect.min_df, 0.1)
        # Smoke test the repr:
        repr(pipe)

        # Check that params are not set when naming them wrong.
        assert_raises(ValueError, pipe.set_params, filter__min_df=0.1)

        # Test clone.
        pipe2 = clone(pipe)
        assert_false(pipe.named_steps['vect'] is pipe2.named_steps['vect'])

        # Check that, apart from estimators, the parameters are the same.
        params = pipe.get_params(deep=True)
        params2 = pipe2.get_params(deep=True)
        for x in pipe.get_params(deep=False):
            params.pop(x)
        for x in pipe2.get_params(deep=False):
            params2.pop(x)
        # Remove estimators that were copied by clone.
        params.pop('vect')
        params.pop('filter')
        params2.pop('vect')
        params2.pop('filter')
        assert_equal(params, params2)

    def test_pipeline_same_results(self):
        """SparkPipeline predictions agree with the local Pipeline."""
        X, y, Z = self.make_classification(2, 10000, 2000)

        loc_clf = LogisticRegression()
        loc_filter = VarianceThreshold()
        loc_pipe = Pipeline([
            ('threshold', loc_filter),
            ('logistic', loc_clf)
        ])

        dist_clf = SparkLogisticRegression()
        dist_filter = SparkVarianceThreshold()
        dist_pipe = SparkPipeline([
            ('threshold', dist_filter),
            ('logistic', dist_clf)
        ])

        dist_filter.fit(Z)

        loc_pipe.fit(X, y)
        dist_pipe.fit(Z, logistic__classes=np.unique(y))

        # Distributed training is approximate; allow <10% label disagreement.
        assert_true(np.mean(np.abs(
            loc_pipe.predict(X) -
            np.concatenate(dist_pipe.predict(Z[:, 'X']).collect())
        )) < 0.1)