Python pyspark.ml.Pipeline() Examples

The following are code examples showing how to use pyspark.ml.Pipeline(). They are taken from open source Python projects.

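
Before the project examples, here is a minimal, self-contained sketch of the basic Pipeline workflow (the column names and toy data are made up for illustration): a Pipeline chains Transformers and Estimators, fit() produces a PipelineModel, and transform() applies every fitted stage in order.

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, HashingTF
from pyspark.ml.classification import LogisticRegression

spark = SparkSession.builder.getOrCreate()

# Toy training data: (id, text, label)
train = spark.createDataFrame([
    (0, "spark pipelines are composable", 1.0),
    (1, "completely unrelated sentence", 0.0),
], ["id", "text", "label"])

# Chain two Transformers (Tokenizer, HashingTF) and an Estimator (LogisticRegression).
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashing_tf = HashingTF(inputCol="words", outputCol="features", numFeatures=1 << 10)
lr = LogisticRegression(maxIter=10, labelCol="label")

pipeline = Pipeline(stages=[tokenizer, hashing_tf, lr])
model = pipeline.fit(train)  # returns a PipelineModel
model.transform(train).select("text", "prediction").show()
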
Example 1
Project: sparkflow   Author: lifeomic   File: pipeline_util.py    MIT License
def unwrap(pipeline):
        if not (isinstance(pipeline, Pipeline) or isinstance(pipeline, PipelineModel)):
            raise TypeError("Cannot recognize a pipeline of type %s." % type(pipeline))

        stages = pipeline.getStages() if isinstance(pipeline, Pipeline) else pipeline.stages
        for i, stage in enumerate(stages):
            if (isinstance(stage, Pipeline) or isinstance(stage, PipelineModel)):
                stages[i] = PysparkPipelineWrapper.unwrap(stage)
            if isinstance(stage, PysparkObjId._getCarrierClass()) and stage.getStopWords()[-1] == PysparkObjId._getPyObjId():
                swords = stage.getStopWords()[:-1] # strip the id
                py_obj = load_byte_array(swords)
                stages[i] = py_obj

        if isinstance(pipeline, Pipeline):
            pipeline.setStages(stages)
        else:
            pipeline.stages = stages
        return pipeline 
Example 2
Project: spark-deep-learning   Author: databricks   File: named_image_test.py    Apache License 2.0
def test_featurizer_in_pipeline(self):
        """
        Tests that featurizer fits into an MLlib Pipeline.
        Does not test how good the featurization is for generalization.
        """
        featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features",
                                         modelName=self.name)
        lr = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3, labelCol="label")
        pipeline = Pipeline(stages=[featurizer, lr])

        # add arbitrary labels to run logistic regression
        # TODO: it's weird that the test fails on some combinations of labels. check why.
        label_udf = udf(lambda x: abs(hash(x)) % 2, IntegerType())
        train_df = self.imageDF.withColumn("label", label_udf(self.imageDF["image"]["origin"]))

        lrModel = pipeline.fit(train_df)
        # see if we at least get the training examples right.
        # with 5 examples and e.g. 131k features (for InceptionV3), it ought to.
        pred_df_collected = lrModel.transform(train_df).collect()
        for row in pred_df_collected:
            self.assertEqual(int(row.prediction), row.label) 
Example 3
Project: SparkClean   Author: NYUBigDataProject   File: df_transformer.py    Apache License 2.0
def string_to_index(self, input_cols):
        """
        Maps a string column of labels to an ML column of label indices. If the input column is
        numeric, we cast it to string and index the string values.
        :param input_cols: Columns to be indexed.
        :return: Dataframe with indexed columns.
        """

        # The columns argument must be a string or a list of strings:
        self._assert_type_str_or_list(input_cols, "input_cols")

        if isinstance(input_cols, str):
            input_cols = [input_cols]

        from pyspark.ml import Pipeline
        from pyspark.ml.feature import StringIndexer

        indexers = [StringIndexer(inputCol=column, outputCol=column + "_index").fit(self._df) for column in
                    list(set(input_cols))]

        pipeline = Pipeline(stages=indexers)
        self._df = pipeline.fit(self._df).transform(self._df)

        return self 
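
For reference, the StringIndexer stage used above can also be applied on its own; a minimal sketch (toy data, assumes an active SparkSession named spark):

from pyspark.ml.feature import StringIndexer

df = spark.createDataFrame([("a",), ("b",), ("a",), ("c",)], ["category"])

indexer = StringIndexer(inputCol="category", outputCol="category_index")
indexer.fit(df).transform(df).show()
# By default the most frequent label ("a") receives index 0.0.
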
Example 4
Project: SparkClean   Author: NYUBigDataProject   File: df_transformer.py    Apache License 2.0
def index_to_string(self, input_cols):
        """
        Maps a column of indices back to a new column of corresponding string values. The index-string mapping is
        either from the ML attributes of the input column, or from user-supplied labels (which take precedence over
        ML attributes).
        :param input_cols: Index columns to be mapped back to string values.
        :return: Dataframe with the converted string columns.
        """

        # The columns argument must be a string or a list of strings:
        self._assert_type_str_or_list(input_cols, "input_cols")

        if isinstance(input_cols, str):
            input_cols = [input_cols]

        from pyspark.ml import Pipeline
        from pyspark.ml.feature import IndexToString

        indexers = [IndexToString(inputCol=column, outputCol=column + "_string") for column in
                    list(set(input_cols))]

        pipeline = Pipeline(stages=indexers)
        self._df = pipeline.fit(self._df).transform(self._df)

        return self 
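
IndexToString simply reverses a StringIndexer mapping, reading the labels from the ML attributes of the input column; a short sketch that reuses the df from the StringIndexer sketch above:

from pyspark.ml.feature import IndexToString

indexed = StringIndexer(inputCol="category", outputCol="category_index").fit(df).transform(df)
converter = IndexToString(inputCol="category_index", outputCol="category_string")
converter.transform(indexed).select("category", "category_index", "category_string").show()
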
Example 5
Project: SparkClean   Author: NYUBigDataProject   File: df_transformer.py    Apache License 2.0
def one_hot_encoder(self, input_cols):
        """
        Maps a column of label indices to a column of binary vectors, with at most a single one-value.
        :param input_cols: Columns to be encoded.
        :return: Dataframe with encoded columns.
        """

        # The columns argument must be a string or a list of strings:
        self._assert_type_str_or_list(input_cols, "input_cols")

        if isinstance(input_cols, str):
            input_cols = [input_cols]

        from pyspark.ml import Pipeline
        from pyspark.ml.feature import OneHotEncoder

        encode = [OneHotEncoder(inputCol=column, outputCol=column + "_encoded") for column in
                  list(set(input_cols))]

        pipeline = Pipeline(stages=encode)
        self._df = pipeline.fit(self._df).transform(self._df)

        return self 
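
Note that the OneHotEncoder API changed between Spark versions: in 2.x it was a single-column Transformer (as used above), while from 3.0 on it is an Estimator with a multi-column API that must be fit. A minimal sketch of the usual index-then-encode chain, written against the 3.x API (toy data, assumes an active SparkSession named spark):

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder

df = spark.createDataFrame([("a",), ("b",), ("c",), ("a",)], ["category"])

indexer = StringIndexer(inputCol="category", outputCol="category_index")
encoder = OneHotEncoder(inputCols=["category_index"], outputCols=["category_encoded"])

Pipeline(stages=[indexer, encoder]).fit(df).transform(df).show(truncate=False)
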
Example 6
Project: SparkClean   Author: NYUBigDataProject   File: df_transformer.py    Apache License 2.0
def vector_assembler(self, input_cols):
        """
        Combines a given list of columns into a single vector column.
        :param input_cols: Columns to be assembled.
        :return: Dataframe with assembled column.
        """

        # The columns argument must be a string or a list of strings:
        self._assert_type_str_or_list(input_cols, "input_cols")

        if isinstance(input_cols, str):
            input_cols = [input_cols]

        from pyspark.ml import Pipeline
        from pyspark.ml.feature import VectorAssembler

        assembler = [VectorAssembler(inputCols=input_cols, outputCol="features")]

        pipeline = Pipeline(stages=assembler)
        self._df = pipeline.fit(self._df).transform(self._df)

        return self 
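
VectorAssembler itself is a plain Transformer, so wrapping it in a Pipeline as above is optional; a minimal standalone sketch (toy columns, assumes an active SparkSession named spark):

from pyspark.ml.feature import VectorAssembler

df = spark.createDataFrame([(1.0, 0.5, 3.0), (2.0, 1.5, 0.0)], ["x1", "x2", "x3"])

assembler = VectorAssembler(inputCols=["x1", "x2", "x3"], outputCol="features")
assembler.transform(df).show(truncate=False)
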
Example 7
Project: spark-mlpipeline-for-ctr   Author: chenxinye   File: spark_mlpipeline.py    MIT License
def Fe_gneer(self):
        """feature selection and transformation
        """
        self.selector = ft.ChiSqSelector(
                                    numTopFeatures=int(self.alpha*len(self.fe_col)), 
                                    featuresCol=self.vecAss.getOutputCol(), 
                                    outputCol='features',
                                    labelCol='label'
                                )
        
        self.pipeline = Pipeline(stages=[self.vecAss, self.selector])
        
        self.data_transformer = self.pipeline.fit(self.train_data)
        
        self.train_data = self.data_transformer.transform(self.train_data)
        self.test_data  = self.data_transformer.transform(self.test_data) 
Example 8
Project: cdsw-simple-serving-python   Author: chezou   File: data_science.py    Apache License 2.0
def build_model(training):
  #training = read_data()
  training.cache()
  
  columns = training.columns
  columns.remove("Occupancy")
  
  assembler = VectorAssembler(inputCols=columns, outputCol="featureVec")
  lr = LogisticRegression(featuresCol="featureVec", labelCol="Occupancy")
  
  pipeline = Pipeline(stages=[assembler, lr])
  
  param_grid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.0001, 0.001, 0.01, 0.1, 1.0]) \
    .build()
  
  evaluator = BinaryClassificationEvaluator(labelCol="Occupancy")
  
  validator = TrainValidationSplit(estimator=pipeline,
                             estimatorParamMaps=param_grid,
                             evaluator=evaluator,
                             trainRatio=0.9)
  
  validator_model = validator.fit(training)
  return validator_model.bestModel 
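
The bestModel returned here is a fitted PipelineModel; a hedged sketch of how it might be used downstream (the test DataFrame is an assumption for illustration):

# Hypothetical usage of the model returned by build_model().
best_model = build_model(training)

# Apply the fitted assembler + logistic regression to held-out data.
predictions = best_model.transform(test)
predictions.select("Occupancy", "prediction", "probability").show(5)

# Inspect the tuned LogisticRegression stage.
lr_model = best_model.stages[-1]
print(lr_model.coefficients)
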
Example 9
Project: python_mozetl   Author: mozilla   File: taar_similarity.py    MIT License
def compute_clusters(addons_df, num_clusters, random_seed):
    """ Performs user clustering by using add-on ids as features.
    """

    # Build the stages of the pipeline. We need hashing to make the next
    # steps work.
    hashing_stage = HashingTF(inputCol="addon_ids", outputCol="hashed_features")
    idf_stage = IDF(inputCol="hashed_features", outputCol="features", minDocFreq=1)
    # As a future improvement, we may add a sane value for the minimum cluster size
    # to BisectingKMeans (e.g. minDivisibleClusterSize). For now, just make sure
    # to pass along the random seed if needed for tests.
    kmeans_kwargs = {"seed": random_seed} if random_seed else {}
    bkmeans_stage = BisectingKMeans(k=num_clusters, **kmeans_kwargs)
    pipeline = Pipeline(stages=[hashing_stage, idf_stage, bkmeans_stage])

    # Run the pipeline and compute the results.
    model = pipeline.fit(addons_df)
    return model.transform(addons_df).select(["client_id", "prediction"]) 
Example 10
Project: nsf_data_ingestion   Author: sciosci   File: tfidf_model.py    Apache License 2.0
def fit_tfidf_pipeline(content_df):
    tokenizer = RegexTokenizer(). \
        setGaps(False). \
        setPattern('\\p{L}+'). \
        setInputCol('content'). \
        setOutputCol('words')

    sw = StopWordsRemover() \
        .setStopWords(stop_words) \
        .setCaseSensitive(False) \
        .setInputCol("words") \
        .setOutputCol("filtered")

    cv = CountVectorizer(). \
        setInputCol('filtered'). \
        setOutputCol('tf'). \
        setMinTF(1). \
        setMinDF(10). \
        setVocabSize(2 ** 17)

    # Fit the tokenizer, stop-words and count-vectorizer stages on content_df
    cv_transformer = Pipeline(stages=[tokenizer, sw, cv]).fit(content_df)

    idf = IDF(minDocFreq=10). \
        setInputCol('tf'). \
        setOutputCol('tfidf')

    tfidf_transformer = Pipeline(stages=[cv_transformer, idf]).fit(content_df)

    return tfidf_transformer 
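
Note that the fitted cv_transformer (a PipelineModel) is itself used as a stage of the second Pipeline; fitted models are Transformers, so this kind of nesting is legal. A hedged usage sketch of the returned transformer (content_df is whatever DataFrame with a 'content' column the caller supplies):

# Hypothetical usage: fit once, then project any content DataFrame to TF-IDF space.
tfidf_model = fit_tfidf_pipeline(content_df)
tfidf_model.transform(content_df).select('content', 'tfidf').show(3, truncate=False)
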
Example 11
Project: elephas   Author: maxpumperla   File: test_ml_model.py    MIT License
def test_spark_ml_model(spark_context):

    df = to_data_frame(spark_context, x_train, y_train, categorical=True)
    test_df = to_data_frame(spark_context, x_test, y_test, categorical=True)

    sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    sgd_conf = optimizers.serialize(sgd)

    # Initialize Spark ML Estimator
    estimator = ElephasEstimator()
    estimator.set_keras_model_config(model.to_yaml())
    estimator.set_optimizer_config(sgd_conf)
    estimator.set_mode("synchronous")
    estimator.set_loss("categorical_crossentropy")
    estimator.set_metrics(['acc'])
    estimator.set_epochs(epochs)
    estimator.set_batch_size(batch_size)
    estimator.set_validation_split(0.1)
    estimator.set_categorical_labels(True)
    estimator.set_nb_classes(nb_classes)

    # Fitting a model returns a Transformer
    pipeline = Pipeline(stages=[estimator])
    fitted_pipeline = pipeline.fit(df)

    # Evaluate Spark model by evaluating the underlying model
    prediction = fitted_pipeline.transform(test_df)
    pnl = prediction.select("label", "prediction")
    pnl.show(100)

    prediction_and_label = pnl.rdd.map(lambda row: (row.label, row.prediction))
    metrics = MulticlassMetrics(prediction_and_label)
    print(metrics.precision())
    print(metrics.recall()) 
Example 12
Project: atap   Author: foxbook   File: sc_vectorization.py    Apache License 2.0
def make_vectorizer(stopwords=True, tfidf=True, n_features=5000):
    # Creates a vectorization pipeline that starts with tokenization
    stages = [
        Tokenizer(inputCol="text", outputCol="tokens"),
    ]

    # Append stopwords to the pipeline if requested
    if stopwords:
        stages.append(
            StopWordsRemover(
                caseSensitive=False, outputCol="filtered_tokens",
                inputCol=stages[-1].getOutputCol(),
            ),
        )

    # Create the Hashing term frequency vectorizer
    stages.append(
        HashingTF(
            numFeatures=n_features,
            inputCol=stages[-1].getOutputCol(),
            outputCol="frequency"
        )
    )

    # Append the IDF vectorizer if requested
    if tfidf:
        stages.append(
            IDF(inputCol=stages[-1].getOutputCol(), outputCol="tfidf")
        )

    # Return the completed pipeline
    return Pipeline(stages=stages)


## Main functionality 
Example 13
Project: atap   Author: foxbook   File: sc_classification.py    Apache License 2.0
def main(sc, spark):
    # Load and vectorize the corpus
    corpus = load_corpus(sc, spark)
    vector = make_vectorizer().fit(corpus)

    # Index the labels of the classification
    labelIndex = StringIndexer(inputCol="label", outputCol="indexedLabel")
    labelIndex = labelIndex.fit(corpus)

    # Split the data into training and test sets
    training, test = corpus.randomSplit([0.8, 0.2])

    # Create the classifier
    clf = LogisticRegression(
        maxIter=10, regParam=0.3, elasticNetParam=0.8,
        family="multinomial", labelCol="indexedLabel", featuresCol="tfidf")

    # Create the model
    model = Pipeline(stages=[
        vector, labelIndex, clf
    ]).fit(training)

    # Make predictions
    predictions = model.transform(test)
    predictions.select("prediction", "indexedLabel", "tfidf").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))

    gbtModel = model.stages[2]
    print(gbtModel)  # summary only 
Example 14
Project: atap   Author: foxbook   File: sc_clustering.py    Apache License 2.0
def main(sc, spark):
    # Load the Corpus
    corpus = load_corpus(sc, spark)

    # Create the vector/cluster pipeline
    pipeline = Pipeline(stages=[
        Tokenizer(inputCol="text", outputCol="tokens"),
        Word2Vec(vectorSize=7, minCount=0, inputCol="tokens", outputCol="vecs"),
        BisectingKMeans(k=10, featuresCol="vecs", maxIter=10),
    ])

    # Fit the model
    model = pipeline.fit(corpus)
    corpus = model.transform(corpus)

    # Evaluate clustering.
    bkm = model.stages[-1]
    cost = bkm.computeCost(corpus)
    sizes = bkm.summary.clusterSizes

    # TODO: compute cost of each cluster individually

    # Get the text representation of each cluster.
    wvec = model.stages[-2]
    table = [["Cluster", "Size", "Terms"]]
    for ci, c in enumerate(bkm.clusterCenters()):
        ct = wvec.findSynonyms(c, 7)
        size = sizes[ci]
        terms = " ".join([row.word for row in ct.take(7)])
        table.append([ci, size, terms])

    # Print Results
    print(tabulate(table))
    print("Sum of square distance to center: {:0.3f}".format(cost)) 
Example 15
Project: Yelp-Recommender   Author: xiangdaniel   File: yelp_nlp.py    MIT License
def main(inputs, output):
    # 1. Load Data and Select only business_id, stars, text
    data = spark.read.json(inputs, schema=review_schema).repartition(50).select('business_id', 'stars', 'text')
    data = data.where(data['text'].isNotNull())  # filter reviews with no text

    # 2. ML pipeline: Tokenization (with Regular Expression) and Remove Stop Words
    regex_tokenizer = RegexTokenizer(inputCol='text', outputCol='words', pattern='[^A-Za-z]+')
    stopwords_remover = StopWordsRemover(inputCol='words',
                                         outputCol='tokens',
                                         stopWords=StopWordsRemover.loadDefaultStopWords('english'))
    # count_vectorizer = CountVectorizer(inputCol='filtered_words', outputCol='features')
    nlp_pipeline = Pipeline(stages=[regex_tokenizer, stopwords_remover])
    model = nlp_pipeline.fit(data)
    review = model.transform(data).select('business_id', 'stars', 'tokens')

    # 3. Select Features
    review = review.select(review['business_id'], review['stars'], udf_morphy(review['tokens']).alias('tokens'))
    review = review.where(functions.size(review['tokens']) > 0)
    review = review.withColumn('classify_tokens', udf_classify_tokens(review['tokens']))

    # 4. Calculate Feature Weights
    review = review.withColumn('feature_weights', udf_senti_score(review['classify_tokens']))
    review = review.withColumn('food', review['stars'] * review['feature_weights'][0])
    review = review.withColumn('environment', review['stars'] * review['feature_weights'][1])
    review = review.withColumn('service', review['stars'] * review['feature_weights'][2])
    review = review.withColumn('price', review['stars'] * review['feature_weights'][3])

    # 5. Calculate Average Feature Weights
    review_new = review.select('business_id', 'stars', 'food', 'environment', 'service', 'price')
    review_new = review_new.groupby('business_id').agg(
        functions.mean('stars').alias('ave_stars'),
        functions.mean('food').alias('food'),
        functions.mean('environment').alias('environment'),
        functions.mean('service').alias('service'),
        functions.mean('price').alias('price')
    )

    # 6. Save
    review_new.write.csv(output, mode='overwrite') 
Example 16
Project: SparkClean   Author: NYUBigDataProject   File: df_transformer.py    Apache License 2.0
def normalizer(self, input_cols, p=2.0):
        """
        Transforms a dataset of Vector rows, normalizing each Vector to have unit norm. It takes parameter p, which
        specifies the p-norm used for normalization. (p=2) by default.
        :param input_cols: Columns to be normalized.
        :param p:  p-norm used for normalization.
        :return: Dataframe with normalized columns.
        """

        # The columns argument must be a string or a list of strings:
        self._assert_type_str_or_list(input_cols, "input_cols")

        if isinstance(input_cols, str):
            input_cols = [input_cols]

        assert isinstance(p, (float, int)), "Error: p argument must be a numeric value."

        from pyspark.ml import Pipeline
        from pyspark.ml.feature import Normalizer

        normal = [Normalizer(inputCol=column, outputCol=column + "_normalized", p=p) for column in
                  list(set(input_cols))]

        pipeline = Pipeline(stages=normal)
        self._df = pipeline.fit(self._df).transform(self._df)

        return self 
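
Normalizer operates on Vector columns, so the input columns here are normally vectors produced by something like VectorAssembler first; a minimal standalone sketch (toy data, assumes an active SparkSession named spark):

from pyspark.ml.feature import Normalizer
from pyspark.ml.linalg import Vectors

df = spark.createDataFrame([(Vectors.dense([3.0, 4.0]),)], ["features"])

# L2-normalize each vector (p=2, the same default used above).
normalizer = Normalizer(inputCol="features", outputCol="features_normalized", p=2.0)
normalizer.transform(df).show(truncate=False)
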
Example 17
Project: PipelineTuning   Author: BryanCutler   File: pipeline_tuning.py    Apache License 2.0
def _fit(self, dataset):
        current_estimator = self.getEstimator()

        # Not a Pipeline, use standard CrossValidator
        if not isinstance(current_estimator, Pipeline):
            return super(DagCrossValidator, self)._fit(dataset)
        # Delegate parallelism to DagPipeline
        elif not isinstance(current_estimator, DagPipeline):
            dag_pipeline = DagPipeline(stages=current_estimator.getStages(),
                                       parallelism=self.getParallelism())
        # Already a DagPipeline
        else:
            dag_pipeline = current_estimator

        epm = self.getOrDefault(self.estimatorParamMaps)
        numModels = len(epm)
        eva = self.getOrDefault(self.evaluator)
        nFolds = self.getOrDefault(self.numFolds)
        seed = self.getOrDefault(self.seed)
        h = 1.0 / nFolds
        randCol = self.uid + "_rand"
        df = dataset.select("*", rand(seed).alias(randCol))
        metrics = [0.0] * numModels

        for i in range(nFolds):
            validateLB = i * h
            validateUB = (i + 1) * h
            condition = (df[randCol] >= validateLB) & (df[randCol] < validateUB)
            validation = df.filter(condition).cache()
            train = df.filter(~condition).cache()

            fold_metrics = dag_pipeline.evaluate(epm, train, validation, eva)

            for j in range(len(metrics)):
                metrics[j] += fold_metrics[j] / nFolds

            validation.unpersist()
            train.unpersist()

        if eva.isLargerBetter():
            bestIndex = np.argmax(metrics)
        else:
            bestIndex = np.argmin(metrics)

        bestModel = current_estimator.fit(dataset, epm[bestIndex])

        return self._copyValues(CrossValidatorModel(bestModel, metrics))
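
For comparison, the stock CrossValidator that this class extends follows the same fit pattern without the per-stage DAG optimization; a minimal sketch with a Pipeline as the estimator (toy stages, assumes a DataFrame df with 'text' and binary 'label' columns):

from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashing_tf = HashingTF(inputCol="words", outputCol="features")
lr = LogisticRegression(labelCol="label")
pipeline = Pipeline(stages=[tokenizer, hashing_tf, lr])

grid = ParamGridBuilder() \
    .addGrid(hashing_tf.numFeatures, [1 << 10, 1 << 14]) \
    .addGrid(lr.regParam, [0.01, 0.1]) \
    .build()

cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=grid,
                    evaluator=BinaryClassificationEvaluator(labelCol="label"),
                    numFolds=3)
cv_model = cv.fit(df)  # each fold refits the entire pipeline for every param map
best_pipeline_model = cv_model.bestModel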