Python pyspark.ml.classification.LogisticRegression() Examples

The following code examples show how to use pyspark.ml.classification.LogisticRegression(). They are taken from open source Python projects.

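As a quick orientation before the project excerpts, here is a minimal, self-contained sketch of the basic fit/predict cycle; the toy data and column values are illustrative only and do not come from any of the projects below.

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# LogisticRegression expects a vector "features" column and a numeric
# "label" column by default.
train_df = spark.createDataFrame([
    (Vectors.dense([0.0, 1.1]), 0.0),
    (Vectors.dense([2.0, 1.0]), 1.0),
    (Vectors.dense([2.0, 1.3]), 1.0),
    (Vectors.dense([0.0, 1.2]), 0.0),
], ["features", "label"])

lr = LogisticRegression(maxIter=10, regParam=0.01)
model = lr.fit(train_df)
model.transform(train_df).select("label", "prediction").show()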
Example 1
Project: spark-deep-learning   Author: databricks   File: named_image_test.py    Apache License 2.0
# Module-level imports assumed by this excerpt (names as used below):
#   from pyspark.ml import Pipeline
#   from pyspark.ml.classification import LogisticRegression
#   from pyspark.sql.functions import udf
#   from pyspark.sql.types import IntegerType
#   DeepImageFeaturizer comes from the sparkdl package.
def test_featurizer_in_pipeline(self):
    """
    Tests that the featurizer fits into an MLlib Pipeline.
    Does not test how good the featurization is for generalization.
    """
    featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features",
                                     modelName=self.name)
    lr = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3, labelCol="label")
    pipeline = Pipeline(stages=[featurizer, lr])

    # Add arbitrary labels to run logistic regression.
    # TODO: it's weird that the test fails on some combinations of labels. check why.
    label_udf = udf(lambda x: abs(hash(x)) % 2, IntegerType())
    train_df = self.imageDF.withColumn("label", label_udf(self.imageDF["image"]["origin"]))

    lrModel = pipeline.fit(train_df)
    # See if we at least get the training examples right.
    # With 5 examples and e.g. 131k features (for InceptionV3), it ought to.
    pred_df_collected = lrModel.transform(train_df).collect()
    for row in pred_df_collected:
        self.assertEqual(int(row.prediction), row.label)
Example 2
Project: cdsw-simple-serving-python   Author: chezou   File: data_science.py    Apache License 2.0
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit


def build_model(training):
    # training = read_data()
    training.cache()

    # Every column except the "Occupancy" label is a feature.
    columns = training.columns
    columns.remove("Occupancy")

    assembler = VectorAssembler(inputCols=columns, outputCol="featureVec")
    lr = LogisticRegression(featuresCol="featureVec", labelCol="Occupancy")

    pipeline = Pipeline(stages=[assembler, lr])

    param_grid = ParamGridBuilder() \
        .addGrid(lr.regParam, [0.0001, 0.001, 0.01, 0.1, 1.0]) \
        .build()

    evaluator = BinaryClassificationEvaluator(labelCol="Occupancy")

    validator = TrainValidationSplit(estimator=pipeline,
                                     estimatorParamMaps=param_grid,
                                     evaluator=evaluator,
                                     trainRatio=0.9)

    validator_model = validator.fit(training)
    return validator_model.bestModel
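The returned bestModel is a fitted PipelineModel, so it can be applied directly to new data. A hypothetical call site (score_df is not part of the original project):

model = build_model(training)
scored = model.transform(score_df)  # score_df: a hypothetical DataFrame with the same feature columns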
Example 3
Project: atap   Author: foxbook   File: sc_classification.py    Apache License 2.0
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer


def main(sc, spark):
    # Load and vectorize the corpus (load_corpus and make_vectorizer are
    # project-specific helpers defined elsewhere in sc_classification.py)
    corpus = load_corpus(sc, spark)
    vector = make_vectorizer().fit(corpus)

    # Index the labels of the classification
    labelIndex = StringIndexer(inputCol="label", outputCol="indexedLabel")
    labelIndex = labelIndex.fit(corpus)

    # Split the data into training and test sets
    training, test = corpus.randomSplit([0.8, 0.2])

    # Create the classifier
    clf = LogisticRegression(
        maxIter=10, regParam=0.3, elasticNetParam=0.8,
        family="multinomial", labelCol="indexedLabel", featuresCol="tfidf")

    # Create the model
    model = Pipeline(stages=[
        vector, labelIndex, clf
    ]).fit(training)

    # Make predictions
    predictions = model.transform(test)
    predictions.select("prediction", "indexedLabel", "tfidf").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))

    # The fitted LogisticRegressionModel is the last pipeline stage.
    lrModel = model.stages[2]
    print(lrModel)  # summary only
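Note that family="multinomial" fits one coefficient vector per class rather than a single binary model, and the classifier trains on the numeric StringIndexer output, which is why the evaluator compares prediction against indexedLabel instead of the raw string label.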
Example 4
Project: spark-mlpipeline-for-ctr   Author: chenxinye   File: spark_mlpipeline.py    MIT License
def lr_cv(self):
    """Train a logistic regression model, tuning regParam on a
    train/validation split.
    """
    # `cl` and `tune` are the enclosing module's aliases for
    # pyspark.ml.classification and pyspark.ml.tuning.
    if self.mode == 'fast':
        _iter = 20
        _regParam = [0.11, 0.21]
    elif self.mode == 'full':
        _iter = round(len(self.fe_col) * 0.3)
        _regParam = [0.01, 0.05, 0.1, 0.15, 0.3, 0.4]

    logistic = cl.LogisticRegression(
        labelCol='label',
        maxIter=_iter,
        featuresCol='features'
    )

    grid = tune.ParamGridBuilder().addGrid(logistic.regParam, _regParam).build()

    tvs = tune.TrainValidationSplit(
        estimator=logistic,
        estimatorParamMaps=grid,
        evaluator=self.evaluator
    )

    self.lrModel = tvs.fit(self.train_data)
    self.lr_cv_results = self.lrModel.transform(self.test_data)
    self.lrscore = self.evaluator.evaluate(
        self.lr_cv_results, {self.evaluator.metricName: 'areaUnderROC'})

    if self.verbose:
        print("AUC score is:", self.lrscore)
        # print("Area Under PR is:", self.evaluator.evaluate(
        #     self.lr_cv_results, {self.evaluator.metricName: 'areaUnderPR'}))
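Despite the method name lr_cv, TrainValidationSplit evaluates each candidate regParam once on a single random train/validation split (75/25 by default) rather than performing k-fold cross-validation; CrossValidator is the pyspark.ml.tuning class for the latter.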
Example 5
Project: python_mozetl   Author: mozilla   File: taar_ensemble.py    MIT License
def dump_training_info(blorModel):
    """
    This function is useful for debugging when we do not converge to a
    solution during LogisticRegression.
    """
    trainingSummary = blorModel.summary

    print("Total iterations: %d" % trainingSummary.totalIterations)
    print("Intercepts: " + str(blorModel.intercept))
    print("Coefficients: " + str(blorModel.coefficients))
    # Obtain the objective per iteration
    objectiveHistory = trainingSummary.objectiveHistory
    print("objectiveHistory:")
    for objective in objectiveHistory:
        print(objective) 
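A hypothetical call site, using the model produced by compute_regression in Example 6 below (the arguments shown are illustrative):

blorModel = compute_regression(spark, rdd_list, 0.1, 0.0)  # hypothetical arguments
dump_training_info(blorModel)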
Example 6
Project: python_mozetl   Author: mozilla   File: taar_ensemble.py    MIT License
from pyspark.ml.classification import LogisticRegression


def compute_regression(spark, rdd_list, regParam, elasticNetParam):
    df0 = spark.sparkContext.union(rdd_list).toDF()
    blor = LogisticRegression(
        maxIter=50,
        regParam=regParam,
        weightCol="weight",
        elasticNetParam=elasticNetParam,
    )

    blorModel = blor.fit(df0)
    return blorModel
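Because weightCol="weight" is set, the DataFrame built from rdd_list must carry a numeric weight column alongside the default label and features columns; rows with larger weights contribute proportionally more to the training loss.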