Python pyspark.ml.feature.HashingTF() Examples

The following are 5 code examples of pyspark.ml.feature.HashingTF(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pyspark.ml.feature , or try the search function .
Example #1
Source File: taar_similarity.py    From telemetry-airflow with Mozilla Public License 2.0 6 votes vote down vote up
def compute_clusters(addons_df, num_clusters, random_seed):
    """ Performs user clustering by using add-on ids as features.
    """

    # Build the stages of the pipeline. We need hashing to make the next
    # steps work.
    hashing_stage = HashingTF(inputCol="addon_ids", outputCol="hashed_features")
    idf_stage = IDF(
        inputCol="hashed_features", outputCol="features", minDocFreq=1
    )
    # As a future improvement, we may add a sane value for the minimum cluster size
    # to BisectingKMeans (e.g. minDivisibleClusterSize). For now, just make sure
    # to pass along the random seed if needed for tests.
    kmeans_kwargs = {"seed": random_seed} if random_seed else {}
    bkmeans_stage = BisectingKMeans(k=num_clusters, **kmeans_kwargs)
    pipeline = Pipeline(stages=[hashing_stage, idf_stage, bkmeans_stage])

    # Run the pipeline and compute the results.
    model = pipeline.fit(addons_df)
    return model.transform(addons_df).select(["client_id", "prediction"]) 
Example #2
Source File: taar_similarity.py    From python_mozetl with MIT License 6 votes vote down vote up
def compute_clusters(addons_df, num_clusters, random_seed):
    """ Performs user clustering by using add-on ids as features.
    """

    # Build the stages of the pipeline. We need hashing to make the next
    # steps work.
    hashing_stage = HashingTF(inputCol="addon_ids", outputCol="hashed_features")
    idf_stage = IDF(inputCol="hashed_features", outputCol="features", minDocFreq=1)
    # As a future improvement, we may add a sane value for the minimum cluster size
    # to BisectingKMeans (e.g. minDivisibleClusterSize). For now, just make sure
    # to pass along the random seed if needed for tests.
    kmeans_kwargs = {"seed": random_seed} if random_seed else {}
    bkmeans_stage = BisectingKMeans(k=num_clusters, **kmeans_kwargs)
    pipeline = Pipeline(stages=[hashing_stage, idf_stage, bkmeans_stage])

    # Run the pipeline and compute the results.
    model = pipeline.fit(addons_df)
    return model.transform(addons_df).select(["client_id", "prediction"]) 
Example #3
Source File: ops_names.py    From onnxmltools with MIT License 6 votes vote down vote up
def build_sparkml_operator_name_map():
    res = {k: "pyspark.ml.feature." + k.__name__ for k in [
        Binarizer, BucketedRandomProjectionLSHModel, Bucketizer,
        ChiSqSelectorModel, CountVectorizerModel, DCT, ElementwiseProduct, HashingTF, IDFModel, ImputerModel,
        IndexToString, MaxAbsScalerModel, MinHashLSHModel, MinMaxScalerModel, NGram, Normalizer, OneHotEncoderModel,
        PCAModel, PolynomialExpansion, QuantileDiscretizer, RegexTokenizer,
        StandardScalerModel, StopWordsRemover, StringIndexerModel, Tokenizer, VectorAssembler, VectorIndexerModel,
        VectorSlicer, Word2VecModel
    ]}
    res.update({k: "pyspark.ml.classification." + k.__name__ for k in [
        LinearSVCModel, LogisticRegressionModel, DecisionTreeClassificationModel, GBTClassificationModel,
        RandomForestClassificationModel, NaiveBayesModel, MultilayerPerceptronClassificationModel, OneVsRestModel
    ]})
    res.update({k: "pyspark.ml.regression." + k.__name__ for k in [
        AFTSurvivalRegressionModel, DecisionTreeRegressionModel, GBTRegressionModel, GBTRegressionModel,
        GeneralizedLinearRegressionModel, IsotonicRegressionModel, LinearRegressionModel, RandomForestRegressionModel
    ]})
    return res 
Example #4
Source File: spark_ml_pipline.py    From Hanhan-Spark-Python with MIT License 5 votes vote down vote up
def main():
    # Read training data as a DataFrame
    sqlCt = SQLContext(sc)
    trainDF = sqlCt.read.parquet(training_input)
    testDF = sqlCt.read.parquet(testing_input)

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    evaluator = BinaryClassificationEvaluator()

    # no parameter tuning
    hashingTF_notuning = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000)
    lr_notuning = LogisticRegression(maxIter=20, regParam=0.1)
    pipeline_notuning = Pipeline(stages=[tokenizer, hashingTF_notuning, lr_notuning])
    model_notuning = pipeline_notuning.fit(trainDF)

    prediction_notuning = model_notuning.transform(testDF)
    notuning_output = evaluator.evaluate(prediction_notuning)

    # for cross validation
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=20)

    paramGrid = ParamGridBuilder()\
        .addGrid(hashingTF.numFeatures, [1000, 5000, 10000])\
        .addGrid(lr.regParam, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])\
        .build()

    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=2)
    cvModel = cv.fit(trainDF)

    # Make predictions on test documents. cvModel uses the best model found.
    best_prediction = cvModel.transform(testDF)
    best_output = evaluator.evaluate(best_prediction)

    s = str(notuning_output) + '\n' + str(best_output)
    output_data = sc.parallelize([s])
    output_data.saveAsTextFile(output) 
Example #5
Source File: test_search_2.py    From spark-sklearn with Apache License 2.0 5 votes vote down vote up
def test_cv_lasso_with_mllib_featurization(self):
        data = [('hi there', 0.0),
                ('what is up', 1.0),
                ('huh', 1.0),
                ('now is the time', 5.0),
                ('for what', 0.0),
                ('the spark was there', 5.0),
                ('and so', 3.0),
                ('were many socks', 0.0),
                ('really', 1.0),
                ('too cool', 2.0)]
        data = self.sql.createDataFrame(data, ["review", "rating"])

        # Feature extraction using MLlib
        tokenizer = Tokenizer(inputCol="review", outputCol="words")
        hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20000)
        pipeline = Pipeline(stages=[tokenizer, hashingTF])
        data = pipeline.fit(data).transform(data)

        df = self.converter.toPandas(data.select(data.features.alias("review"), "rating"))

        pipeline = SKL_Pipeline([
            ('lasso', SKL_Lasso())
        ])
        parameters = {
            'lasso__alpha': (0.001, 0.005, 0.01)
        }

        grid_search = GridSearchCV(self.sc, pipeline, parameters)
        skl_gs = grid_search.fit(df.review.values, df.rating.values)
        assert len(skl_gs.cv_results_['params']) == len(parameters['lasso__alpha'])