Python pyspark.ml.feature.StringIndexer() Examples

The following are 10 code examples of pyspark.ml.feature.StringIndexer(). You can go to the original project or source file by following the link above each example, or check out all available functions/classes of the module pyspark.ml.feature.
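Before the examples, here is a minimal, self-contained sketch of the StringIndexer fit/transform cycle that the snippets below all build on. The session setup, column names, and toy data are illustrative assumptions, not taken from any of the listed projects:

from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer

spark = SparkSession.builder.master("local[1]").appName("StringIndexerSketch").getOrCreate()

# Toy DataFrame with a string column to index (illustrative data).
df = spark.createDataFrame(
    [(0, "a"), (1, "b"), (2, "c"), (3, "a")],
    ["id", "category"])

# StringIndexer is an Estimator: fit() learns a label-to-index mapping
# (by default the most frequent label gets index 0.0), and the fitted
# StringIndexerModel's transform() appends the numeric output column.
indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
model = indexer.fit(df)
model.transform(df).show()
print(model.labels)  # learned labels, ordered by descending frequency

Several of the examples below then hand the fitted model (or a pipeline containing it) to convert_sparkml from onnxmltools to check that the ONNX conversion reproduces Spark's outputs.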
Example #1
Source File: test_index_to_string.py    From onnxmltools with MIT License
def test_index_to_string(self):
        original_data = self.spark.createDataFrame(
            [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
            ["id", "category"])
        string_indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
        string_indexer_model = string_indexer.fit(original_data)
        data = string_indexer_model.transform(original_data)

        model = IndexToString(inputCol="categoryIndex", outputCol="originalCategory",
                              labels=['A', 'B', 'C'])
        # the input name should match IndexToString's inputCol
        model_onnx = convert_sparkml(model, 'Sparkml IndexToString', [('categoryIndex', Int64TensorType([1, 1]))])
        self.assertTrue(model_onnx is not None)
        # run the model
        predicted = model.transform(data)
        expected = predicted.select("originalCategory").toPandas().values
        data_np = data.select('categoryIndex').toPandas().values.astype(numpy.int64)
        paths = save_data_models(data_np, expected, model, model_onnx,
                                    basename="SparkmlIndexToString")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['originalCategory'], data_np, onnx_model_path)
        compare_results(expected, output, decimal=5) 
Example #2
Source File: test_pipeline.py    From onnxmltools with MIT License
def test_model_pipeline_3_stage(self):
        this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
        input_path = os.path.join(this_script_dir, "data", "AdultCensusIncomeOriginal.csv")
        full_data = self.spark.read.format('csv')\
            .options(header='true', inferschema='true').load(input_path)
        cols = ['workclass', 'education', 'marital_status']
        training_data, test_data = full_data.select(*cols).limit(1000).randomSplit([0.9, 0.1], seed=1)

        stages = []
        for col in cols:
            stages.append(StringIndexer(inputCol=col, outputCol=col+'_index', handleInvalid='skip'))
            # dropLast=False is needed: otherwise, once the columns are
            # assembled together (below), we cannot easily expand the
            # features back into one-hot vectors
            stages.append(OneHotEncoderEstimator(inputCols=[col+'_index'], outputCols=[col+'_vec'], dropLast=False))

        stages.append(VectorAssembler(inputCols=[c+'_vec' for c in cols], outputCol='features'))
        pipeline = Pipeline(stages=stages)

        model = pipeline.fit(training_data)
        model_onnx = convert_sparkml(model, 'Sparkml Pipeline', [
            ('workclass', StringTensorType([1, 1])),
            ('education', StringTensorType([1, 1])),
            ('marital_status', StringTensorType([1, 1]))
        ])
        self.assertTrue(model_onnx is not None)
        self.assertTrue(model_onnx.graph.node is not None)
        # run the model
        predicted = model.transform(test_data)
        data_np = {
            'workclass': test_data.select('workclass').toPandas().values,
            'education': test_data.select('education').toPandas().values,
            'marital_status': test_data.select('marital_status').toPandas().values
        }
        expected = predicted.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values
        paths = save_data_models(data_np, expected, model, model_onnx,
                                basename="SparkmlPipeline_3Stage")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['features'], data_np, onnx_model_path)
        compare_results(expected, output, decimal=5) 
Example #3
Source File: sc_classification.py    From atap with Apache License 2.0
def main(sc, spark):
    # Load and vectorize the corpus
    corpus = load_corpus(sc, spark)
    vector = make_vectorizer().fit(corpus)

    # Index the labels of the classification
    labelIndex = StringIndexer(inputCol="label", outputCol="indexedLabel")
    labelIndex = labelIndex.fit(corpus)

    # Split the data into training and test sets
    training, test = corpus.randomSplit([0.8, 0.2])

    # Create the classifier
    clf = LogisticRegression(
        maxIter=10, regParam=0.3, elasticNetParam=0.8,
        family="multinomial", labelCol="indexedLabel", featuresCol="tfidf")

    # Create the model
    model = Pipeline(stages=[
        vector, labelIndex, clf
    ]).fit(training)

    # Make predictions
    predictions = model.transform(test)
    predictions.select("prediction", "indexedLabel", "tfidf").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))

    # stage 2 of the fitted pipeline is the LogisticRegression model
    lrModel = model.stages[2]
    print(lrModel)  # summary only
Example #4
Source File: random_forest.py    From gnomad_methods with MIT License
def get_labels(rf_pipeline: pyspark.ml.PipelineModel) -> List[str]:
    """
    Returns the labels from the StringIndexer stage at index 0 of an RF pipeline model.

    :param rf_pipeline: Input pipeline
    :return: labels
    """
    return rf_pipeline.stages[0].labels 
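A hypothetical usage sketch for this helper (the pipeline, variable names, and label values below are illustrative assumptions, not part of the original project):

# assumed: rf_model is a fitted PipelineModel whose first stage is a
# StringIndexerModel, e.g.
#   rf_model = Pipeline(stages=[StringIndexer(inputCol="label",
#                                             outputCol="indexedLabel"),
#                               rf]).fit(training_df)
labels = get_labels(rf_model)
print(labels)  # prediction index i corresponds to labels[i]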
Example #5
Source File: test_index_to_string.py    From onnxmltools with MIT License
def test_index_to_string_throws(self):
        original_data = self.spark.createDataFrame(
            [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
            ["id", "category"])
        string_indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
        string_indexer_model = string_indexer.fit(original_data)
        data = string_indexer_model.transform(original_data)

        model = IndexToString(inputCol="categoryIndex", outputCol="originalCategory")
        # the input name should match IndexToString's inputCol
        model_onnx = None
        with pytest.raises(SparkMlConversionError):
            model_onnx = convert_sparkml(model, 'Sparkml IndexToString', [('categoryIndex', Int64TensorType([1, 1]))]) 
Example #6
Source File: test_random_forest_regressor.py    From onnxmltools with MIT License
def test_random_forrest_regression(self):
        this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
        input_path = os.path.join(this_script_dir, "data", "sample_libsvm_data.txt")
        original_data = self.spark.read.format("libsvm").load(input_path)
        #
        # truncate the features
        #
        feature_count = 5
        self.spark.udf.register("truncateFeatures",
                                lambda x: SparseVector(feature_count, range(0, feature_count), x.toArray()[125:130]),
                                VectorUDT())
        data = original_data.selectExpr("cast(label as string) as label", "truncateFeatures(features) as features")
        label_indexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
        feature_indexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                                        maxCategories=10, handleInvalid='error')

        rf = RandomForestRegressor(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees=10)
        pipeline = Pipeline(stages=[label_indexer, feature_indexer, rf])
        model = pipeline.fit(data)
        model_onnx = convert_sparkml(model, 'Sparkml RandomForest Regressor', [
            ('label', StringTensorType([1, 1])),
            ('features', FloatTensorType([1, feature_count]))
        ], spark_session=self.spark)
        self.assertTrue(model_onnx is not None)
        # run the model
        predicted = model.transform(data.limit(1))
        data_np = {
            'label': data.limit(1).toPandas().label.values,
            'features': data.limit(1).toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        }
        expected = [
            predicted.toPandas().indexedLabel.values.astype(numpy.int64),
            predicted.toPandas().prediction.values.astype(numpy.float32)
        ]
        paths = save_data_models(data_np, expected, model, model_onnx,
                                    basename="SparkmlRandomForestRegressor")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['indexedLabel', 'prediction'], data_np, onnx_model_path)
        compare_results(expected, output, decimal=5) 
Example #7
Source File: test_gbt_classifier.py    From onnxmltools with MIT License
def test_gbt_classifier(self):
        raw_data = self.spark.createDataFrame([
            (1.0, Vectors.dense(1.0)),
            (0.0, Vectors.sparse(1, [], []))
        ], ["label", "features"])
        string_indexer = StringIndexer(inputCol="label", outputCol="indexed")
        si_model = string_indexer.fit(raw_data)
        data = si_model.transform(raw_data)
        gbt = GBTClassifier(maxIter=5, maxDepth=2, labelCol="indexed", seed=42)
        model = gbt.fit(data)
        feature_count = data.first()[1].size
        model_onnx = convert_sparkml(model, 'Sparkml GBT Classifier', [
            ('features', FloatTensorType([1, feature_count]))
        ], spark_session=self.spark)
        self.assertTrue(model_onnx is not None)
        # run the model
        predicted = model.transform(data)
        data_np = data.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        expected = [
            predicted.toPandas().prediction.values.astype(numpy.float32),
            predicted.toPandas().probability.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        ]
        paths = save_data_models(data_np, expected, model, model_onnx,
                                    basename="SparkmlGBTClassifier")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['prediction', 'probability'], data_np, onnx_model_path)
        compare_results(expected, output, decimal=5) 
Example #8
Source File: test_random_forest_classifier.py    From onnxmltools with MIT License
def test_random_forrest_classification(self):
        this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
        input_path = os.path.join(this_script_dir, "data", "sample_libsvm_data.txt")
        original_data = self.spark.read.format("libsvm").load(input_path)
        #
        # truncate the features
        #
        feature_count = 5
        self.spark.udf.register("truncateFeatures",
                                lambda x: SparseVector(feature_count, range(0, feature_count), x.toArray()[125:130]),
                                VectorUDT())
        data = original_data.selectExpr("cast(label as string) as label", "truncateFeatures(features) as features")
        label_indexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
        feature_indexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                                        maxCategories=10, handleInvalid='keep')

        rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees=10)
        pipeline = Pipeline(stages=[label_indexer, feature_indexer, rf])
        model = pipeline.fit(data)
        model_onnx = convert_sparkml(model, 'Sparkml RandomForest Classifier', [
            ('label', StringTensorType([1, 1])),
            ('features', FloatTensorType([1, feature_count]))
        ], spark_session=self.spark)
        self.assertTrue(model_onnx is not None)
        # run the model
        predicted = model.transform(data)
        data_np = {
            'label': data.toPandas().label.values,
            'features': data.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        }
        expected = [
            predicted.toPandas().indexedLabel.values.astype(numpy.int64),
            predicted.toPandas().prediction.values.astype(numpy.float32),
            predicted.toPandas().probability.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        ]
        paths = save_data_models(data_np, expected, model, model_onnx,
                                    basename="SparkmlRandomForestClassifier")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['indexedLabel', 'prediction', 'probability'], data_np, onnx_model_path)
        compare_results(expected, output, decimal=5) 
Example #9
Source File: test_pipeline.py    From onnxmltools with MIT License
def test_model_pipeline_4_stage(self):
        this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
        input_path = os.path.join(this_script_dir, "data", "AdultCensusIncomeOriginal.csv")
        full_data = self.spark.read.format('csv')\
            .options(header='true', inferschema='true').load(input_path)
        cols = ['workclass', 'education', 'marital_status']
        training_data, test_data = full_data.select('income', *cols).limit(1000).randomSplit([0.9, 0.1], seed=1)

        stages = []
        for col in cols:
            stages.append(StringIndexer(inputCol=col, outputCol=col+'_index', handleInvalid='skip'))
            stages.append(OneHotEncoderEstimator(inputCols=[col+'_index'], outputCols=[col+'_vec'], dropLast=False))

        stages.append(VectorAssembler(inputCols=[c+'_vec' for c in cols], outputCol='features'))
        stages.append(StringIndexer(inputCol='income', outputCol='label', handleInvalid='skip'))
        stages.append(LogisticRegression(maxIter=100, tol=0.0001))
        pipeline = Pipeline(stages=stages)

        model = pipeline.fit(training_data)
        model_onnx = convert_sparkml(model, 'Sparkml Pipeline', [
            ('income', StringTensorType([1, 1])),
            ('workclass', StringTensorType([1, 1])),
            ('education', StringTensorType([1, 1])),
            ('marital_status', StringTensorType([1, 1]))
        ])
        self.assertTrue(model_onnx is not None)
        self.assertTrue(model_onnx.graph.node is not None)
        # run the model
        predicted = model.transform(test_data)
        data_np = {
            'income': test_data.select('income').toPandas().values,
            'workclass': test_data.select('workclass').toPandas().values,
            'education': test_data.select('education').toPandas().values,
            'marital_status': test_data.select('marital_status').toPandas().values
        }
        expected = [
            predicted.toPandas().label.values.astype(numpy.float32),
            predicted.toPandas().prediction.values.astype(numpy.float32),
            predicted.toPandas().probability.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        ]
        paths = save_data_models(data_np, expected, model, model_onnx,
                                basename="SparkmlPipeline_4Stage")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['label', 'prediction', 'probability'], data_np, onnx_model_path)
        compare_results(expected, output, decimal=5) 
Example #10
Source File: test_pipeline.py    From onnxmltools with MIT License
def test_model_pipeline_2_stage(self):
        this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
        input_path = os.path.join(this_script_dir, "data", "AdultCensusIncomeOriginal.csv")
        full_data = self.spark.read.format('csv')\
            .options(header='true', inferschema='true').load(input_path)
        cols = ['workclass', 'education', 'marital_status']
        training_data, test_data = full_data.select(*cols).limit(1000).randomSplit([0.9, 0.1], seed=1)

        stages = []
        for col in cols:
            stages.append(StringIndexer(inputCol=col, outputCol=col+'_index', handleInvalid='skip'))
            stages.append(OneHotEncoderEstimator(inputCols=[col+'_index'], outputCols=[col+'_vec']))

        pipeline = Pipeline(stages=stages)

        model = pipeline.fit(training_data)
        model_onnx = convert_sparkml(model, 'Sparkml Pipeline', [
            ('workclass', StringTensorType([1, 1])),
            ('education', StringTensorType([1, 1])),
            ('marital_status', StringTensorType([1, 1]))
        ])
        self.assertTrue(model_onnx is not None)
        self.assertTrue(model_onnx.graph.node is not None)
        # run the model
        predicted = model.transform(test_data)
        data_np = {
            'workclass': test_data.select('workclass').toPandas().values,
            'education': test_data.select('education').toPandas().values,
            'marital_status': test_data.select('marital_status').toPandas().values
        }
        predicted_np = [
            predicted.toPandas().workclass_vec.apply(lambda x: pandas.Series(x.toArray())).values,
            predicted.toPandas().education_vec.apply(lambda x: pandas.Series(x.toArray())).values,
            predicted.toPandas().marital_status_vec.apply(lambda x: pandas.Series(x.toArray())).values
            ]
        expected = [numpy.asarray([expand_one_hot_vec(x) for x in row]) for row in predicted_np]
        paths = save_data_models(data_np, expected, model, model_onnx,
                                 basename="SparkmlPipeline_2Stage")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['workclass_vec', 'education_vec', 'marital_status_vec'],
                                               data_np, onnx_model_path)
        compare_results(expected, output, decimal=5)