Python pyspark.ml.feature.VectorAssembler() Examples

The following are 11 code examples of pyspark.ml.feature.VectorAssembler(). They are drawn from open-source projects; you can go to the original project or source file by following the link above each example. You may also want to check out all available functions/classes of the module pyspark.ml.feature, or try the search function.
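Before the project examples, here is a minimal, self-contained sketch of what VectorAssembler does: it merges a list of numeric (or vector) columns into a single vector column. The column names and data below are illustrative only, not taken from any of the projects that follow.

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1.0, 0.0, 3.0)], ["a", "b", "c"])

# Combine the three numeric columns into one vector column named 'features'.
assembler = VectorAssembler(inputCols=["a", "b", "c"], outputCol="features")
assembled = assembler.transform(df)
# The 'features' column now holds the dense vector [1.0, 0.0, 3.0].
assembled.select("features").show(truncate=False)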
Example #1
Source File: feature_selection.py    From search-MjoLniR with MIT License
def select_features(
    wiki: str,
    num_features: int,
    metadata: Dict
) -> mt.Transformer:
    def transform(df: DataFrame) -> DataFrame:
        # Compute the "best" features, per some metric
        sc = df.sql_ctx.sparkSession.sparkContext
        features = metadata['input_feature_meta']['features']
        selected = mjolnir.feature_engineering.select_features(
            sc, df, features, num_features, algo='mrmr')
        metadata['wiki_features'][wiki] = selected

        # Rebuild the `features` col with only the selected features
        keep_cols = metadata['default_cols'] + selected
        df_selected = df.select(*keep_cols)
        assembler = VectorAssembler(
            inputCols=selected, outputCol='features')
        return assembler.transform(df_selected).drop(*selected)
    return transform 
Example #2
Source File: test_spark_model_export.py    From mlflow with Apache License 2.0
def spark_model_estimator(iris_df, spark_context):
    feature_names, iris_pandas_df, iris_spark_df = iris_df
    assembler = VectorAssembler(inputCols=feature_names, outputCol="features")
    features_df = assembler.transform(iris_spark_df)
    lr = LogisticRegression(maxIter=50, regParam=0.1, elasticNetParam=0.8)
    # Fit the model
    model = lr.fit(features_df)
    preds_df = model.transform(features_df)
    preds = [x.prediction for x in preds_df.select("prediction").collect()]
    return SparkModelWithData(model=model,
                              spark_df=features_df,
                              pandas_df=iris_pandas_df,
                              predictions=preds) 
Example #3
Source File: random_forest.py    From gnomad_methods with MIT License
def get_features_importance(
    rf_pipeline: pyspark.ml.PipelineModel, rf_index: int = -2, assembler_index: int = -3
) -> Dict[str, float]:
    """
    Extract the features importance from a Pipeline model containing a RandomForestClassifier stage.

    :param rf_pipeline: Input pipeline
    :param rf_index: index of the RandomForestClassifier stage
    :param assembler_index: index of the VectorAssembler stage
    :return: feature importance for each feature in the RF model
    """

    feature_names = [
        x[: -len("_indexed")] if x.endswith("_indexed") else x
        for x in rf_pipeline.stages[assembler_index].getInputCols()
    ]

    return dict(zip(feature_names, rf_pipeline.stages[rf_index].featureImportances)) 
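As a usage note, the default rf_index=-2 and assembler_index=-3 assume one more stage follows the classifier. Below is a hypothetical pipeline matching that layout; train_df, the column names, and the label values are assumptions for illustration only.

from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, VectorAssembler

# stages[-3] is the assembler and stages[-2] the random forest, so the
# default indices in get_features_importance line up.
assembler = VectorAssembler(inputCols=["f1", "f2"], outputCol="features")
rf = RandomForestClassifier(featuresCol="features", labelCol="label")
labeler = IndexToString(inputCol="prediction", outputCol="predicted_label",
                        labels=["no", "yes"])
model = Pipeline(stages=[assembler, rf, labeler]).fit(train_df)
importances = get_features_importance(model)  # {'f1': ..., 'f2': ...}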
Example #4
Source File: ops_names.py    From onnxmltools with MIT License
def build_sparkml_operator_name_map():
    res = {k: "pyspark.ml.feature." + k.__name__ for k in [
        Binarizer, BucketedRandomProjectionLSHModel, Bucketizer,
        ChiSqSelectorModel, CountVectorizerModel, DCT, ElementwiseProduct, HashingTF, IDFModel, ImputerModel,
        IndexToString, MaxAbsScalerModel, MinHashLSHModel, MinMaxScalerModel, NGram, Normalizer, OneHotEncoderModel,
        PCAModel, PolynomialExpansion, QuantileDiscretizer, RegexTokenizer,
        StandardScalerModel, StopWordsRemover, StringIndexerModel, Tokenizer, VectorAssembler, VectorIndexerModel,
        VectorSlicer, Word2VecModel
    ]}
    res.update({k: "pyspark.ml.classification." + k.__name__ for k in [
        LinearSVCModel, LogisticRegressionModel, DecisionTreeClassificationModel, GBTClassificationModel,
        RandomForestClassificationModel, NaiveBayesModel, MultilayerPerceptronClassificationModel, OneVsRestModel
    ]})
    res.update({k: "pyspark.ml.regression." + k.__name__ for k in [
        AFTSurvivalRegressionModel, DecisionTreeRegressionModel, GBTRegressionModel,
        GeneralizedLinearRegressionModel, IsotonicRegressionModel, LinearRegressionModel, RandomForestRegressionModel
    ]})
    return res 
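The map is keyed by the class objects themselves, so a lookup is a plain dict access. A small sketch, assuming the same imports as the snippet above:

from pyspark.ml.feature import VectorAssembler

name_map = build_sparkml_operator_name_map()
# Class objects are the keys, dotted names the values.
print(name_map[VectorAssembler])  # pyspark.ml.feature.VectorAssembler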
Example #5
Source File: test_pipeline.py    From onnxmltools with MIT License
def test_model_pipeline_3_stage(self):
        this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
        input_path = os.path.join(this_script_dir, "data", "AdultCensusIncomeOriginal.csv")
        full_data = self.spark.read.format('csv')\
            .options(header='true', inferschema='true').load(input_path)
        cols = ['workclass', 'education', 'marital_status']
        training_data, test_data = full_data.select(*cols).limit(1000).randomSplit([0.9, 0.1], seed=1)

        stages = []
        for col in cols:
            stages.append(StringIndexer(inputCol=col, outputCol=col+'_index', handleInvalid='skip'))
            # dropLast=False is needed; otherwise, once the one-hot vectors are
            # assembled together (below), the features cannot be expanded cleanly.
            stages.append(OneHotEncoderEstimator(inputCols=[col+'_index'], outputCols=[col+'_vec'], dropLast=False))

        stages.append(VectorAssembler(inputCols=[c+'_vec' for c in cols], outputCol='features'))
        pipeline = Pipeline(stages=stages)

        model = pipeline.fit(training_data)
        model_onnx = convert_sparkml(model, 'Sparkml Pipeline', [
            ('workclass', StringTensorType([1, 1])),
            ('education', StringTensorType([1, 1])),
            ('marital_status', StringTensorType([1, 1]))
        ])
        self.assertTrue(model_onnx is not None)
        self.assertTrue(model_onnx.graph.node is not None)
        # run the model
        predicted = model.transform(test_data)
        data_np = {
            'workclass': test_data.select('workclass').toPandas().values,
            'education': test_data.select('education').toPandas().values,
            'marital_status': test_data.select('marital_status').toPandas().values
        }
        expected = predicted.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values
        paths = save_data_models(data_np, expected, model, model_onnx,
                                basename="SparkmlPipeline_3Stage")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['features'], data_np, onnx_model_path)
        compare_results(expected, output, decimal=5) 
Example #6
Source File: test_vector_assembler.py    From onnxmltools with MIT License
def test_model_vector_assembler(self):
        col_names = ["a", "b", "c"]
        model = VectorAssembler(inputCols=col_names, outputCol='features')
        data = self.spark.createDataFrame([(1., 0., 3.)], col_names)
        model_onnx = convert_sparkml(model, 'Sparkml VectorAssembler', [
            ('a', FloatTensorType([1, 1])),
            ('b', FloatTensorType([1, 1])),
            ('c', FloatTensorType([1, 1]))
        ])
        self.assertTrue(model_onnx is not None)
        self.assertTrue(model_onnx.graph.node is not None)
        # run the model
        predicted = model.transform(data)
        expected = predicted.select("features").toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values
        data_np = {
            'a': data.select('a').toPandas().values.astype(numpy.float32),
            'b': data.select('b').toPandas().values.astype(numpy.float32),
            'c': data.select('c').toPandas().values.astype(numpy.float32)
        }
        paths = save_data_models(data_np, expected, model, model_onnx,
                                    basename="SparkmlVectorAssembler")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['features'], data_np, onnx_model_path)
        compare_results(expected, output, decimal=5) 
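save_data_models and run_onnx_model are test helpers from the onnxmltools test suite. For orientation, the inference step they wrap would look roughly like the following with onnxruntime directly; this is a sketch, and the helper may differ in details.

import onnxruntime

sess = onnxruntime.InferenceSession(onnx_model_path,
                                    providers=['CPUExecutionProvider'])
# Feed the dict of numpy arrays keyed by input name; request 'features'.
output = sess.run(['features'], data_np)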
Example #7
Source File: feature_engineering.py    From search-MjoLniR with MIT License
def _bucketize(df, input_cols):
    def j_str_arr(arr):
        gateway = SparkContext._gateway
        j_str = gateway.jvm.java.lang.String
        j_arr = gateway.new_array(j_str, len(arr))
        for i, val in enumerate(arr):
            j_arr[i] = val
        return j_arr

    output_cols = ['{}-bucketed'.format(x) for x in input_cols]
    # Sadly the multi-column setters are only available in Scala;
    # pyspark doesn't have them yet, hence the py4j gateway calls.
    j_bucketizer = (
        JavaParams._new_java_obj("org.apache.spark.ml.feature.QuantileDiscretizer")
        .setInputCols(j_str_arr(input_cols))
        .setOutputCols(j_str_arr(output_cols))
        .setNumBuckets(254)
        .setRelativeError(1/2550)
        .setHandleInvalid('error')
        .fit(df._jdf))
    j_df_bucketized = j_bucketizer.transform(df._jdf)
    df_bucketized = DataFrame(j_df_bucketized, df.sql_ctx).drop(*input_cols)
    # Now we need to assemble the bucketized values into vector
    # form for the feature selector to work with.
    assembler = VectorAssembler(
            inputCols=output_cols, outputCol='features')
    return assembler.transform(df_bucketized).drop(*output_cols) 
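The py4j gateway round-trip above was needed because PySpark lacked the multi-column setters at the time. On Spark 3.0+, QuantileDiscretizer accepts inputCols/outputCols directly from Python, so an equivalent version of _bucketize could be written as below (a sketch, not the project's actual code):

from pyspark.ml.feature import QuantileDiscretizer, VectorAssembler

def _bucketize(df, input_cols):
    output_cols = ['{}-bucketed'.format(x) for x in input_cols]
    # Spark 3.0+ exposes the multi-column parameters in PySpark.
    discretizer = QuantileDiscretizer(
        inputCols=input_cols, outputCols=output_cols,
        numBuckets=254, relativeError=1 / 2550, handleInvalid='error')
    df_bucketized = discretizer.fit(df).transform(df).drop(*input_cols)
    # Assemble the bucketized values into vector form, as before.
    assembler = VectorAssembler(inputCols=output_cols, outputCol='features')
    return assembler.transform(df_bucketized).drop(*output_cols)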
Example #8
Source File: ml.py    From koalas with Apache License 2.0
def to_numeric_df(kdf: "ks.DataFrame") -> Tuple[pyspark.sql.DataFrame, List[Tuple[str, ...]]]:
    """
    Takes a dataframe and turns it into a dataframe containing a single numerical
    vector of doubles. The resulting dataframe has a single vector column, named by
    CORRELATION_OUTPUT_COLUMN ('__correlation_output__' in the doctest below).

    TODO: index is not preserved currently
    :param kdf: the Koalas dataframe.
    :return: a pair of (dataframe, list of column labels); the labels name the
             columns that were converted to numerical types

    >>> to_numeric_df(ks.DataFrame({'A': [0, 1], 'B': [1, 0], 'C': ['x', 'y']}))
    (DataFrame[__correlation_output__: vector], [('A',), ('B',)])
    """
    # TODO: this should be more robust.
    accepted_types = {
        np.dtype(dt)
        for dt in [np.int8, np.int16, np.int32, np.int64, np.float32, np.float64, np.bool_]
    }
    numeric_column_labels = [
        label for label in kdf._internal.column_labels if kdf[label].dtype in accepted_types
    ]
    numeric_df = kdf._internal.spark_frame.select(
        *[kdf._internal.spark_column_for(idx) for idx in numeric_column_labels]
    )
    va = VectorAssembler(inputCols=numeric_df.columns, outputCol=CORRELATION_OUTPUT_COLUMN)
    v = va.transform(numeric_df).select(CORRELATION_OUTPUT_COLUMN)
    return v, numeric_column_labels 
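The '__correlation_output__' name hints at the downstream use: the assembled vector column feeds pyspark.ml.stat.Correlation. A short hedged sketch of that next step, assuming a Koalas dataframe kdf is in scope:

from pyspark.ml.stat import Correlation

v, labels = to_numeric_df(kdf)
# One-row result; head()[0] is the pairwise correlation DenseMatrix,
# with rows/columns ordered to match `labels`.
corr_matrix = Correlation.corr(v, CORRELATION_OUTPUT_COLUMN).head()[0]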
Example #9
Source File: test_spark_model_export.py    From mlflow with Apache License 2.0
def spark_model_iris(iris_df):
    feature_names, iris_pandas_df, iris_spark_df = iris_df
    assembler = VectorAssembler(inputCols=feature_names, outputCol="features")
    lr = LogisticRegression(maxIter=50, regParam=0.1, elasticNetParam=0.8)
    pipeline = Pipeline(stages=[assembler, lr])
    # Fit the model
    model = pipeline.fit(iris_spark_df)
    preds_df = model.transform(iris_spark_df)
    preds = [x.prediction for x in preds_df.select("prediction").collect()]
    return SparkModelWithData(model=model,
                              spark_df=iris_spark_df,
                              pandas_df=iris_pandas_df,
                              predictions=preds) 
Example #10
Source File: test_spark_model_export.py    From mlflow with Apache License 2.0
def spark_model_transformer(iris_df):
    feature_names, iris_pandas_df, iris_spark_df = iris_df
    assembler = VectorAssembler(inputCols=feature_names, outputCol="features")
    # Fit the model
    preds_df = assembler.transform(iris_spark_df)
    preds = [x.features for x in preds_df.select("features").collect()]
    return SparkModelWithData(model=assembler,
                              spark_df=iris_spark_df,
                              pandas_df=iris_pandas_df,
                              predictions=preds) 
Example #11
Source File: test_pipeline.py    From onnxmltools with MIT License
def test_model_pipeline_4_stage(self):
        this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
        input_path = os.path.join(this_script_dir, "data", "AdultCensusIncomeOriginal.csv")
        full_data = self.spark.read.format('csv')\
            .options(header='true', inferschema='true').load(input_path)
        cols = ['workclass', 'education', 'marital_status']
        training_data, test_data = full_data.select('income', *cols).limit(1000).randomSplit([0.9, 0.1], seed=1)

        stages = []
        for col in cols:
            stages.append(StringIndexer(inputCol=col, outputCol=col+'_index', handleInvalid='skip'))
            stages.append(OneHotEncoderEstimator(inputCols=[col+'_index'], outputCols=[col+'_vec'], dropLast=False))

        stages.append(VectorAssembler(inputCols=[c+'_vec' for c in cols], outputCol='features'))
        stages.append(StringIndexer(inputCol='income', outputCol='label', handleInvalid='skip'))
        stages.append(LogisticRegression(maxIter=100, tol=0.0001))
        pipeline = Pipeline(stages=stages)

        model = pipeline.fit(training_data)
        model_onnx = convert_sparkml(model, 'Sparkml Pipeline', [
            ('income', StringTensorType([1, 1])),
            ('workclass', StringTensorType([1, 1])),
            ('education', StringTensorType([1, 1])),
            ('marital_status', StringTensorType([1, 1]))
        ])
        self.assertTrue(model_onnx is not None)
        self.assertTrue(model_onnx.graph.node is not None)
        # run the model
        predicted = model.transform(test_data)
        data_np = {
            'income': test_data.select('income').toPandas().values,
            'workclass': test_data.select('workclass').toPandas().values,
            'education': test_data.select('education').toPandas().values,
            'marital_status': test_data.select('marital_status').toPandas().values
        }
        expected = [
            predicted.toPandas().label.values.astype(numpy.float32),
            predicted.toPandas().prediction.values.astype(numpy.float32),
            predicted.toPandas().probability.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        ]
        paths = save_data_models(data_np, expected, model, model_onnx,
                                basename="SparkmlPipeline_4Stage")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['label', 'prediction', 'probability'], data_np, onnx_model_path)
        compare_results(expected, output, decimal=5)