Python pyspark.sql.functions.udf() Examples

The following are 30 code examples of pyspark.sql.functions.udf(), drawn from open-source projects. Each example is shown with its original project and source file. You may also want to check out the other available functions and classes of the pyspark.sql.functions module.
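Before the project snippets, here is a minimal, self-contained sketch of the pattern they all share: wrap a plain Python function (or lambda) with udf(), give it an explicit return type, and apply it to a column. The column and function names below are made up for illustration.

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("alice",), ("bob",)], ["name"])

# Wrap a plain Python function as a UDF with an explicit return type.
name_length_udf = udf(lambda s: len(s), IntegerType())
df.withColumn("name_length", name_length_udf(df["name"])).show()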
Example #1
Source File: metrics.py    From search-MjoLniR with MIT License
def _ndcg_at(k, label_col):
    def ndcg_at_k(predicted, actual):
        # TODO: Taking in rn and then re-sorting might not be necessary, but I can't
        # find any real guarantee that they would come in order after a groupBy + collect_list,
        # since they were only ordered within the window function.
        predicted = [row[label_col] for row in sorted(predicted, key=lambda r: r.rn)]
        actual = [row[label_col] for row in sorted(actual, key=lambda r: r.rn)]
        dcg = 0.
        for i, label in enumerate(predicted):
            # This form is used to match EvalNDCG in xgboost
            dcg += ((1 << label) - 1) / math.log(i + 2.0, 2)
        idcg = 0.
        for i, label in enumerate(actual):
            idcg += ((1 << label) - 1) / math.log(i + 2.0, 2)
        if idcg == 0:
            return 0
        else:
            return dcg / idcg
    return F.udf(ndcg_at_k, pyspark.sql.types.DoubleType()) 
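A hedged usage sketch for the udf returned above, assuming the snippet's own math, pyspark.sql and F (pyspark.sql.functions) imports are in scope. The udf expects two array-of-struct columns whose structs carry an rn rank field plus the label column; the query grouping and data below are illustrative, not taken from the project.

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
hits = spark.createDataFrame(
    [(1, 1, 3), (1, 2, 1), (1, 3, 2)],   # query_id, rn (rank), integer label
    ["query_id", "rn", "label"])
grouped = hits.groupBy("query_id").agg(
    F.collect_list(F.struct("rn", "label")).alias("predicted"),
    F.collect_list(F.struct("rn", "label")).alias("actual"))
# Identical predicted/actual lists, so the score should come out as 1.0.
grouped.select(_ndcg_at(3, "label")("predicted", "actual").alias("ndcg")).show()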
Example #2
Source File: named_image_test.py    From spark-deep-learning with Apache License 2.0
def test_featurizer_in_pipeline(self):
        """
        Tests that featurizer fits into an MLlib Pipeline.
        Does not test how good the featurization is for generalization.
        """
        featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features",
                                         modelName=self.name)
        lr = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3, labelCol="label")
        pipeline = Pipeline(stages=[featurizer, lr])

        # add arbitrary labels to run logistic regression
        # TODO: it's weird that the test fails on some combinations of labels. check why.
        label_udf = udf(lambda x: abs(hash(x)) % 2, IntegerType())
        train_df = self.imageDF.withColumn("label", label_udf(self.imageDF["image"]["origin"]))

        lrModel = pipeline.fit(train_df)
        # see if we at least get the training examples right.
        # with 5 examples and e.g. 131k features (for InceptionV3), it ought to.
        pred_df_collected = lrModel.transform(train_df).collect()
        for row in pred_df_collected:
            self.assertEqual(int(row.prediction), row.label) 
Example #3
Source File: sample_scaffolds.py    From reinvent-scaffold-decorator with MIT License
def _join_results_single(self, scaffolds_df, sampled_df):
        def _join_scaffold(scaff, decs):
            mol = usc.join_joined_attachments(scaff, decs)
            if mol:
                return usc.to_smiles(mol)
        join_scaffold_udf = psf.udf(_join_scaffold, pst.StringType())

        def _create_decorations_map(decorations_smi, attachment_points):
            decorations = decorations_smi.split(usc.ATTACHMENT_SEPARATOR_TOKEN)
            return {idx: _cleanup_decoration(dec) for dec, idx in zip(decorations, attachment_points)}
        create_decorations_map_udf = psf.udf(_create_decorations_map, pst.MapType(pst.IntegerType(), pst.StringType()))

        return sampled_df.join(scaffolds_df, on="id")\
            .select(
                join_scaffold_udf("randomized_scaffold", "decoration_smi").alias("smiles"),
                create_decorations_map_udf("decoration_smi", "attachment_points").alias("decorations"),
                "scaffold") 
Example #4
Source File: sample_scaffolds.py    From reinvent-scaffold-decorator with MIT License
def _join_results_multi(self, scaffolds_df, sampled_df):
        def _join_scaffold(scaff, dec):
            mol = usc.join(scaff, dec)
            if mol:
                return usc.to_smiles(mol)

        def _format_attachment_point(smi, num):
            smi = usc.add_first_attachment_point_number(smi, num)
            return usc.to_smiles(uc.to_mol(smi))  # canonicalize

        join_scaffold_udf = psf.udf(_join_scaffold, pst.StringType())
        format_attachment_point_udf = psf.udf(_format_attachment_point, pst.StringType())

        return sampled_df.join(scaffolds_df, on="id")\
            .withColumn("decoration", format_attachment_point_udf("decoration_smi", psf.col("attachment_points")[0]))\
            .select(
                join_scaffold_udf("smiles", "decoration").alias("smiles"),
                psf.map_concat(
                    psf.create_map(psf.col("attachment_points")[0],
                                   SampleScaffolds.cleanup_decoration_udf("decoration")),
                    "decorations",
                ).alias("decorations"),
                "scaffold") 
Example #5
Source File: anomalies_detection_spark_streaming.py    From Hanhan-Spark-Python with MIT License
def detect(self, k, t):
        # Encoding categorical features using one-hot.
        df1 = self.cat2Num(self.rawDF, [0, 1]).cache()
        df1.show(n=2, truncate=False)

        # Clustering points using KMeans
        features = df1.select("features").rdd.map(lambda row: row[0]).cache()
        model = StreamingKMeans(k=7, decayFactor=1.0).setRandomCenters(4, 1.0, 0)
        # model = KMeans.train(features, k, maxIterations=40, runs=10, initializationMode="random", seed=20)

        # Adding the prediction column to df1
        modelBC = sc.broadcast(model)
        predictUDF = udf(lambda x: modelBC.value.predict(x), StringType())
        df2 = df1.withColumn("prediction", predictUDF(df1.features)).cache()
        df2.show(n=3, truncate=False)

        # Adding the score column to df2; The higher the score, the more likely it is an anomaly
        df3 = self.addScore(df2).cache()
        df3.show(n=3, truncate=False)

        return df3.where(df3.score > t) 
Example #6
Source File: anomalies_detection.py    From Hanhan-Spark-Python with MIT License
def detect(self, k, t):
        # Encoding categorical features using one-hot.
        df1 = self.cat2Num(self.rawDF, [0, 1]).cache()
        df1.show(n=2, truncate=False)

        # Clustering points using KMeans
        features = df1.select("features").rdd.map(lambda row: row[0]).cache()
        model = KMeans.train(features, k, maxIterations=40, initializationMode="random", seed=20)

        # Adding the prediction column to df1
        modelBC = sparkCt.broadcast(model)
        predictUDF = udf(lambda x: modelBC.value.predict(x), StringType())
        df2 = df1.withColumn("prediction", predictUDF(df1.features)).cache()
        df2.show(n=3, truncate=False)

        # Adding the score column to df2; The higher the score, the more likely it is an anomaly
        df3 = self.addScore(df2).cache()
        df3.show(n=3, truncate=False)

        return df3.where(df3.score > t) 
Example #7
Source File: anomalies_detection.py    From Hanhan-Spark-Python with MIT License
def detect(self, k, t):
        #Encoding categorical features using one-hot.
        df1 = self.cat2Num(self.rawDF, [0, 1]).cache()
        df1.show(n=2, truncate=False)

        #Clustering points using KMeans
        features = df1.select("features").rdd.map(lambda row: row[0]).cache()
        model = KMeans.train(features, k, maxIterations=40, runs=10, initializationMode="random", seed=20)

        #Adding the prediction column to df1
        modelBC = sc.broadcast(model)
        predictUDF = udf(lambda x: modelBC.value.predict(x), StringType())
        df2 = df1.withColumn("prediction", predictUDF(df1.features)).cache()
        df2.show(n=3, truncate=False)

        #Adding the score column to df2; The higher the score, the more likely it is an anomaly
        df3 = self.addScore(df2).cache()
        df3.show(n=3, truncate=False)

        return df3.where(df3.score > t) 
Example #8
Source File: udf.py    From ibis with Apache License 2.0
def __call__(self, func):
            """Define a UDF (user-defined function) that operates element wise
            on a Spark DataFrame.

            Parameters
            ----------
            input_type : List[ibis.expr.datatypes.DataType]
                A list of the types found in :mod:`~ibis.expr.datatypes`. The
                length of this list must match the number of arguments to the
                function. Variadic arguments are not yet supported.
            output_type : ibis.expr.datatypes.DataType
                The return type of the function.

            Examples
            --------
            >>> import ibis
            >>> import ibis.expr.datatypes as dt
            >>> from ibis.spark.udf import udf
            >>> @udf.elementwise(input_type=[dt.string], output_type=dt.int64)
            ... def my_string_length(x):
            ...     return len(x) * 2
            """
            return SparkUDF(self._input_type, self._output_type)(func) 
Example #9
Source File: feature_engineering.py    From search-MjoLniR with MIT License
def explode_features(df, features=None):
    """Convert feature vector into individual columns

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    features : list of str or None

    Returns
    -------
    pyspark.sql.DataFrame
    """
    if features is None:
        features = df.schema['features'].metadata['features']

    def extract_feature(features, idx):
        return float(features[idx])
    extract_feature_udf = F.udf(extract_feature, pyspark.sql.types.FloatType())
    cols = [extract_feature_udf('features', F.lit(idx)).alias(name) for idx, name in enumerate(features)]
    return df.select('*', *cols) 
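A hedged, runnable sketch of a call to explode_features, assuming the snippet's F (pyspark.sql.functions) and pyspark.sql.types imports are in scope. The metadata is attached by hand here only because explode_features reads the feature names from the features column's metadata; the names f0 and f1 are made up.

from pyspark.sql import SparkSession, functions as F
from pyspark.ml.linalg import Vectors

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(Vectors.dense([0.1, 0.2]),)], ["features"])
# Attach the feature-name list as column metadata, as explode_features expects.
df = df.withColumn(
    "features",
    F.col("features").alias("features", metadata={"features": ["f0", "f1"]}))
explode_features(df).show()  # adds float columns f0 and f1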
Example #10
Source File: feature_engineering.py    From search-MjoLniR with MIT License
def zero_features(df, *feature_names):
    """Zero out features in the feature vector.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    feature_names : list of str

    Returns
    -------
    pyspark.sql.DataFrame
    """
    features = df.schema['features'].metadata['features']
    idxs = [features.index(name) for name in feature_names]

    def zero_features(feat):
        raw = feat.toArray()
        for idx in idxs:
            raw[idx] = 0.
        return Vectors.dense(raw)
    zero_features_udf = F.udf(zero_features, VectorUDT())
    return df.withColumn('features', mjolnir.spark.add_meta(
        df._sc, zero_features_udf('features'), {'features': features})) 
Example #11
Source File: tf_image.py    From spark-deep-learning with Apache License 2.0
def _convertOutputToImage(self, df, tfs_output_col, output_shape):
        assert len(output_shape) == 4, str(output_shape) + " does not have 4 dimensions"
        height = int(output_shape[1])
        width = int(output_shape[2])

        def to_image(orig_image, numeric_data):
            # Assume the returned image has float pixels but same #channels as input
            mode = imageIO.imageTypeByName('CV_32FC%d' % orig_image.nChannels)
            data = bytearray(np.array(numeric_data).astype(np.float32).tobytes())
            nChannels = orig_image.nChannels
            return Row(
                origin="",
                mode=mode.ord,
                height=height,
                width=width,
                nChannels=nChannels,
                data=data)

        to_image_udf = udf(to_image, ImageSchema.imageSchema['image'].dataType)
        resDf = df.withColumn(self.getOutputCol(),
                              to_image_udf(df[self.getInputCol()], df[tfs_output_col]))
        return resDf.drop(tfs_output_col) 
Example #12
Source File: feature_engineering.py    From search-MjoLniR with MIT License
def append_features(df, *cols):
    """Append features from columns to the features vector.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    cols : list of str

    Returns
    -------
    pyspark.sql.DataFrame
    """
    def add_features(feat, *other):
        raw = feat.toArray()
        return Vectors.dense(np.append(raw, list(map(float, other))))
    add_features_udf = F.udf(add_features, VectorUDT())
    new_feat_list = df.schema['features'].metadata['features'] + cols
    return df.withColumn('features', mjolnir.spark.add_meta(
        df._sc, add_features_udf('features', *cols), {'features': new_feat_list})) 
Example #13
Source File: image_params.py    From spark-deep-learning with Apache License 2.0
def loadImagesInternal(self, dataframe, inputCol):
        """
        Load image files specified in dataset as image format specified in `sparkdl.image.imageIO`.
        """
        # plan 1: udf(loader() + convert from np.array to imageSchema) -> call TFImageTransformer
        # plan 2: udf(loader()) ... we don't support np.array as a dataframe column type...
        loader = self.getImageLoader()
        # Loading from external resources can fail, so we should allow None to be returned

        def load_image_uri_impl(uri):
            try:
                return imageArrayToStruct(_reverseChannels(loader(uri)))
            except BaseException:  # pylint: disable=bare-except
                return None
        load_udf = udf(load_image_uri_impl, ImageSchema.imageSchema['image'].dataType)
        return dataframe.withColumn(self._loadedImageCol(), load_udf(dataframe[inputCol])) 
Example #14
Source File: named_image.py    From spark-deep-learning with Apache License 2.0
def _decodeOutputAsPredictions(self, df):
        # If we start having different weights than imagenet, we'll need to
        # move this logic to individual model building in NamedImageTransformer.
        # Also, we could put the computation directly in the main computation
        # graph or use a scala UDF for potentially better performance.
        topK = self.getOrDefault(self.topK)

        def decode(predictions):
            pred_arr = np.expand_dims(np.array(predictions), axis=0)
            decoded = decode_predictions(pred_arr, top=topK)[0]
            # convert numpy dtypes to python native types
            return [(t[0], t[1], t[2].item()) for t in decoded]

        decodedSchema = ArrayType(
            StructType([
                StructField("class", StringType(), False),
                StructField("description", StringType(), False),
                StructField("probability", FloatType(), False)
            ]))
        decodeUDF = udf(decode, decodedSchema)
        interim_output = self._getIntermediateOutputCol()
        return df \
            .withColumn(self.getOutputCol(), decodeUDF(df[interim_output])) \
            .drop(interim_output) 
Example #15
Source File: data_fetcher.py    From ReAgent with BSD 3-Clause "New" or "Revised" License
def make_get_step_udf(multi_steps: Optional[int]):
    """ Get step count by taking length of next_states_features array. """

    def get_step(col: List):
        return 1 if multi_steps is None else min(len(col), multi_steps)

    return udf(get_step, LongType()) 
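A toy usage sketch, assuming the snippet's udf/LongType and typing imports are in scope; the data and column are made up.

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame([([1.0, 2.0, 3.0, 4.0],)], ["next_states_features"])
step_udf = make_get_step_udf(multi_steps=3)
# step = min(len([1.0, 2.0, 3.0, 4.0]), 3) = 3
toy.withColumn("step", step_udf("next_states_features")).show()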
Example #16
Source File: anomalies_detection_spark_streaming.py    From Hanhan-Spark-Python with MIT License
def addScore(self, df):
        cluster_dict = {}
        clusters_list = df.select("prediction").collect()
        for c in clusters_list:
            cluster_dict[c] = cluster_dict.setdefault(c,0.0)+1.0
        sorted_clusters = sorted(cluster_dict.items(), key=operator.itemgetter(1))  # sort by value
        n_max = sorted_clusters[-1][1]
        n_min = sorted_clusters[0][1]
        score_udf = udf(lambda p: float(n_max - cluster_dict.get(Row(p)))/(n_max - n_min), DoubleType())
        score_df = df.withColumn("score", score_udf(df.prediction))
        return score_df 
Example #17
Source File: base.py    From LearningApacheSpark with MIT License
def _transform(self, dataset):
        self.transformSchema(dataset.schema)
        transformUDF = udf(self.createTransformFunc(), self.outputDataType())
        transformedDataset = dataset.withColumn(self.getOutputCol(),
                                                transformUDF(dataset[self.getInputCol()]))
        return transformedDataset 
Example #18
Source File: functions.py    From SMV with Apache License 2.0
def smvCreateLookUp(m, default, outputType):
    """Return a Python UDF which will perform a dictionary lookup on a column

        Args:
            m (dictionary): a Python dictionary to be applied
            default (any): default value if dictionary lookup failed
            outputType (DataType): output value's data type

        Returns:
            (udf): an udf which can apply to a column and apply the lookup
    """
    return udf(lambda k: m.get(k, default), outputType) 
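A small usage sketch; the mapping, default value, and column names are illustrative, and the snippet's udf import is assumed to be in scope.

from pyspark.sql import SparkSession
from pyspark.sql.types import StringType

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("CA",), ("NY",), ("ZZ",)], ["state"])
lookup = smvCreateLookUp({"CA": "California", "NY": "New York"}, "Unknown", StringType())
df.withColumn("state_name", lookup(df.state)).show()  # "ZZ" falls back to "Unknown"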
Example #19
Source File: taar_ensemble.py    From telemetry-airflow with Mozilla Public License 2.0
def get_addons_per_client(users_df, minimum_addons_count):
    """ Extracts a DataFrame that contains one row
    for each client along with the list of active add-on GUIDs.
    """

    def is_valid_addon(addon):
        return not (
            addon.is_system
            or addon.app_disabled
            or addon.type != "extension"
            or addon.user_disabled
            or addon.foreign_install
            or addon.install_day is None
        )

    # may need additional whitelisting to remove shield addons

    def get_valid_addon_ids(addons):
        sorted_addons = sorted(
            [(a.addon_id, a.install_day) for a in addons if is_valid_addon(a)],
            key=lambda addon_tuple: addon_tuple[1],
        )
        return [addon_id for (addon_id, install_day) in sorted_addons]

    get_valid_addon_ids_udf = udf(get_valid_addon_ids, ArrayType(StringType()))

    # Create an add-ons dataset un-nesting the add-on map from each
    # user to a list of add-on GUIDs. Also filter undesired add-ons.
    return users_df.select(
        "client_id", get_valid_addon_ids_udf("active_addons").alias("addon_ids")
    ).filter(size("addon_ids") > minimum_addons_count) 
Example #20
Source File: data_fetcher.py    From ReAgent with BSD 3-Clause "New" or "Revised" License
def make_sparse2dense(df, col_name: str, possible_keys: List):
    """ Given a list of possible keys, convert sparse map to dense array.
        In our example, both value_type is assumed to be a float.
    """
    output_type = StructType(
        [
            StructField("presence", ArrayType(BooleanType()), False),
            StructField("dense", ArrayType(FloatType()), False),
        ]
    )

    def sparse2dense(map_col):
        assert isinstance(
            map_col, dict
        ), f"{map_col} has type {type(map_col)} and is not a dict."
        presence = []
        dense = []
        for key in possible_keys:
            val = map_col.get(key, None)
            if val is not None:
                presence.append(True)
                dense.append(float(val))
            else:
                presence.append(False)
                dense.append(0.0)
        return presence, dense

    sparse2dense_udf = udf(sparse2dense, output_type)
    df = df.withColumn(col_name, sparse2dense_udf(col_name))
    df = df.withColumn(f"{col_name}_presence", col(f"{col_name}.presence"))
    df = df.withColumn(col_name, col(f"{col_name}.dense"))
    return df


#################################################
# Below are some UDFs we use for preprocessing. #
################################################# 
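A toy usage sketch for make_sparse2dense, assuming the snippet's pyspark imports (udf, col, StructType, ArrayType, BooleanType, FloatType) are in scope; the map column and keys are made up.

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame([({"a": 1.0},)], ["action"])
toy = make_sparse2dense(toy, "action", possible_keys=["a", "b"])
# action          -> [1.0, 0.0]
# action_presence -> [true, false]
toy.show(truncate=False)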
Example #21
Source File: anomalies_detection_spark_streaming.py    From Hanhan-Spark-Python with MIT License
def cat2Num(self, df, indices):
        unique_values = []
        for i in indices:
            d = udf(lambda r: r[i], StringType())
            dt = df.select(d(df.rawFeatures)).distinct().collect()
            unique_values.extend(dt)

        unique_count = len(unique_values)
        convertUDF = udf(lambda r: to_onehot(r, indices, unique_values, unique_count), ArrayType(DoubleType()))
        newdf = df.withColumn("features", convertUDF(df.rawFeatures))

        return newdf 
Example #22
Source File: data_fetcher.py    From ReAgent with BSD 3-Clause "New" or "Revised" License
def make_next_udf(multi_steps: Optional[int], return_type):
    """ Generic udf to get next (after multi_steps) item, provided item type. """

    def get_next(next_col):
        return (
            next_col
            if multi_steps is None
            else next_col[min(len(next_col), multi_steps) - 1]
        )

    return udf(get_next, return_type) 
Example #23
Source File: data_fetcher.py    From ReAgent with BSD 3-Clause "New" or "Revised" License
def make_where_udf(arr: List[str]):
    """ Return index of item in arr, and len(arr) if not found. """

    def find(item: str):
        for i, arr_item in enumerate(arr):
            if arr_item == item:
                return i
        return len(arr)

    return udf(find, LongType()) 
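A toy usage sketch (assumes the snippet's udf/LongType imports); the action vocabulary is made up.

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
where_udf = make_where_udf(["up", "down", "left", "right"])
toy = spark.createDataFrame([("left",), ("jump",)], ["action"])
# "left" -> 2; "jump" is not in the list, so it maps to len(arr) == 4
toy.withColumn("action_idx", where_udf("action")).show()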
Example #24
Source File: data_fetcher.py    From ReAgent with BSD 3-Clause "New" or "Revised" License
def make_existence_bitvector_udf(arr: List[str]):
    """ one-hot encode elements of target depending on their existence in arr. """

    default = [0] * len(arr)

    def encode(target: List[str]):
        bitvec = default.copy()
        for i, arr_item in enumerate(arr):
            if arr_item in target:
                bitvec[i] = 1
        return bitvec

    return udf(encode, ArrayType(LongType())) 
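A toy usage sketch (assumes the snippet's udf/ArrayType/LongType imports); the vocabulary and column are made up.

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
bitvec_udf = make_existence_bitvector_udf(["up", "down", "left"])
toy = spark.createDataFrame([(["down", "jump"],)], ["possible_actions"])
# only "down" from the vocabulary appears in the target -> [0, 1, 0]
toy.withColumn("bitvec", bitvec_udf("possible_actions")).show()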
Example #25
Source File: data_fetcher.py    From ReAgent with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def parametric_action_preprocessing(
    df,
    actions: List[str],
    multi_steps: Optional[int] = None,
    include_possible_actions: bool = True,
):
    assert (
        not include_possible_actions
    ), "current we don't support include_possible_actions"

    next_map_udf = make_next_udf(multi_steps, MapType(LongType(), FloatType()))
    df = df.withColumn("next_action", next_map_udf("next_action"))

    def make_not_terminal_udf():
        """ Return true iff next_action is an empty map """

        def get_not_terminal(next_action):
            return len(next_action) > 0

        return udf(get_not_terminal, BooleanType())

    not_terminal_udf = make_not_terminal_udf()
    df = df.withColumn("not_terminal", not_terminal_udf("next_action"))

    df = make_sparse2dense(df, "action", actions)
    df = make_sparse2dense(df, "next_action", actions)
    return df 
Example #26
Source File: taar_ensemble.py    From python_mozetl with MIT License
def get_addons_per_client(users_df, minimum_addons_count):
    """ Extracts a DataFrame that contains one row
    for each client along with the list of active add-on GUIDs.
    """

    def is_valid_addon(addon):
        return not (
            addon.is_system
            or addon.app_disabled
            or addon.type != "extension"
            or addon.user_disabled
            or addon.foreign_install
            or addon.install_day is None
        )

    # may need additional whitelisting to remove shield addons

    def get_valid_addon_ids(addons):
        sorted_addons = sorted(
            [(a.addon_id, a.install_day) for a in addons if is_valid_addon(a)],
            key=lambda addon_tuple: addon_tuple[1],
        )
        return [addon_id for (addon_id, install_day) in sorted_addons]

    get_valid_addon_ids_udf = udf(get_valid_addon_ids, ArrayType(StringType()))

    # Create an add-ons dataset un-nesting the add-on map from each
    # user to a list of add-on GUIDs. Also filter undesired add-ons.
    return users_df.select(
        "client_id", get_valid_addon_ids_udf("active_addons").alias("addon_ids")
    ).filter(size("addon_ids") > minimum_addons_count) 
Example #27
Source File: test_imageIO.py    From spark-deep-learning with Apache License 2.0
def test_udf_schema(self):
        # Test that utility functions can be used to create a udf that accepts and return
        # imageSchema
        def do_nothing(imgRow):
            array = imageIO.imageStructToArray(imgRow)
            return imageIO.imageArrayToStruct(array)
        do_nothing_udf = udf(do_nothing, ImageSchema.imageSchema['image'].dataType)

        df = imageIO._readImagesWithCustomFn(
            "file/path", decode_f=imageIO.PIL_decode, numPartition=2, sc=self.binaryFilesMock)
        df = df.filter(col('image').isNotNull()).withColumn("test", do_nothing_udf('image'))
        self.assertEqual(df.first().test.data, array.tobytes())
        df.printSchema() 
Example #28
Source File: TutorialClasses.py    From KDD2019-HandsOn-Tutorial with MIT License
def cosineSimilarity(s1, s2):
  (m1, v1) = Base64ToFloatArray(s1)
  (m2, v2) = Base64ToFloatArray(s2)
  if (m1 == 0) or (m2 == 0):
    return 0
  else :
    return sum(x*y for x,y in zip(v1, v2))/(m1 * m2)

# Register udf functions so that they can be used in DataFrame operations
#
# Perform same computation as cosineSimilarity()
# 
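The registration itself is cut off in this snippet; a plausible sketch of what it might look like (the udf name and the DoubleType return type are assumptions, not taken from the original file) is:

from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

# Hypothetical registration mirroring the comment above.
cosineSimilarityUDF = udf(cosineSimilarity, DoubleType())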
Example #29
Source File: hostlinks_to_graph.py    From cc-pyspark with MIT License
def vertices_assign_ids(self, sc, sqlc, edges):
        source = edges.select(edges.s.alias('name'))
        target = edges.select(edges.t.alias('name'))

        ids = source.union(target) \
            .distinct()

        if self.args.validate_host_names:
            is_valid = sqlf.udf(HostLinksToGraph.reverse_host_is_valid,
                                BooleanType())
            ids = ids.filter(is_valid(ids.name))

        if self.args.vertex_partitions == 1:
            ids = ids \
                    .coalesce(1) \
                    .sort('name') \
                    .withColumn('id', sqlf.monotonically_increasing_id())
        else:
            id_rdd = ids.select(ids.name).rdd \
                        .map(lambda row: tuple(row)[0]) \
                        .sortBy(lambda x: x, True,
                                self.args.vertex_partitions) \
                        .zipWithIndex()
            id_schema = StructType([
                StructField("name", StringType(), True),
                StructField("id", LongType(), True)
            ])
            ids = sqlc.createDataFrame(id_rdd, schema=id_schema)

        if self.args.save_as_text is not None:
            ids = ids.persist()
            ids.select(sqlf.concat_ws('\t', ids.id, ids.name)) \
                .write \
                .text(os.path.join(self.args.save_as_text, "vertices"),
                      compression="gzip")
        ids.write \
           .format(self.args.output_format) \
           .option("compression", self.args.output_compression) \
           .saveAsTable(self.args.output + '_vertices')

        return ids 
Example #30
Source File: converter.py    From spark-sklearn with Apache License 2.0
def toPandas(self, df):
        """
        This is similar to the Spark DataFrame built-in toPandas() method, but it handles
        MLlib Vector columns differently.  It converts MLlib Vectors into rows of
        scipy.sparse.csr_matrix, which is generally friendlier for PyData tools like scikit-learn.

        .. note:: Experimental: This will likely be replaced in later releases with improved APIs.

        :param df: Spark DataFrame
        :return:  Pandas dataframe
        """
        cols = df.columns
        # Convert any MLlib Vector columns to scipy.sparse.csr_matrix
        matrixCols = []

        def toscipy(v):
            if isinstance(v, DenseVector):
                return csr_matrix((v.values, np.array(range(v.size)), np.array([0, v.size])),
                                  shape=(1, v.size))
            elif isinstance(v, SparseVector):
                return csr_matrix((v.values, v.indices, np.array([0, len(v.indices)])),
                                  shape=(1, v.size))
            else:
                raise TypeError("Converter.toPandas found unknown Vector type: %s" % type(v))
        tosparse = udf(lambda v: toscipy(v), CSRVectorUDT())
        for i in range(len(cols)):
            c = cols[i]
            if isinstance(df.schema.fields[i].dataType, VectorUDT):
                cols[i] = tosparse(df[c]).alias(c)
                matrixCols.append(c)
            else:
                cols[i] = df[c]
        return df.select(*cols).toPandas()
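A hedged usage sketch, assuming the spark-sklearn Converter class this method belongs to, an existing SparkContext sc, and a Spark DataFrame df containing an MLlib Vector column.

from spark_sklearn import Converter

converter = Converter(sc)           # sc: an existing SparkContext
pandas_df = converter.toPandas(df)  # Vector columns become one-row csr_matrix objects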