Python pyspark.sql.DataFrame() Examples

The following are 30 code examples of pyspark.sql.DataFrame(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pyspark.sql, or try the search function.
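Most of the examples below either build a DataFrame through an active SparkSession or wrap a JVM-side handle with the pyspark.sql.DataFrame constructor directly. A minimal sketch of both patterns (not taken from any of the projects below):

from pyspark.sql import DataFrame, SparkSession

spark = SparkSession.builder.getOrCreate()

# Ordinary user code builds DataFrames through the session.
df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "value"])

# Library code, as in several examples below, wraps a py4j handle to a JVM
# DataFrame together with the SQL context it belongs to.
wrapped = DataFrame(df._jdf, df.sql_ctx)
assert wrapped.count() == df.count()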
Example #1
Source File: helpers.py    From search-MjoLniR with MIT License 7 votes
def require_output_table(
        self, partition_spec_spec, metadata_fn=None,
        mode='overwrite',
    ):
        @self._post_process_transform.append
        def post(df: DataFrame, kwargs: Dict):
            mt.write_partition(
                df, kwargs['output_table'], kwargs['output_path'],
                self._resolve_partition_spec(kwargs, partition_spec_spec),
                mode=mode)
            if metadata_fn is not None:
                spark = df.sql_ctx.sparkSession
                metadata = metadata_fn(spark.read.parquet(kwargs['output_path']))
                write_metadata(kwargs['output_path'], metadata)

        self.add_argument('--output-table', required=True)
        self.add_argument('--output-path', required=True) 
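The @self._post_process_transform.append line above is ordinary Python rather than a Spark feature: list.append is used as a decorator, so the post callback is registered at definition time (append returns None, which is harmless because the name post is never used afterwards). A stripped-down illustration of the idiom, independent of the MjoLniR helpers:

callbacks = []

@callbacks.append
def post(df, kwargs):
    # Appended to `callbacks` when defined; the name `post` is rebound to
    # None because list.append returns None, but the callback list keeps
    # the function object.
    print("post-processing", kwargs)

callbacks[0](None, {"output_table": "example_table"})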
Example #2
Source File: helpers.py    From SMV with Apache License 2.0 6 votes
def smvExpandStruct(self, *cols):
        """Expand structure type column to a group of columns

            Args:
                cols (\*string): column names to expand

            Example:
                input DF:
                    [id: string, address: struct<state:string, zip:string, street:string>]

                >>> df.smvExpandStruct("address")

                output DF:
                    [id: string, state: string, zip: string, street: string]

            Returns:
                (DataFrame): DF with expanded columns
        """
        jdf = self._jPythonHelper.smvExpandStruct(self._jdf, smv_copy_array(self._sc, *cols))
        return DataFrame(jdf, self._sql_ctx) 
Example #3
Source File: norm_query_clustering.py    From search-MjoLniR with MIT License 6 votes
def filter_min_sessions_per_norm_query(min_sessions: int) -> mt.Transformer:
    def transform(df: DataFrame) -> DataFrame:
        w = Window.partitionBy('wikiid', 'norm_query')
        return (
            df.withColumn(
                'has_min_sessions',
                at_least_n_distinct('session_id', min_sessions).over(w))
            .where(F.col('has_min_sessions'))
            .drop('has_min_sessions'))
    return transform 
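at_least_n_distinct is a MjoLniR-specific helper. A comparable filter can be sketched with stock PySpark by counting distinct session_id values per (wikiid, norm_query) window using collect_set; this is an approximation of the helper, not its actual implementation:

from pyspark.sql import DataFrame, Window
from pyspark.sql import functions as F

def filter_min_sessions_plain(df: DataFrame, min_sessions: int) -> DataFrame:
    w = Window.partitionBy('wikiid', 'norm_query')
    # size(collect_set(...)) over a window gives the distinct count per group
    # without a separate aggregate-and-join step.
    return (
        df.withColumn('n_sessions', F.size(F.collect_set('session_id').over(w)))
        .where(F.col('n_sessions') >= min_sessions)
        .drop('n_sessions'))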
Example #4
Source File: feature_vectors.py    From search-MjoLniR with MIT License 6 votes
def resample_clicks_to_query_page(
    df_cluster: DataFrame,
    random_seed: Optional[int],
    samples_per_wiki: int
) -> mt.Transformer:
    # Resamples the click log by proxy of resampling clusters, such
    # that a complete cluster is either included or excluded from the
    # resulting dataset.
    # TODO: Evaluate alternative resampling, such as perhaps only dropping from
    # clusters where all clicks were to the top result (implying an "easy" search).

    mt.check_schema(df_cluster, mt.QueryClustering)
    return mt.seq_transform([
        # Grab only the parts of the query log we need to make the resulting sampled QueryPage
        lambda df: df.select('query', 'wikiid', 'session_id', 'hit_page_ids'),
        mt.join_cluster_by_query(df_cluster),
        # [1] is because sample returns a tuple of (page_counts, df)
        mt.temp_rename_col('cluster_id', 'norm_query_id', lambda df: mjolnir.sampling.sample(
            df, random_seed, samples_per_wiki)[1]),
        lambda df: df.withColumn(
            'page_id', F.explode('hit_page_ids')).drop('hit_page_ids')
    ]) 
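mt.seq_transform chains DataFrame-to-DataFrame functions into a single transformer; conceptually it is left-to-right function composition. A minimal sketch of such a combinator (an assumption about its behavior, not the mjolnir.transform source):

from typing import Callable, Sequence
from pyspark.sql import DataFrame

Transformer = Callable[[DataFrame], DataFrame]

def seq_transform(transformers: Sequence[Transformer]) -> Transformer:
    """Compose transformers so the output of each feeds the next."""
    def transform(df: DataFrame) -> DataFrame:
        for fn in transformers:
            df = fn(df)
        return df
    return transform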
Example #5
Source File: feature_engineering.py    From search-MjoLniR with MIT License 6 votes
def explode_features(df, features=None):
    """Convert feature vector into individual columns

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    features : list of str or None

    Returns
    -------
    pyspark.sql.DataFrame
    """
    if features is None:
        features = df.schema['features'].metadata['features']

    def extract_feature(features, idx):
        return float(features[idx])
    extract_feature_udf = F.udf(extract_feature, pyspark.sql.types.FloatType())
    cols = [extract_feature_udf('features', F.lit(idx)).alias(name) for idx, name in enumerate(features)]
    return df.select('*', *cols) 
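explode_features expects the 'features' column to carry the list of feature names in its column metadata. That metadata can be attached with Column.alias(metadata=...), so a hypothetical end-to-end usage might look like the following (the feature names tf_idf and page_rank are invented for illustration, and an active SparkSession named spark is assumed):

from pyspark.ml.linalg import Vectors
from pyspark.sql import functions as F

df = spark.createDataFrame(
    [(Vectors.dense([0.5, 1.5]),), (Vectors.dense([2.0, 3.0]),)], ['features'])
# Attach the feature-name metadata the helper reads from the schema.
df = df.select(F.col('features').alias(
    'features', metadata={'features': ['tf_idf', 'page_rank']}))
exploded = explode_features(df)  # adds float columns tf_idf and page_rank
exploded.show()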
Example #6
Source File: feature_vectors.py    From search-MjoLniR with MIT License 6 votes
def transform(
    query_clicks: HivePartition,
    query_clustering: HivePartition,
    samples_per_wiki: int,
    random_seed: Optional[int],
    wikis: List[str],
    brokers: str,
    topic_request: str,
    topic_response: str,
    feature_set: str,
    **kwargs
) -> DataFrame:
    transformer = mt.seq_transform([
        mt.restrict_wikis(wikis),
        resample_clicks_to_query_page(
            query_clustering.df, random_seed, samples_per_wiki),
        feature_vectors.transformer(
            brokers, topic_request, topic_response, feature_set)
    ])
    return transformer(query_clicks.df) 
Example #7
Source File: feature_engineering.py    From search-MjoLniR with MIT License 6 votes
def append_features(df, *cols):
    """Append features from columns to the features vector.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    cols : list of str

    Returns
    -------
    pyspark.sql.DataFrame
    """
    def add_features(feat, *other):
        raw = feat.toArray()
        return Vectors.dense(np.append(raw, list(map(float, other))))
    add_features_udf = F.udf(add_features, VectorUDT())
    new_feat_list = df.schema['features'].metadata['features'] + cols
    return df.withColumn('features', mjolnir.spark.add_meta(
        df._sc, add_features_udf('features', *cols), {'features': new_feat_list})) 
Example #8
Source File: feature_vectors.py    From search-MjoLniR with MIT License 6 votes
def collect_features(
    kafka_config: ClientConfig, feature_set: str
) -> mt.Transformer:
    def transform(df: DataFrame) -> DataFrame:
        df_features, fnames_accu = mjolnir.features.collect(
            df,
            model='featureset:' + feature_set,
            brokers=kafka_config,
            indices=mt.ContentIndices())
        # Collect the accumulator to get feature names
        df_features.cache().count()
        # Future transformations have to be extra careful to not lose this metadata
        return _add_meta(df_features, 'features', {
            'feature_set': feature_set,
            'features': _check_features(fnames_accu),
            'collected_at': datetime.datetime.now().isoformat()
        })
    return transform 
Example #9
Source File: feature_selection.py    From search-MjoLniR with MIT License 6 votes
def select_features(
    wiki: str,
    num_features: int,
    metadata: Dict
) -> mt.Transformer:
    def transform(df: DataFrame) -> DataFrame:
        # Compute the "best" features, per some metric
        sc = df.sql_ctx.sparkSession.sparkContext
        features = metadata['input_feature_meta']['features']
        selected = mjolnir.feature_engineering.select_features(
            sc, df, features, num_features, algo='mrmr')
        metadata['wiki_features'][wiki] = selected

        # Rebuild the `features` col with only the selected features
        keep_cols = metadata['default_cols'] + selected
        df_selected = df.select(*keep_cols)
        assembler = VectorAssembler(
            inputCols=selected, outputCol='features')
        return assembler.transform(df_selected).drop(*selected)
    return transform 
Example #10
Source File: feature_selection.py    From search-MjoLniR with MIT License 6 votes
def transformer(
    df_label: DataFrame,
    temp_dir: str,
    wikis: List[str],
    num_features: int
) -> mt.Transformer:
    mt.check_schema(df_label, mt.LabeledQueryPage)

    # Hack to transfer metadata between transformations. This is populated in
    # time since `select_features` does direct computation of the features.
    metadata = cast(Dict, {'wiki_features': {}})

    return mt.seq_transform([
        mt.restrict_wikis(wikis),
        mt.join_labels(df_label),
        explode_features(metadata),
        mt.cache_to_disk(temp_dir, partition_by='wikiid'),
        mt.for_each_item('wikiid', wikis, lambda wiki: select_features(
            wiki, num_features, metadata)),
        attach_feature_metadata(metadata),
        # While we used the labels for selecting features, they are not part of the feature vectors.
        # Allow them to be joined with any other label set for export to training.
        lambda df: df.drop('cluster_id', 'label'),
        lambda df: df.repartition(200, 'wikiid', 'query'),
    ]) 
Example #11
Source File: make_folds.py    From search-MjoLniR with MIT License 6 votes
def convert_svmrank_to_xgboost(df: DataFrame) -> DataFrame:
    def convert_one(row: Row) -> Row:
        # For now place the .xgb right next to the svmrank files. Naming/path
        # options could be added if needed later.
        out_path = row.path + '.xgb'
        _convert_xgboost_remote(row.path, out_path)
        return Row(**dict(
            row.asDict(),
            vec_format='xgboost',
            path=out_path))

    # Each row represents potentially gigabytes, convince spark
    # to create a partition per row.
    rdd_xgb = mt.partition_per_row(df.rdd).map(convert_one)
    df_xgb = df.sql_ctx.createDataFrame(rdd_xgb, df.schema)  # type: ignore
    # Return both the xgb and svmrank datasets since
    # we aren't purging the related files. df is safe to reuse since
    # svmrank conversion returns a new dataframe with no lineage.
    return df.union(df_xgb) 
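mt.partition_per_row is a project helper; the intent is to give each row its own Spark partition so the per-row conversion work is spread across executors. A rough sketch of how such a helper could be written with standard RDD operations (an assumption, not the MjoLniR implementation):

from pyspark import RDD

def partition_per_row(rdd: RDD) -> RDD:
    """Repartition so that every row lands in its own partition."""
    num_rows = rdd.count()
    return (
        rdd.zipWithIndex()                     # (row, index)
        .map(lambda pair: (pair[1], pair[0]))  # (index, row) for partitionBy
        .partitionBy(num_rows)                 # one partition per index
        .values())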
Example #12
Source File: sys_exec.py    From cadCAD with MIT License 6 votes
def to_spark_df(rdd: RDD, spark: SparkSession, init_condition: dict = None):
    # Typeful
    if init_condition is not None:
        return to_spark(rdd, init_condition)
    # Typeless
    else:
        spark.conf.set("spark.sql.execution.arrow.enabled", "true")
        spark.conf.set("spark.sql.execution.arrow.fallback.enabled", "true")
        warnings.simplefilter(action='ignore', category=UserWarning)
        pdf_from_rdd: DataFrame = to_pandas(rdd)
        result = spark.createDataFrame(pdf_from_rdd)
        del pdf_from_rdd
        return result 
Example #13
Source File: clustering.py    From LearningApacheSpark with MIT License 6 votes
def assignClusters(self, dataset):
        """
        Runs the PIC algorithm and returns a cluster assignment for each input vertex.

        :param dataset:
          A dataset with columns src, dst, weight representing the affinity matrix,
          which is the matrix A in the PIC paper. Suppose the src column value is i,
          the dst column value is j, and the weight column value is the similarity s_ij,
          which must be nonnegative. This is a symmetric matrix and hence
          s_ij = s_ji. For any (i, j) with nonzero similarity, there should be
          either (i, j, s_ij) or (j, i, s_ji) in the input. Rows with i = j are
          ignored, because we assume s_ij = 0.0.

        :return:
          A dataset that contains columns of vertex id and the corresponding cluster for
          the id. The schema of it will be:
          - id: Long
          - cluster: Int

        .. versionadded:: 2.4.0
        """
        self._transfer_params_to_java()
        jdf = self._java_obj.assignClusters(dataset._jdf)
        return DataFrame(jdf, dataset.sql_ctx) 
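A hypothetical usage sketch for this wrapper via pyspark.ml.clustering.PowerIterationClustering, building the (src, dst, weight) affinity DataFrame described above (assuming an active SparkSession named spark):

from pyspark.ml.clustering import PowerIterationClustering

affinities = spark.createDataFrame(
    [(0, 1, 1.0), (0, 2, 1.0), (1, 2, 1.0), (3, 4, 1.0)],
    ['src', 'dst', 'weight'])
pic = PowerIterationClustering(k=2, maxIter=20, weightCol='weight')
assignments = pic.assignClusters(affinities)  # DataFrame with columns id, cluster
assignments.show()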
Example #14
Source File: common.py    From LearningApacheSpark with MIT License 6 votes
def _py2java(sc, obj):
    """ Convert Python object into Java """
    if isinstance(obj, RDD):
        obj = _to_java_object_rdd(obj)
    elif isinstance(obj, DataFrame):
        obj = obj._jdf
    elif isinstance(obj, SparkContext):
        obj = obj._jsc
    elif isinstance(obj, list):
        obj = [_py2java(sc, x) for x in obj]
    elif isinstance(obj, JavaObject):
        pass
    elif isinstance(obj, (int, long, float, bool, bytes, unicode)):
        pass
    else:
        data = bytearray(PickleSerializer().dumps(obj))
        obj = sc._jvm.org.apache.spark.ml.python.MLSerDe.loads(data)
    return obj 
Example #15
Source File: recommendation.py    From LearningApacheSpark with MIT License 6 votes
def _prepare(cls, ratings):
        if isinstance(ratings, RDD):
            pass
        elif isinstance(ratings, DataFrame):
            ratings = ratings.rdd
        else:
            raise TypeError("Ratings should be represented by either an RDD or a DataFrame, "
                            "but got %s." % type(ratings))
        first = ratings.first()
        if isinstance(first, Rating):
            pass
        elif isinstance(first, (tuple, list)):
            ratings = ratings.map(lambda x: Rating(*x))
        else:
            raise TypeError("Expect a Rating or a tuple/list, but got %s." % type(first))
        return ratings 
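_prepare is used by ALS.train and ALS.trainImplicit to normalize the ratings input, so callers can pass either an RDD of Rating objects or a DataFrame of (user, product, rating) rows. A hedged usage sketch (assuming an active SparkSession named spark):

from pyspark.mllib.recommendation import ALS

ratings_df = spark.createDataFrame(
    [(1, 10, 4.0), (1, 20, 3.0), (2, 10, 5.0)],
    ['user', 'product', 'rating'])
# The DataFrame path in _prepare converts rows to Rating tuples via ratings.rdd.
model = ALS.train(ratings_df, rank=5, iterations=5)
print(model.predict(2, 20))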
Example #16
Source File: common.py    From LearningApacheSpark with MIT License 6 votes
def _py2java(sc, obj):
    """ Convert Python object into Java """
    if isinstance(obj, RDD):
        obj = _to_java_object_rdd(obj)
    elif isinstance(obj, DataFrame):
        obj = obj._jdf
    elif isinstance(obj, SparkContext):
        obj = obj._jsc
    elif isinstance(obj, list):
        obj = [_py2java(sc, x) for x in obj]
    elif isinstance(obj, JavaObject):
        pass
    elif isinstance(obj, (int, long, float, bool, bytes, unicode)):
        pass
    else:
        data = bytearray(PickleSerializer().dumps(obj))
        obj = sc._jvm.org.apache.spark.mllib.api.python.SerDe.loads(data)
    return obj 
Example #17
Source File: tests.py    From LearningApacheSpark with MIT License 6 votes
def test_gaussian_mixture_summary(self):
        data = [(Vectors.dense(1.0),), (Vectors.dense(5.0),), (Vectors.dense(10.0),),
                (Vectors.sparse(1, [], []),)]
        df = self.spark.createDataFrame(data, ["features"])
        gmm = GaussianMixture(k=2)
        model = gmm.fit(df)
        self.assertTrue(model.hasSummary)
        s = model.summary
        self.assertTrue(isinstance(s.predictions, DataFrame))
        self.assertEqual(s.probabilityCol, "probability")
        self.assertTrue(isinstance(s.probability, DataFrame))
        self.assertEqual(s.featuresCol, "features")
        self.assertEqual(s.predictionCol, "prediction")
        self.assertTrue(isinstance(s.cluster, DataFrame))
        self.assertEqual(len(s.clusterSizes), 2)
        self.assertEqual(s.k, 2)
        self.assertEqual(s.numIter, 3) 
Example #18
Source File: helpers.py    From SMV with Apache License 2.0 6 votes
def smvTopNRecs(self, maxElems, *cols):
        """For each group, return the top N records according to a given ordering

            Example:

                >>> df.smvGroupBy("id").smvTopNRecs(3, col("amt").desc())

                This will keep the 3 largest amt records for each id

            Args:
                maxElems (int): maximum number of records per group
                cols (\*str): columns defining the ordering

            Returns:
                (DataFrame): result of taking top records from groups

        """
        return DataFrame(self.sgd.smvTopNRecs(maxElems, smv_copy_array(self.df._sc, *cols)), self.df.sql_ctx) 
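For comparison, the same "top N per group" result can be approximated with stock PySpark window functions; a sketch on a small made-up table (tie handling may differ from the SMV helper, and an active SparkSession named spark is assumed):

from pyspark.sql import Window
from pyspark.sql import functions as F

df = spark.createDataFrame(
    [(1, 10.0), (1, 25.0), (1, 5.0), (1, 40.0), (2, 7.0)], ['id', 'amt'])
w = Window.partitionBy('id').orderBy(F.col('amt').desc())
top3 = (
    df.withColumn('rn', F.row_number().over(w))
    .where(F.col('rn') <= 3)
    .drop('rn'))
top3.show()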
Example #19
Source File: feature_engineering.py    From search-MjoLniR with MIT License 6 votes
def zero_features(df, *feature_names):
    """Zero out features in the feature vector.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    feature_names : list of str

    Returns
    -------
    pyspark.sql.DataFrame
    """
    features = df.schema['features'].metadata['features']
    idxs = [features.index(name) for name in feature_names]

    def zero_features(feat):
        raw = feat.toArray()
        for idx in idxs:
            raw[idx] = 0.
        return Vectors.dense(raw)
    zero_features_udf = F.udf(zero_features, VectorUDT())
    return df.withColumn('features', mjolnir.spark.add_meta(
        df._sc, zero_features_udf('features'), {'features': features})) 
Example #20
Source File: helpers.py    From SMV with Apache License 2.0 6 votes
def smvPivotSum(self, pivotCols, valueCols, baseOutput):
        """Perform SmvPivot, then sum the results.
            Please refer to smvPivot's documentation for context and details of the SmvPivot operation.

            Args:
                pivotCols (list(list(str))): list of lists of column names to pivot
                valueCols (list(string)): names of value columns to sum
                baseOutput (list(str)): expected names of the pivoted output columns

            Examples:
                For example, given a DataFrame df that represents the table

                +-----+-------+---------+-------+
                | id  | month | product | count |
                +=====+=======+=========+=======+
                | 1   | 5/14  |   A     |   100 |
                +-----+-------+---------+-------+
                | 1   | 6/14  |   B     |   200 |
                +-----+-------+---------+-------+
                | 1   | 5/14  |   B     |   300 |
                +-----+-------+---------+-------+

                we can use

                >>> df.smvGroupBy("id").smvPivotSum([["month", "product"]], ["count"], ["5_14_A", "5_14_B", "6_14_A", "6_14_B"])

                to produce the following output

                +-----+--------------+--------------+--------------+--------------+
                | id  | count_5_14_A | count_5_14_B | count_6_14_A | count_6_14_B |
                +=====+==============+==============+==============+==============+
                | 1   | 100          | 300          | NULL         | 200          |
                +-----+--------------+--------------+--------------+--------------+

            Returns:
                (DataFrame): result of pivot sum
        """
        return DataFrame(self.sgd.smvPivotSum(smv_copy_array(self.df._sc, *pivotCols), smv_copy_array(self.df._sc, *valueCols), smv_copy_array(self.df._sc, *baseOutput)), self.df.sql_ctx) 
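A comparable pivot-and-sum can be sketched with the stock DataFrame API by first building the pivoted column name from the table above; output column names will differ slightly from the SMV version (no count_ prefix), and an active SparkSession named spark is assumed:

from pyspark.sql import functions as F

df = spark.createDataFrame(
    [(1, '5/14', 'A', 100), (1, '6/14', 'B', 200), (1, '5/14', 'B', 300)],
    ['id', 'month', 'product', 'count'])
pivoted = (
    df.withColumn(
        'month_product',
        F.concat_ws('_', F.regexp_replace('month', '/', '_'), 'product'))
    .groupBy('id')
    .pivot('month_product', ['5_14_A', '5_14_B', '6_14_A', '6_14_B'])
    .sum('count'))
pivoted.show()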
Example #21
Source File: helpers.py    From SMV with Apache License 2.0 6 votes
def smvRePartition(self, numParts):
        """Repartition SmvGroupedData using specified partitioner on the keys. A
            HashPartitioner with the specified number of partitions will be used.

            This method is used in cases where the key-space is very large. In the
            current Spark DF's groupBy method, the entire key-space is loaded into
            an executor's memory, which is dangerous when the key-space is big.
            The regular DF's repartition function doesn't solve this issue since a random
            repartition is not guaranteed to reduce the key-space on each executor.
            In that case we need to use this function to linearly reduce the key-space.

            Example:

            >>> df.smvGroupBy("k1", "k2").smvRePartition(32).agg(sum("v") as "v")
        """
        jgdadp = self.sgd.smvRePartition(numParts)
        df = DataFrame(jgdadp.toDF(), self.df.sql_ctx)
        return SmvGroupedData(df, self.keys, jgdadp) 
Example #22
Source File: helpers.py    From SMV with Apache License 2.0 6 votes
def smvPercentRank(self, value_cols, ignoreNull=True):
        """Compute the percent rank of a sequence of columns within a group in a given DataFrame.

            Uses Spark's `percentRank` window function. The percent rank is defined as
            `R/(N-1)`, where `R` is the base 0 rank, and `N` is the population size. Under
            this definition, min value (R=0) has percent rank `0.0`, and max value has percent
            rank `1.0`.

            For each column on which the percent rank is computed (e.g. "v"), an additional
            column, `v_pctrnk`, is added to the output.

            All other columns in the input are untouched and propagated to the output.

            Args:
                value_cols (list(str)): columns to calculate percentRank on
                ignoreNull (boolean): if true, null values' percent ranks will be null; otherwise,
                    since Spark's sort treats null as smaller than any value, their percent ranks
                    will be zero. Default true.

            Example:
                >>> df.smvGroupBy('g', 'g2').smvPercentRank(["v1", "v2", "v3"])
        """
        return DataFrame(self.sgd.smvPercentRank(smv_copy_array(self.df._sc, *value_cols), ignoreNull), self.df.sql_ctx) 
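The underlying calculation can also be written directly with the built-in percent_rank window function; a minimal sketch for a single value column v grouped by g and g2 (the SMV helper's ignoreNull handling is not reproduced here, and an active SparkSession named spark is assumed):

from pyspark.sql import Window
from pyspark.sql import functions as F

df = spark.createDataFrame(
    [('a', 'x', 1.0), ('a', 'x', 2.0), ('a', 'x', 3.0)], ['g', 'g2', 'v'])
w = Window.partitionBy('g', 'g2').orderBy('v')
ranked = df.withColumn('v_pctrnk', F.percent_rank().over(w))
ranked.show()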
Example #23
Source File: clustering.py    From LearningApacheSpark with MIT License 5 votes
def cluster(self):
        """
        DataFrame of predicted cluster centers for each training data point.
        """
        return self._call_java("cluster") 
Example #24
Source File: clustering.py    From LearningApacheSpark with MIT License 5 votes
def predictions(self):
        """
        DataFrame produced by the model's `transform` method.
        """
        return self._call_java("predictions") 
Example #25
Source File: regression.py    From LearningApacheSpark with MIT License 5 votes
def evaluate(self, dataset):
        """
        Evaluates the model on a test dataset.

        :param dataset:
          Test dataset to evaluate model on, where dataset is an
          instance of :py:class:`pyspark.sql.DataFrame`
        """
        if not isinstance(dataset, DataFrame):
            raise ValueError("dataset must be a DataFrame but got %s." % type(dataset))
        java_lr_summary = self._call_java("evaluate", dataset)
        return LinearRegressionSummary(java_lr_summary) 
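A hypothetical usage sketch for evaluate(), fitting on one labeled DataFrame and evaluating on a held-out one (the tiny datasets here are invented for illustration, and an active SparkSession named spark is assumed):

from pyspark.ml.linalg import Vectors
from pyspark.ml.regression import LinearRegression

train_df = spark.createDataFrame(
    [(Vectors.dense([1.0]), 2.0), (Vectors.dense([2.0]), 4.1),
     (Vectors.dense([3.0]), 6.2), (Vectors.dense([4.0]), 7.9)],
    ['features', 'label'])
test_df = spark.createDataFrame(
    [(Vectors.dense([5.0]), 10.1), (Vectors.dense([6.0]), 11.8)],
    ['features', 'label'])
lr = LinearRegression(featuresCol='features', labelCol='label')
model = lr.fit(train_df)
summary = model.evaluate(test_df)  # LinearRegressionSummary
print(summary.rootMeanSquaredError, summary.r2)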
Example #26
Source File: regression.py    From LearningApacheSpark with MIT License 5 votes
def evaluateEachIteration(self, dataset, loss):
        """
        Method to compute error or loss for every iteration of gradient boosting.

        :param dataset:
            Test dataset to evaluate model on, where dataset is an
            instance of :py:class:`pyspark.sql.DataFrame`
        :param loss:
            The loss function used to compute error.
            Supported options: squared, absolute
        """
        return self._call_java("evaluateEachIteration", dataset, loss) 
Example #27
Source File: regression.py    From LearningApacheSpark with MIT License 5 votes
def evaluate(self, dataset):
        """
        Evaluates the model on a test dataset.

        :param dataset:
          Test dataset to evaluate model on, where dataset is an
          instance of :py:class:`pyspark.sql.DataFrame`
        """
        if not isinstance(dataset, DataFrame):
            raise ValueError("dataset must be a DataFrame but got %s." % type(dataset))
        java_glr_summary = self._call_java("evaluate", dataset)
        return GeneralizedLinearRegressionSummary(java_glr_summary) 
Example #28
Source File: regression.py    From LearningApacheSpark with MIT License 5 votes
def numInstances(self):
        """
        Number of instances in DataFrame predictions.
        """
        return self._call_java("numInstances") 
Example #29
Source File: util.py    From LearningApacheSpark with MIT License 5 votes
def convertMatrixColumnsFromML(dataset, *cols):
        """
        Converts matrix columns in an input DataFrame to the
        :py:class:`pyspark.mllib.linalg.Matrix` type from the new
        :py:class:`pyspark.ml.linalg.Matrix` type under the `spark.ml`
        package.

        :param dataset:
          input dataset
        :param cols:
          a list of matrix columns to be converted.
          Old matrix columns will be ignored. If unspecified, all new
          matrix columns will be converted except nested ones.
        :return:
          the input dataset with new matrix columns converted to the
          old matrix type

        >>> import pyspark
        >>> from pyspark.ml.linalg import Matrices
        >>> from pyspark.mllib.util import MLUtils
        >>> df = spark.createDataFrame(
        ...     [(0, Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]),
        ...     Matrices.dense(2, 2, range(4)))], ["id", "x", "y"])
        >>> r1 = MLUtils.convertMatrixColumnsFromML(df).first()
        >>> isinstance(r1.x, pyspark.mllib.linalg.SparseMatrix)
        True
        >>> isinstance(r1.y, pyspark.mllib.linalg.DenseMatrix)
        True
        >>> r2 = MLUtils.convertMatrixColumnsFromML(df, "x").first()
        >>> isinstance(r2.x, pyspark.mllib.linalg.SparseMatrix)
        True
        >>> isinstance(r2.y, pyspark.ml.linalg.DenseMatrix)
        True
        """
        if not isinstance(dataset, DataFrame):
            raise TypeError("Input dataset must be a DataFrame but got {}.".format(type(dataset)))
        return callMLlibFunc("convertMatrixColumnsFromML", dataset, list(cols)) 
Example #30
Source File: helpers.py    From search-MjoLniR with MIT License 5 votes
def _wiki_features(df: DataFrame, wiki: str) -> List[str]:
    meta = df.schema['features'].metadata
    if 'wiki_features' in meta:
        return meta['wiki_features'][wiki]
    else:
        return meta['features']