Python pyspark.sql.functions.row_number() Examples

The following are code examples of pyspark.sql.functions.row_number(), drawn from open-source projects. The source file, project, and license are noted above each example. You may also want to check out the other available functions and classes of the module pyspark.sql.functions.
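For orientation, here is a minimal, self-contained sketch of the pattern most of the examples below share: number rows within each partition with row_number() over a Window, then keep only the top-ranked rows. The data and column names are made up for illustration.

from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import col, row_number

spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame(
    [("alice", "song_a", 3), ("alice", "song_b", 5), ("bob", "song_c", 1)],
    ["user", "item", "count"],
)

# Number rows per user, highest count first, then keep only the top row per user.
window = Window.partitionBy("user").orderBy(col("count").desc())
top_per_user = (
    df.withColumn("rank", row_number().over(window))
      .where(col("rank") == 1)
      .drop("rank")
)
top_per_user.show()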
Example #1
Source File: candidate_sets.py    From listenbrainz-server with GNU General Public License v2.0
def get_top_artists(mapped_listens_subset, top_artist_limit):
    """ Get top artists listened to by users who have a listening history in
        the past X days where X = RECOMMENDATION_GENERATION_WINDOW.

        Args:
            mapped_listens_subset (dataframe): A subset of mapped_df containing user listening history.
            top_artist_limit (int): number of top artists to calculate

        Returns:
            top_artists_df (dataframe): Top Y artists listened to by a user for all users where
                                        Y = TOP_ARTISTS_LIMIT
    """
    df = mapped_listens_subset.select('mb_artist_credit_id', 'msb_artist_credit_name_matchable', 'user_name') \
                              .groupBy('mb_artist_credit_id', 'msb_artist_credit_name_matchable', 'user_name') \
                              .agg(func.count('mb_artist_credit_id').alias('count'))

    window = Window.partitionBy('user_name').orderBy(col('count').desc())

    top_artists_df = df.withColumn('rank', row_number().over(window)) \
                       .where(col('rank') <= top_artist_limit) \
                       .select('mb_artist_credit_id', 'msb_artist_credit_name_matchable', 'user_name')

    return top_artists_df 
Example #2
Source File: sampler.py    From python_mozetl with MIT License
def transform(landfill, n_documents=1000):
    meta_schema = StructType(
        [StructField(k, StringType(), True) for k in META_WHITELIST]
    )

    schema = StructType(
        [
            StructField("namespace", StringType(), False),
            StructField("doc_type", StringType(), False),
            StructField("doc_version", StringType(), True),
            StructField("doc_id", StringType(), True),
            StructField("meta", meta_schema, False),
            StructField("content", StringType(), False),
        ]
    )

    documents = (
        landfill.map(_process)
        .filter(lambda x: x[0] and x[1] and x[-2] and x[-1])
        .toDF(schema)
    )

    window_spec = Window.partitionBy("namespace", "doc_type", "doc_version").orderBy(
        "doc_id"
    )

    df = (
        documents.fillna("0", "doc_version")
        .withColumn("row_id", row_number().over(window_spec))
        .where(col("row_id") <= n_documents)
        .drop("row_id")
    )

    return df 
Example #3
Source File: spark_evaluation.py    From azure-python-labs with MIT License
def _get_top_k_items(
        dataframe,
        col_user=DEFAULT_USER_COL,
        col_item=DEFAULT_ITEM_COL,
        col_rating=DEFAULT_RATING_COL,
        col_prediction=PREDICTION_COL,
        k=DEFAULT_K
):
    """Get the input customer-item-rating tuple in the format of Spark
    DataFrame, output a Spark DataFrame in the dense format of top k items
    for each user.
    NOTE: if it is implicit rating, just append a column of constants to be ratings.

    Args:
        dataframe (spark.DataFrame): DataFrame of rating data (in the format of
        customerID-itemID-rating tuple).
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_prediction (str): column name for prediction.
        k (int): number of items for each user.

    Return:
        spark.DataFrame: DataFrame of top k items for each user.
    """
    window_spec = Window.partitionBy(col_user).orderBy(col(col_rating).desc())

    # NOTE: row_number() breaks ties arbitrarily, so equal ratings are not ordered deterministically.
    items_for_user = (
        dataframe.select(
            col_user,
            col_item,
            col_rating,
            row_number().over(window_spec).alias("rank")
        )
        .where(col("rank") <= k)
        .groupby(col_user)
        .agg(F.collect_list(col_item).alias(col_prediction))
    )

    return items_for_user 
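As a self-contained illustration of the pattern in this function (toy data; explicit column names stand in for the module's DEFAULT_* constants), the rows are ranked per user by rating, filtered to the top k, and collected into a single list column.

from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F
from pyspark.sql.functions import col, row_number

spark = SparkSession.builder.getOrCreate()

ratings = spark.createDataFrame(
    [(1, "a", 5.0), (1, "b", 3.0), (1, "c", 4.0), (2, "b", 2.0)],
    ["userID", "itemID", "rating"],
)

window_spec = Window.partitionBy("userID").orderBy(col("rating").desc())

top_k_items = (
    ratings.select("userID", "itemID", "rating", row_number().over(window_spec).alias("rank"))
    .where(col("rank") <= 2)
    .groupby("userID")
    .agg(F.collect_list("itemID").alias("prediction"))
)
top_k_items.show()  # one row per user with a list of up to 2 items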
Example #4
Source File: window.py    From koalas with Apache License 2.0
def sum(self):
        def sum(scol):
            return F.when(
                F.row_number().over(self._unbounded_window) >= self._min_periods,
                F.sum(scol).over(self._window),
            ).otherwise(F.lit(None))

        return self._apply_as_series_or_frame(sum) 
Example #5
Source File: window.py    From koalas with Apache License 2.0
def min(self):
        def min(scol):
            return F.when(
                F.row_number().over(self._unbounded_window) >= self._min_periods,
                F.min(scol).over(self._window),
            ).otherwise(F.lit(None))

        return self._apply_as_series_or_frame(min) 
Example #6
Source File: window.py    From koalas with Apache License 2.0
def max(self):
        def max(scol):
            return F.when(
                F.row_number().over(self._unbounded_window) >= self._min_periods,
                F.max(scol).over(self._window),
            ).otherwise(F.lit(None))

        return self._apply_as_series_or_frame(max) 
Example #7
Source File: window.py    From koalas with Apache License 2.0
def mean(self):
        def mean(scol):
            return F.when(
                F.row_number().over(self._unbounded_window) >= self._min_periods,
                F.mean(scol).over(self._window),
            ).otherwise(F.lit(None))

        return self._apply_as_series_or_frame(mean) 
Example #8
Source File: window.py    From koalas with Apache License 2.0
def std(self):
        def std(scol):
            return F.when(
                F.row_number().over(self._unbounded_window) >= self._min_periods,
                F.stddev(scol).over(self._window),
            ).otherwise(F.lit(None))

        return self._apply_as_series_or_frame(std) 
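Examples #4 through #8 all apply the same gating trick: row_number() over an unbounded window tells how many rows have been seen so far, and the expanding aggregate is masked to null until at least min_periods rows are available. A standalone sketch of that gating with toy data (names here are illustrative, not the Koalas internals):

from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()

sdf = spark.createDataFrame([(i, float(i)) for i in range(5)], ["idx", "value"])

min_periods = 2
w_order = Window.orderBy("idx")
w_expanding = Window.orderBy("idx").rowsBetween(Window.unboundedPreceding, Window.currentRow)

# Expanding sum that stays null until at least `min_periods` rows have been seen.
expanding_sum = F.when(
    F.row_number().over(w_order) >= min_periods,
    F.sum("value").over(w_expanding),
).otherwise(F.lit(None))

sdf.withColumn("expanding_sum", expanding_sum).show()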
Example #9
Source File: spark_evaluation.py    From azure-python-labs with MIT License
def _get_relevant_items_by_timestamp(
        dataframe,
        col_user=DEFAULT_USER_COL,
        col_item=DEFAULT_ITEM_COL,
        col_rating=DEFAULT_RATING_COL,
        col_timestamp=DEFAULT_TIMESTAMP_COL,
        col_prediction=PREDICTION_COL,
        k=DEFAULT_K
):
    """Get relevant items for each customer defined by timestamp.

    Relevant items are defined as the k items that appear most recently
    according to the timestamps.

    Args:
        dataframe (spark.DataFrame): A Spark DataFrame of customerID-itemID-rating-timeStamp
            tuples.
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_timestamp (str): column name for timestamp.
        col_prediction (str): column name for prediction.
        k (int): number of relevant items to be filtered by the function.

    Return:
        spark.DataFrame: DataFrame of customerID-itemID-rating tuples with only relevant items.
    """
    window_spec = Window.partitionBy(col_user).orderBy(col(col_timestamp).desc())

    items_for_user = (
        dataframe.select(
            col_user, col_item, col_rating, row_number().over(window_spec).alias("rank")
        )
        .where(col("rank") <= k)
        .withColumn(col_prediction, F.collect_list(col_item).over(Window.partitionBy(col_user)))
        .select(col_user, col_prediction)
        .dropDuplicates([col_user, col_prediction])
    )

    return items_for_user 
Example #10
Source File: internal.py    From koalas with Apache License 2.0
def attach_sequence_column(sdf, column_name):
        scols = [scol_for(sdf, column) for column in sdf.columns]
        sequential_index = F.row_number().over(Window.orderBy(F.monotonically_increasing_id())) - 1
        return sdf.select(sequential_index.alias(column_name), *scols) 
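A standalone sketch of the same trick with a toy DataFrame (names are illustrative): row_number() over monotonically_increasing_id() yields a gap-free, 0-based sequence, at the cost of funnelling all rows through a single partition for the unpartitioned Window.

from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()

sdf = spark.createDataFrame([("a",), ("b",), ("c",)], ["letter"])

# 0-based, gap-free sequence; use with care on large data because the
# unpartitioned Window moves everything to one partition.
sequence = F.row_number().over(Window.orderBy(F.monotonically_increasing_id())) - 1
sdf.select(sequence.alias("row_id"), "letter").show()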
Example #11
Source File: groupby.py    From koalas with Apache License 2.0
def nsmallest(self, n=5):
        """
        Return the first n rows ordered by columns in ascending order in group.

        Return the first n rows with the smallest values in columns, in ascending order.
        The columns that are not specified are returned as well, but not used for ordering.

        Parameters
        ----------
        n : int
            Number of items to retrieve.

        See Also
        --------
        databricks.koalas.Series.nsmallest
        databricks.koalas.DataFrame.nsmallest

        Examples
        --------
        >>> df = ks.DataFrame({'a': [1, 1, 1, 2, 2, 2, 3, 3, 3],
        ...                    'b': [1, 2, 2, 2, 3, 3, 3, 4, 4]}, columns=['a', 'b'])

        >>> df.groupby(['a'])['b'].nsmallest(1).sort_index()  # doctest: +NORMALIZE_WHITESPACE
        a
        1  0    1
        2  3    2
        3  6    3
        Name: b, dtype: int64
        """
        if len(self._kdf._internal.index_names) > 1:
            raise ValueError("nsmallest do not support multi-index now")

        sdf = self._kdf._internal.spark_frame
        name = self._agg_columns[0]._internal.data_spark_column_names[0]
        window = Window.partitionBy(self._groupkeys_scols).orderBy(
            self._agg_columns[0].spark.column, NATURAL_ORDER_COLUMN_NAME
        )
        sdf = sdf.withColumn("rank", F.row_number().over(window)).filter(F.col("rank") <= n)

        internal = InternalFrame(
            spark_frame=sdf.drop(NATURAL_ORDER_COLUMN_NAME),
            index_map=OrderedDict(
                [
                    (s._internal.data_spark_column_names[0], s._internal.column_labels[0])
                    for s in self._groupkeys
                ]
                + list(self._kdf._internal.index_map.items())
            ),
            data_spark_columns=[scol_for(sdf, name)],
        )
        return first_series(DataFrame(internal))

    # TODO: add keep parameter 
Example #12
Source File: spark_splitters.py    From azure-python-labs with MIT License
def spark_timestamp_split(
    data,
    ratio=0.75,
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_timestamp=DEFAULT_TIMESTAMP_COL,
):
    """Spark timestamp based splitter
    The splitter splits the data into sets by timestamps without stratification on either
    user or item.
    The ratios are applied on the timestamp column which is divided accordingly into
    several partitions.

    Args:
        data (spark.DataFrame): Spark DataFrame to be split.
        ratio (float or list): Ratio for splitting data. If it is a single float number
            it splits data into two sets and the ratio argument indicates the ratio of
            training data set; if it is a list of float numbers, the splitter splits
            data into several portions corresponding to the split ratios. If a list is
            provided and the ratios are not summed to 1, they will be normalized.
            Earlier indexed splits will have earlier times
            (e.g. the latest time in split[0] <= the earliest time in split[1]).
        col_user (str): column name of user IDs.
        col_item (str): column name of item IDs.
        col_timestamp (str): column name of timestamps, given as float seconds
            since the Unix epoch.

    Returns:
        list: Splits of the input data as spark.DataFrame.
    """
    multi_split, ratio = process_split_ratio(ratio)

    ratio = ratio if multi_split else [ratio, 1 - ratio]
    ratio_index = np.cumsum(ratio)

    window_spec = Window.orderBy(col(col_timestamp))
    rating = data.withColumn("rank", row_number().over(window_spec))

    data_count = rating.count()
    rating_rank = rating.withColumn("rank", row_number().over(window_spec) / data_count)

    splits = []
    for i, _ in enumerate(ratio_index):
        if i == 0:
            rating_split = rating_rank.filter(col("rank") <= ratio_index[i]).drop(
                "rank"
            )
        else:
            rating_split = rating_rank.filter(
                (col("rank") <= ratio_index[i]) & (col("rank") > ratio_index[i - 1])
            ).drop("rank")

        splits.append(rating_split)

    return splits 
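A standalone sketch of the normalization step with toy timestamps (values are made up): dividing row_number() by the total row count gives each row a fractional position in (0, 1], which is then thresholded to form a chronological train/test split. pyspark.sql.functions.percent_rank() offers a similar, though not identical, normalization.

from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import col, row_number

spark = SparkSession.builder.getOrCreate()

data = spark.createDataFrame(
    [(1, 100.0), (2, 200.0), (3, 300.0), (4, 400.0)],
    ["user", "timestamp"],
)

window_spec = Window.orderBy(col("timestamp"))
# Fractional position of each row in chronological order.
ranked = data.withColumn("rank", row_number().over(window_spec) / data.count())

train = ranked.filter(col("rank") <= 0.75).drop("rank")
test = ranked.filter(col("rank") > 0.75).drop("rank")
train.show()
test.show()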
Example #13
Source File: series.py    From koalas with Apache License 2.0
def _rank(self, method="average", ascending=True, part_cols=()):
        if method not in ["average", "min", "max", "first", "dense"]:
            msg = "method must be one of 'average', 'min', 'max', 'first', 'dense'"
            raise ValueError(msg)

        if len(self._internal.index_spark_column_names) > 1:
            raise ValueError("rank do not support index now")

        if ascending:
            asc_func = lambda scol: scol.asc()
        else:
            asc_func = lambda scol: scol.desc()

        if method == "first":
            window = (
                Window.orderBy(
                    asc_func(self.spark.column), asc_func(F.col(NATURAL_ORDER_COLUMN_NAME)),
                )
                .partitionBy(*part_cols)
                .rowsBetween(Window.unboundedPreceding, Window.currentRow)
            )
            scol = F.row_number().over(window)
        elif method == "dense":
            window = (
                Window.orderBy(asc_func(self.spark.column))
                .partitionBy(*part_cols)
                .rowsBetween(Window.unboundedPreceding, Window.currentRow)
            )
            scol = F.dense_rank().over(window)
        else:
            if method == "average":
                stat_func = F.mean
            elif method == "min":
                stat_func = F.min
            elif method == "max":
                stat_func = F.max
            window1 = (
                Window.orderBy(asc_func(self.spark.column))
                .partitionBy(*part_cols)
                .rowsBetween(Window.unboundedPreceding, Window.currentRow)
            )
            window2 = Window.partitionBy([self.spark.column] + list(part_cols)).rowsBetween(
                Window.unboundedPreceding, Window.unboundedFollowing
            )
            scol = stat_func(F.row_number().over(window1)).over(window2)
        kser = self._with_new_scol(scol).rename(self.name)
        return kser.astype(np.float64) 
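For comparison with the tie handling implemented above, a standalone sketch of Spark's built-in ranking functions on toy data: row_number() breaks ties arbitrarily (the "first" method), rank() gives ties the lowest shared rank and leaves gaps (equivalent to the "min" method), and dense_rank() shares ranks without gaps (the "dense" method).

from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()

sdf = spark.createDataFrame([(v,) for v in [10, 20, 20, 30]], ["value"])

w = Window.orderBy("value")
sdf.select(
    "value",
    F.row_number().over(w).alias("first"),   # ties broken arbitrarily
    F.rank().over(w).alias("min"),           # ties share the lowest rank, gaps follow
    F.dense_rank().over(w).alias("dense"),   # ties share a rank, no gaps
).show()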
Example #14
Source File: groupby.py    From koalas with Apache License 2.0
def nlargest(self, n=5):
        """
        Return the first n rows ordered by columns in descending order in group.

        Return the first n rows with the largest values in columns, in descending order.
        The columns that are not specified are returned as well, but not used for ordering.

        Parameters
        ----------
        n : int
            Number of items to retrieve.

        See Also
        --------
        databricks.koalas.Series.nlargest
        databricks.koalas.DataFrame.nlargest

        Examples
        --------
        >>> df = ks.DataFrame({'a': [1, 1, 1, 2, 2, 2, 3, 3, 3],
        ...                    'b': [1, 2, 2, 2, 3, 3, 3, 4, 4]}, columns=['a', 'b'])

        >>> df.groupby(['a'])['b'].nlargest(1).sort_index()  # doctest: +NORMALIZE_WHITESPACE
        a
        1  1    2
        2  4    3
        3  7    4
        Name: b, dtype: int64
        """
        if len(self._kdf._internal.index_names) > 1:
            raise ValueError("nlargest do not support multi-index now")

        sdf = self._kdf._internal.spark_frame
        name = self._agg_columns[0]._internal.data_spark_column_names[0]
        window = Window.partitionBy(self._groupkeys_scols).orderBy(
            self._agg_columns[0].spark.column.desc(), NATURAL_ORDER_COLUMN_NAME
        )
        sdf = sdf.withColumn("rank", F.row_number().over(window)).filter(F.col("rank") <= n)

        internal = InternalFrame(
            spark_frame=sdf.drop(NATURAL_ORDER_COLUMN_NAME),
            index_map=OrderedDict(
                [
                    (s._internal.data_spark_column_names[0], s._internal.column_labels[0])
                    for s in self._groupkeys
                ]
                + list(self._kdf._internal.index_map.items())
            ),
            data_spark_columns=[scol_for(sdf, name)],
        )
        return first_series(DataFrame(internal))

    # TODO: add bins, normalize parameter 
Example #15
Source File: metrics.py    From search-MjoLniR with MIT License
def ndcg(df, k, label_col='label', position_col='hit_position', wiki_col='wikiid',
         query_cols=['wikiid', 'query', 'session_id']):
    """
    Calculate ndcg@k for the provided dataframe

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Input dataframe to calculate against
    k : int
        Cutoff for ndcg calculation
    label_col : str
        Column name containing integer label, higher is better, of the hit
    position_col : str
        Column name containing order displayed to user, lowest first, of the hit
    wiki_col : str
        Column name containing the wiki identifier; ndcg@k is averaged per wiki
    query_cols : list of str
        Column names to group by, which indicate a unique query displayed to a user

    Returns
    -------
    dict
        Mapping from wiki identifier to the mean ndcg@k for that wiki, always between 0 and 1
    """
    if wiki_col not in query_cols:
        query_cols = query_cols + [wiki_col]

    # ideal results per labels
    w = Window.partitionBy(*query_cols).orderBy(F.col(label_col).desc())
    topAtK = (
        df
        .select(label_col, *query_cols)
        .withColumn('rn', F.row_number().over(w))
        .where(F.col('rn') <= k)
        .groupBy(*query_cols)
        .agg(F.collect_list(F.struct(label_col, 'rn')).alias('topAtK')))
    # top k results shown to user
    w = Window.partitionBy(*query_cols).orderBy(F.col(position_col).asc())
    predictedTopAtK = (
        df
        .select(label_col, position_col, *query_cols)
        .withColumn('rn', F.row_number().over(w))
        .where(F.col('rn') <= k)
        .groupBy(*query_cols)
        .agg(F.collect_list(F.struct(label_col, 'rn')).alias('predictedTopAtK')))
    return {row[wiki_col]: row.ndcgAtK for row in topAtK
            .join(predictedTopAtK, query_cols, how='inner')
            .select(wiki_col, _ndcg_at(k, label_col)('predictedTopAtK', 'topAtK').alias('ndcgAtK'))
            .groupBy(wiki_col)
            .agg(F.mean('ndcgAtK').alias('ndcgAtK'))
            .collect()} 
Example #16
Source File: window.py    From koalas with Apache License 2.0
def count(self):
        """
        The expanding count of any non-NaN observations inside the window.

        .. note:: the current implementation of this API uses Spark's Window without
            specifying a partition specification. This moves all data into a single
            partition on a single machine and could cause serious performance
            degradation. Avoid this method against very large datasets.

        Returns
        -------
        Series or DataFrame
            Returned object type is determined by the caller of the expanding
            calculation.

        See Also
        --------
        Series.expanding : Calling object with Series data.
        DataFrame.expanding : Calling object with DataFrames.
        Series.count : Count of the full Series.
        DataFrame.count : Count of the full DataFrame.

        Examples
        --------
        >>> s = ks.Series([2, 3, float("nan"), 10])
        >>> s.expanding().count()
        0    1.0
        1    2.0
        2    2.0
        3    3.0
        Name: 0, dtype: float64

        >>> s.to_frame().expanding().count()
             0
        0  1.0
        1  2.0
        2  2.0
        3  3.0
        """

        def count(scol):
            return F.when(
                F.row_number().over(self._unbounded_window) >= self._min_periods,
                F.count(scol).over(self._window),
            ).otherwise(F.lit(None))

        return self._apply_as_series_or_frame(count).astype("float64") 
Example #17
Source File: candidate_sets.py    From listenbrainz-server with GNU General Public License v2.0
def get_top_similar_artists(top_artists_df, artists_relation_df, similar_artist_limit):
    """ Get artists similar to top artists.

        Args:
            top_artists_df: Dataframe containing top artists listened to by users
            artists_relation_df: Dataframe containing artists and similar artists.
                                For columns refer to artist_relation_schema in listenbrainz_spark/schema.py.
            similar_artist_limit (int): number of similar artists to calculate

        Returns:
            top_similar_artists_df (dataframe): Top Z artists similar to top artists where
                                                Z = SIMILAR_ARTISTS_LIMIT.
    """
    condition = [top_artists_df.mb_artist_credit_id == artists_relation_df.id_0]

    df1 = top_artists_df.join(artists_relation_df, condition, 'inner') \
                        .select(col('id_0').alias('top_artist_credit_id'),
                                col('name_0').alias('top_artist_name'),
                                col('id_1').alias('similar_artist_credit_id'),
                                col('name_1').alias('similar_artist_name'),
                                'score',
                                'user_name')

    condition = [top_artists_df.mb_artist_credit_id == artists_relation_df.id_1]

    df2 = top_artists_df.join(artists_relation_df, condition, 'inner') \
                        .select(col('id_1').alias('top_artist_credit_id'),
                                col('name_1').alias('top_artist_name'),
                                col('id_0').alias('similar_artist_credit_id'),
                                col('name_0').alias('similar_artist_name'),
                                'score',
                                'user_name')

    similar_artists_df = df1.union(df2)

    window = Window.partitionBy('top_artist_credit_id', 'user_name')\
                   .orderBy(col('score').desc())

    top_similar_artists_df = similar_artists_df.withColumn('rank', row_number().over(window)) \
                                               .where(col('rank') <= similar_artist_limit)\
                                               .select('top_artist_credit_id', 'top_artist_name',
                                                       'similar_artist_credit_id', 'similar_artist_name',
                                                       'score', 'user_name')

    return top_similar_artists_df 
Example #18
Source File: compiler.py    From ibis with Apache License 2.0
def compile_row_number(t, expr, scope, *, window, **kwargs):
    return F.row_number().over(window).cast('long') - 1

