Python pyspark.sql.functions.collect_list() Examples

The following are 19 code examples showing how to use pyspark.sql.functions.collect_list(). They are extracted from open source projects; the project, author, file, and license are noted above each example.

You may also want to check out the other available functions and classes of the pyspark.sql.functions module.
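
For a quick sense of what collect_list() does before diving into the project examples: it aggregates the values of a column into an array per group, keeping duplicates (unlike collect_set()) and giving no guarantee about element order. A minimal sketch, assuming a local SparkSession and a made-up two-column DataFrame:

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.master("local[*]").appName("collect_list_demo").getOrCreate()

# Hypothetical user/item data, for illustration only.
df = spark.createDataFrame(
    [("alice", "a"), ("alice", "b"), ("bob", "a"), ("alice", "a")],
    ["user", "item"],
)

# Collect every item per user into a single array column.
# Duplicates are kept and element order is not guaranteed.
result = df.groupBy("user").agg(F.collect_list("item").alias("items"))
result.show(truncate=False)
# One possible output (element order may vary):
# +-----+---------+
# |user |items    |
# +-----+---------+
# |alice|[a, b, a]|
# |bob  |[a]      |
# +-----+---------+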

Example 1
Project: azure-python-labs   Author: Azure-Samples   File: spark_evaluation.py    License: MIT License
def _calculate_metrics(self):
        """Calculate ranking metrics."""
        self._items_for_user_pred = self.rating_pred

        self._items_for_user_true = (
            self.rating_true
            .groupBy(self.col_user)
            .agg(expr("collect_list(" + self.col_item + ") as ground_truth"))
            .select(self.col_user, "ground_truth")
        )

        self._items_for_user_all = self._items_for_user_pred.join(
            self._items_for_user_true, on=self.col_user
        ).drop(self.col_user)

        return RankingMetrics(self._items_for_user_all.rdd) 
Example 2
Project: search-MjoLniR   Author: wikimedia   File: metrics.py    License: MIT License
def _ndcg_at(k, label_col):
    def ndcg_at_k(predicted, actual):
        # TODO: Taking in rn and then re-sorting might not be necessary, but I can't
        # find any real guarantee that they would come in order after a groupBy + collect_list,
        # since they were only ordered within the window function.
        predicted = [row[label_col] for row in sorted(predicted, key=lambda r: r.rn)]
        actual = [row[label_col] for row in sorted(actual, key=lambda r: r.rn)]
        dcg = 0.
        for i, label in enumerate(predicted):
            # This form is used to match EvalNDCG in xgboost
            dcg += ((1 << label) - 1) / math.log(i + 2.0, 2)
        idcg = 0.
        for i, label in enumerate(actual):
            idcg += ((1 << label) - 1) / math.log(i + 2.0, 2)
        if idcg == 0:
            return 0.
        else:
            return dcg / idcg
    return F.udf(ndcg_at_k, pyspark.sql.types.DoubleType()) 
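
The TODO comment above touches on a real caveat: collect_list() after a groupBy does not guarantee element order, which is why the UDF re-sorts by rn. A hedged sketch of one common workaround, not taken from the original project and using assumed names (df, query_id, rn, label): collect (rn, label) structs and sort the array, so the rows arrive pre-ordered and the UDF no longer needs to sort.

from pyspark.sql import functions as F

# 'df', 'query_id', 'rn' and 'label' are illustrative names only.
# sort_array orders an array of structs by the first struct field,
# so putting 'rn' first yields the hits in display order.
ordered = (
    df
    .groupBy("query_id")
    .agg(F.sort_array(F.collect_list(F.struct("rn", "label"))).alias("hits"))
)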
Example 3
Project: azure-python-labs   Author: Azure-Samples   File: spark_evaluation.py    License: MIT License
def _calculate_metrics(self):
        """Calculate ranking metrics."""
        self._items_for_user_pred = self.rating_pred

        self._items_for_user_true = (
            self.rating_true
            .groupBy(self.col_user)
            .agg(expr("collect_list(" + self.col_item + ") as ground_truth"))
            .select(self.col_user, "ground_truth")
        )

        self._items_for_user_all = self._items_for_user_pred.join(
            self._items_for_user_true, on=self.col_user
        ).drop(self.col_user)

        return RankingMetrics(self._items_for_user_all.rdd) 
Example 4
Project: search-MjoLniR   Author: wikimedia   File: norm_query_clustering.py    License: MIT License
def cluster_within_norm_query_groups(df: DataFrame) -> DataFrame:
    make_groups = F.udf(_make_query_groups, T.ArrayType(T.StructType([
        T.StructField('query', T.StringType(), nullable=False),
        T.StructField('norm_query_group_id', T.IntegerType(), nullable=False),
    ])))
    return (
        df
        .groupBy('wikiid', 'norm_query')
        .agg(F.collect_list(F.struct('query', 'hit_page_ids')).alias('source'))
        .select(
            'wikiid', 'norm_query',
            F.explode(make_groups('source')).alias('group'))
        .select('wikiid', 'norm_query', 'group.query', 'group.norm_query_group_id')) 
Example 5
Project: search-MjoLniR   Author: wikimedia   File: norm_query_clustering.py    License: MIT License
def with_unique_cluster_id(df: DataFrame) -> DataFrame:
    return (
        df
        .groupby('wikiid', 'norm_query', 'norm_query_group_id')
        .agg(F.collect_list('query').alias('queries'))
        .select(
            'wikiid', 'queries',
            F.monotonically_increasing_id().alias('cluster_id'))
        .select('wikiid', F.explode('queries').alias('query'), 'cluster_id')) 
Example 6
Project: ibis   Author: ibis-project   File: compiler.py    License: Apache License 2.0
def compile_group_concat(t, expr, scope, context=None, **kwargs):
    sep = expr.op().sep.op().value

    def fn(col):
        return F.concat_ws(sep, F.collect_list(col))

    return compile_aggregator(t, expr, scope, fn, context) 
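
For reference, the expression this compiler rule builds is what you would write by hand in PySpark to emulate a SQL group_concat. A minimal sketch with assumed names (df, user, item):

from pyspark.sql import functions as F

# Join each user's collected items into one comma-separated string,
# mirroring the concat_ws-over-collect_list expression above.
grouped = df.groupBy("user").agg(
    F.concat_ws(",", F.collect_list("item")).alias("items_csv")
)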
Example 7
Project: ibis   Author: ibis-project   File: compiler.py    License: Apache License 2.0
def compile_array_collect(t, expr, scope, **kwargs):
    op = expr.op()

    src_column = t.translate(op.arg, scope)
    return F.collect_list(src_column)


# --------------------------- Null Operations ----------------------------- 
Example 8
Project: listenbrainz-server   Author: metabrainz   File: listening_activity.py    License: GNU General Public License v2.0
def get_listening_activity():
    """ Calculate number of listens for each user in time ranges given in the 'time_range' table """
    # Calculate the number of listens in each time range for each user except the time ranges which have zero listens.
    result_without_zero_days = run_query("""
            SELECT listens.user_name
                 , time_range.time_range
                 , count(listens.user_name) as listen_count
              FROM listens
              JOIN time_range
                ON listens.listened_at >= time_range.start
               AND listens.listened_at <= time_range.end
          GROUP BY listens.user_name
                 , time_range.time_range
            """)
    result_without_zero_days.createOrReplaceTempView('result_without_zero_days')

    # Add the time ranges which have zero listens to the previous dataframe
    result = run_query("""
            SELECT dist_user_name.user_name
                 , time_range.time_range
                 , to_unix_timestamp(time_range.start) as from_ts
                 , to_unix_timestamp(time_range.end) as to_ts
                 , ifnull(result_without_zero_days.listen_count, 0) as listen_count
              FROM (SELECT DISTINCT user_name FROM listens) dist_user_name
        CROSS JOIN time_range
         LEFT JOIN result_without_zero_days
                ON result_without_zero_days.user_name = dist_user_name.user_name
               AND result_without_zero_days.time_range = time_range.time_range
            """)

    # Create a table with a list of time ranges and corresponding listen count for each user
    iterator = result \
        .withColumn("listening_activity", struct("from_ts", "to_ts", "listen_count", "time_range")) \
        .groupBy("user_name") \
        .agg(sort_array(collect_list("listening_activity")).alias("listening_activity")) \
        .toLocalIterator()

    return iterator 
Example 9
Project: listenbrainz-server   Author: metabrainz   File: listening_activity.py    License: GNU General Public License v2.0
def get_listening_activity_all_time() -> Iterator[Optional[UserListeningActivityStatMessage]]:
    """ Calculate the number of listens for an user in each year starting from LAST_FM_FOUNDING_YEAR (2002). """
    current_app.logger.debug("Calculating listening_activity_all_time")

    to_date = get_latest_listen_ts()
    from_date = datetime(LAST_FM_FOUNDING_YEAR, 1, 1)

    result_without_zero_years = None
    for year in range(from_date.year, to_date.year+1):
        year_start = datetime(year, 1, 1)
        year_end = get_year_end(year)
        try:
            _get_listens(year_start, year_end)
        except HDFSException:
            # Skip if no listens present in df
            continue
        year_df = run_query("""
                    SELECT user_name,
                           count(user_name) as listen_count
                      FROM listens
                  GROUP BY user_name
                  """)
        year_df = year_df.withColumn('time_range', lit(str(year))).withColumn(
            'from_ts', lit(year_start.timestamp())).withColumn('to_ts', lit(year_end.timestamp()))
        result_without_zero_years = result_without_zero_years.union(year_df) if result_without_zero_years else year_df

    # Create a table with a list of time ranges and corresponding listen count for each user
    data = result_without_zero_years \
        .withColumn("listening_activity", struct("from_ts", "to_ts", "listen_count", "time_range")) \
        .groupBy("user_name") \
        .agg(sort_array(collect_list("listening_activity")).alias("listening_activity")) \
        .toLocalIterator()

    messages = create_messages(data=data, stats_range='all_time', from_ts=from_date.timestamp(), to_ts=to_date.timestamp())

    current_app.logger.debug("Done!")

    return messages 
Example 10
Project: azure-python-labs   Author: Azure-Samples   File: spark_evaluation.py    License: MIT License
def _get_top_k_items(
        dataframe,
        col_user=DEFAULT_USER_COL,
        col_item=DEFAULT_ITEM_COL,
        col_rating=DEFAULT_RATING_COL,
        col_prediction=PREDICTION_COL,
        k=DEFAULT_K
):
    """Get the input customer-item-rating tuple in the format of Spark
    DataFrame, output a Spark DataFrame in the dense format of top k items
    for each user.
    NOTE: if it is implicit rating, just append a column of constants to be ratings.

    Args:
        dataframe (spark.DataFrame): DataFrame of rating data (in the format of
        customerID-itemID-rating tuple).
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_prediction (str): column name for prediction.
        k (int): number of items for each user.

    Return:
        spark.DataFrame: DataFrame of top k items for each user.
    """
    window_spec = Window.partitionBy(col_user).orderBy(col(col_rating).desc())

    # this does not work deterministically for ratings of the same value (ties).
    items_for_user = (
        dataframe.select(
            col_user,
            col_item,
            col_rating,
            row_number().over(window_spec).alias("rank")
        )
        .where(col("rank") <= k)
        .groupby(col_user)
        .agg(F.collect_list(col_item).alias(col_prediction))
    )

    return items_for_user 
Example 11
Project: azure-python-labs   Author: Azure-Samples   File: spark_evaluation.py    License: MIT License
def _get_relevant_items_by_threshold(
        dataframe,
        col_user=DEFAULT_USER_COL,
        col_item=DEFAULT_ITEM_COL,
        col_rating=DEFAULT_RATING_COL,
        col_prediction=PREDICTION_COL,
        threshold=DEFAULT_THRESHOLD
):
    """Get relevant items for each customer in the input rating data.

    Relevant items are defined as those with ratings above a certain threshold.
    The threshold is defined as a statistical measure of the ratings for a
    user, e.g., median.

    Args:
        dataframe: Spark DataFrame of customerID-itemID-rating tuples.
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_prediction (str): column name for prediction.
        threshold (float): threshold for determining the relevant recommended items.
            This is used for the case that predicted ratings follow a known
            distribution.

    Return:
        spark.DataFrame: DataFrame of customerID-itemID-rating tuples with only relevant
            items.
    """
    items_for_user = (
        dataframe
        .orderBy(col_rating, ascending=False)
        .where(col_rating + " >= " + str(threshold))
        .select(
            col_user, col_item, col_rating
        )
        .withColumn(col_prediction, F.collect_list(col_item).over(Window.partitionBy(col_user)))
        .select(col_user, col_prediction)
        .dropDuplicates()
    )

    return items_for_user 
Example 12
Project: azure-python-labs   Author: Azure-Samples   File: spark_evaluation.py    License: MIT License
def _get_relevant_items_by_timestamp(
        dataframe,
        col_user=DEFAULT_USER_COL,
        col_item=DEFAULT_ITEM_COL,
        col_rating=DEFAULT_RATING_COL,
        col_timestamp=DEFAULT_TIMESTAMP_COL,
        col_prediction=PREDICTION_COL,
        k=DEFAULT_K
):
    """Get relevant items for each customer defined by timestamp.

    Relevant items are defined as the k items that appear most recently
    according to their timestamps.

    Args:
        dataframe (spark.DataFrame): A Spark DataFrame of customerID-itemID-rating-timeStamp
            tuples.
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_timestamp (str): column name for timestamp.
        col_prediction (str): column name for prediction.
        k (int): number of relevant items to be filtered by the function.

    Return:
        spark.DataFrame: DataFrame of customerID-itemID-rating tuples with only relevant items.
    """
    window_spec = Window.partitionBy(col_user).orderBy(col(col_timestamp).desc())

    items_for_user = (
        dataframe.select(
            col_user, col_item, col_rating, row_number().over(window_spec).alias("rank")
        )
        .where(col("rank") <= k)
        .withColumn(col_prediction, F.collect_list(col_item).over(Window.partitionBy(col_user)))
        .select(col_user, col_prediction)
        .dropDuplicates([col_user, col_prediction])
    )

    return items_for_user 
Example 13
Project: azure-python-labs   Author: Azure-Samples   File: spark_evaluation.py    License: MIT License
def _get_relevant_items_by_threshold(
        dataframe,
        col_user=DEFAULT_USER_COL,
        col_item=DEFAULT_ITEM_COL,
        col_rating=DEFAULT_RATING_COL,
        col_prediction=PREDICTION_COL,
        threshold=DEFAULT_THRESHOLD
):
    """Get relevant items for each customer in the input rating data.

    Relevant items are defined as those with ratings above a certain threshold.
    The threshold is defined as a statistical measure of the ratings for a
    user, e.g., median.

    Args:
        dataframe: Spark DataFrame of customerID-itemID-rating tuples.
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_prediction (str): column name for prediction.
        threshold (float): threshold for determining the relevant recommended items.
            This is used for the case that predicted ratings follow a known
            distribution.

    Return:
        spark.DataFrame: DataFrame of customerID-itemID-rating tuples with only relevant
            items.
    """
    items_for_user = (
        dataframe
        .orderBy(col_rating, ascending=False)
        .where(col_rating + " >= " + str(threshold))
        .select(
            col_user, col_item, col_rating
        )
        .withColumn(col_prediction, F.collect_list(col_item).over(Window.partitionBy(col_user)))
        .select(col_user, col_prediction)
        .dropDuplicates()
    )

    return items_for_user 
Example 14
Project: azure-python-labs   Author: Azure-Samples   File: spark_evaluation.py    License: MIT License
def _get_relevant_items_by_timestamp(
        dataframe,
        col_user=DEFAULT_USER_COL,
        col_item=DEFAULT_ITEM_COL,
        col_rating=DEFAULT_RATING_COL,
        col_timestamp=DEFAULT_TIMESTAMP_COL,
        col_prediction=PREDICTION_COL,
        k=DEFAULT_K
):
    """Get relevant items for each customer defined by timestamp.

    Relevant items are defined as the k items that appear most recently
    according to their timestamps.

    Args:
        dataframe (spark.DataFrame): A Spark DataFrame of customerID-itemID-rating-timeStamp
            tuples.
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_timestamp (str): column name for timestamp.
        col_prediction (str): column name for prediction.
        k (int): number of relevant items to be filtered by the function.

    Return:
        spark.DataFrame: DataFrame of customerID-itemID-rating tuples with only relevant items.
    """
    window_spec = Window.partitionBy(col_user).orderBy(col(col_timestamp).desc())

    items_for_user = (
        dataframe.select(
            col_user, col_item, col_rating, row_number().over(window_spec).alias("rank")
        )
        .where(col("rank") <= k)
        .withColumn(col_prediction, F.collect_list(col_item).over(Window.partitionBy(col_user)))
        .select(col_user, col_prediction)
        .dropDuplicates([col_user, col_prediction])
    )

    return items_for_user 
Example 15
Project: search-MjoLniR   Author: wikimedia   File: metrics.py    License: MIT License
def ndcg(df, k, label_col='label', position_col='hit_position', wiki_col='wikiid',
         query_cols=['wikiid', 'query', 'session_id']):
    """
    Calculate ndcg@k for the provided dataframe

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Input dataframe to calculate against
    k : int
        Cutoff for ndcg calculation
    label_col : str
        Column name containing integer label, higher is better, of the hit
    position_col : str
        Column name containing order displayed to user, lowest first, of the hit
    wiki_col : str
        Column name identifying the wiki; results are reported per wiki
    query_cols : list of str
        Column names to group by, which indicate a unique query displayed to a user

    Returns
    -------
    dict
        Mapping from each wiki to its mean ndcg@k value, each between 0 and 1
    """
    if wiki_col not in query_cols:
        query_cols = query_cols + [wiki_col]

    # ideal results per labels
    w = Window.partitionBy(*query_cols).orderBy(F.col(label_col).desc())
    topAtK = (
        df
        .select(label_col, *query_cols)
        .withColumn('rn', F.row_number().over(w))
        .where(F.col('rn') <= k)
        .groupBy(*query_cols)
        .agg(F.collect_list(F.struct(label_col, 'rn')).alias('topAtK')))
    # top k results shown to user
    w = Window.partitionBy(*query_cols).orderBy(F.col(position_col).asc())
    predictedTopAtK = (
        df
        .select(label_col, position_col, *query_cols)
        .withColumn('rn', F.row_number().over(w))
        .where(F.col('rn') <= k)
        .groupBy(*query_cols)
        .agg(F.collect_list(F.struct(label_col, 'rn')).alias('predictedTopAtK')))
    return {row[wiki_col]: row.ndcgAtK for row in topAtK
            .join(predictedTopAtK, query_cols, how='inner')
            .select(wiki_col, _ndcg_at(k, label_col)('predictedTopAtK', 'topAtK').alias('ndcgAtK'))
            .groupBy(wiki_col)
            .agg(F.mean('ndcgAtK').alias('ndcgAtK'))
            .collect()} 
Example 16
Project: listenbrainz-server   Author: metabrainz   File: release.py    License: GNU General Public License v2.0
def get_releases(table):
    """
    Get release information (release_name, release_mbid etc) for every user
    ordered by listen count (number of times a user has listened to tracks
    which belong to a particular release).

    Args:
        table: name of the temporary table

    Returns:
        iterator (iter): an iterator over result
                {
                    'user1' : [{
                        'release_name': str
                        'release_msid': str,
                        'release_mbid': str,
                        'artist_name': str,
                        'artist_msid': str,
                        'artist_mbids': list(str),
                        'listen_count': int
                    }],
                    'user2' : [{...}],
                }
    """
    result = run_query("""
            SELECT user_name
                 , nullif(release_name, '') as release_name
                 , CASE
                     WHEN release_mbid IS NOT NULL AND release_mbid != '' THEN NULL
                     ELSE nullif(release_msid, '')
                   END as release_msid
                 , nullif(release_mbid, '') as release_mbid
                 , artist_name
                 , CASE
                     WHEN cardinality(artist_mbids) > 0 THEN NULL
                     ELSE nullif(artist_msid, '')
                   END as artist_msid
                 , artist_mbids
                 , count(release_name) as listen_count
              FROM {}
             WHERE release_name IS NOT NULL AND release_name != ''
          GROUP BY user_name
                 , release_name
                 , release_msid
                 , release_mbid
                 , artist_name
                 , artist_msid
                 , artist_mbids
        """.format(table))

    iterator = result \
        .withColumn("releases", struct("listen_count", "release_name", "release_msid", "release_mbid", "artist_name",
                                       "artist_msid", "artist_mbids")) \
        .groupBy("user_name") \
        .agg(sort_array(collect_list("releases"), asc=False).alias("releases")) \
        .toLocalIterator()

    return iterator 
Example 17
Project: listenbrainz-server   Author: metabrainz   File: artist.py    License: GNU General Public License v2.0
def get_artists(table):
    """ Get artist information (artist_name, artist_msid etc) for every user
        ordered by listen count

        Args:
            table (str): name of the temporary table.

        Returns:
            iterator (iter): an iterator over result
                    {
                        user1: [{
                            'artist_name': str,
                            'artist_msid': str,
                            'artist_mbids': list(str),
                            'listen_count': int
                        }],
                        user2: [{...}],
                    }
    """

    result = run_query("""
            SELECT user_name
                 , artist_name
                 , CASE
                     WHEN cardinality(artist_mbids) > 0 THEN NULL
                     ELSE nullif(artist_msid, '')
                   END as artist_msid
                 , artist_mbids
                 , count(artist_name) as listen_count
              FROM {table}
          GROUP BY user_name
                 , artist_name
                 , artist_msid
                 , artist_mbids
            """.format(table=table))

    iterator = result \
        .withColumn("artists", struct("listen_count", "artist_name", "artist_msid", "artist_mbids")) \
        .groupBy("user_name") \
        .agg(sort_array(collect_list("artists"), asc=False).alias("artists")) \
        .toLocalIterator()

    return iterator 
Example 18
Project: telemetry-airflow   Author: mozilla   File: taar_lite_guidguid.py    License: Mozilla Public License 2.0
def transform(longitudinal_addons):
    # Only for logging, not used, but may be interesting for later analysis.
    guid_set_unique = (
        longitudinal_addons.withColumn(
            "exploded", F.explode(longitudinal_addons.installed_addons)
        )
        .select("exploded")  # noqa: E501 - long lines
        .rdd.flatMap(lambda x: x)
        .distinct()
        .collect()
    )
    logging.info(
        "Number of unique guids co-installed in sample: " + str(len(guid_set_unique))
    )

    restructured = longitudinal_addons.rdd.flatMap(
        lambda x: key_all(x.installed_addons)
    ).toDF(["key_addon", "coinstalled_addons"])

    # Explode the list of co-installs and count pair occurrences.
    addon_co_installations = (
        restructured.select(
            "key_addon", F.explode("coinstalled_addons").alias("coinstalled_addon")
        )  # noqa: E501 - long lines
        .groupBy("key_addon", "coinstalled_addon")
        .count()
    )

    # Collect the set of coinstalled_addon, count pairs for each key_addon.
    combine_and_map_cols = F.udf(
        lambda x, y: (x, y),
        StructType([StructField("id", StringType()), StructField("n", LongType())]),
    )

    # Spark functions are sometimes long and unwieldy. Tough luck.
    # Ignore E128 and E501 long line errors
    addon_co_installations_collapsed = (
        addon_co_installations.select(  # noqa: E128
            "key_addon",
            combine_and_map_cols("coinstalled_addon", "count").alias(  # noqa: E501
                "id_n"
            ),
        )
        .groupby("key_addon")
        .agg(F.collect_list("id_n").alias("coinstallation_counts"))
    )
    logging.info(addon_co_installations_collapsed.printSchema())
    logging.info("Collecting final result of co-installations.")

    return addon_co_installations_collapsed 
Example 19
Project: python_mozetl   Author: mozilla   File: taar_lite_guidguid.py    License: MIT License
def transform(longitudinal_addons):
    # Only for logging, not used, but may be interesting for later analysis.
    guid_set_unique = (
        longitudinal_addons.withColumn(
            "exploded", F.explode(longitudinal_addons.installed_addons)
        )
        .select("exploded")  # noqa: E501 - long lines
        .rdd.flatMap(lambda x: x)
        .distinct()
        .collect()
    )
    logging.info(
        "Number of unique guids co-installed in sample: " + str(len(guid_set_unique))
    )

    restructured = longitudinal_addons.rdd.flatMap(
        lambda x: key_all(x.installed_addons)
    ).toDF(["key_addon", "coinstalled_addons"])

    # Explode the list of co-installs and count pair occurrences.
    addon_co_installations = (
        restructured.select(
            "key_addon", F.explode("coinstalled_addons").alias("coinstalled_addon")
        )  # noqa: E501 - long lines
        .groupBy("key_addon", "coinstalled_addon")
        .count()
    )

    # Collect the set of coinstalled_addon, count pairs for each key_addon.
    combine_and_map_cols = F.udf(
        lambda x, y: (x, y),
        StructType([StructField("id", StringType()), StructField("n", LongType())]),
    )

    # Spark functions are sometimes long and unwieldy. Tough luck.
    # Ignore E128 and E501 long line errors
    addon_co_installations_collapsed = (
        addon_co_installations.select(  # noqa: E128
            "key_addon",
            combine_and_map_cols("coinstalled_addon", "count").alias(  # noqa: E501
                "id_n"
            ),
        )
        .groupby("key_addon")
        .agg(F.collect_list("id_n").alias("coinstallation_counts"))
    )
    logging.info(addon_co_installations_collapsed.printSchema())
    logging.info("Collecting final result of co-installations.")

    return addon_co_installations_collapsed