Python pyspark.sql.functions.count() Examples

The following are 30 code examples of pyspark.sql.functions.count(), drawn from open source projects. Each example notes its source file, project, and license. You may also want to check out the other available functions and classes of the module pyspark.sql.functions.
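Before the project examples, here is a minimal, self-contained sketch of how pyspark.sql.functions.count() is typically used in an aggregation. The DataFrame, column names, and session settings below are illustrative assumptions, not taken from any of the projects that follow.

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.master("local[1]").appName("count_example").getOrCreate()

df = spark.createDataFrame(
    [("alice", "rock"), ("alice", None), ("bob", "jazz")],
    ["user_name", "genre"],
)

# F.count(col) counts only the non-null values of that column per group,
# while F.count("*") (or F.count(F.lit(1))) counts all rows, nulls included.
counts = df.groupBy("user_name").agg(
    F.count("genre").alias("non_null_genres"),
    F.count("*").alias("total_rows"),
)
counts.show()

For "alice" this yields non_null_genres = 1 and total_rows = 2, which mirrors the distinction several of the examples below rely on (for instance F.count(F.lit(1)) in the SMV helper and F.count("*") in the koalas plot code).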
Example #1
Source File: candidate_sets.py    From listenbrainz-server with GNU General Public License v2.0
def get_top_artists(mapped_listens_subset, top_artist_limit):
    """ Get top artists listened to by users who have a listening history in
        the past X days where X = RECOMMENDATION_GENERATION_WINDOW.

        Args:
            mapped_listens_subset (dataframe): A subset of mapped_df containing user listening history.
            top_artist_limit (int): number of top artists to calculate

        Returns:
            top_artists_df (dataframe): Top Y artists listened to by a user for all users where
                                        Y = TOP_ARTISTS_LIMIT
    """
    df = mapped_listens_subset.select('mb_artist_credit_id', 'msb_artist_credit_name_matchable', 'user_name') \
                              .groupBy('mb_artist_credit_id', 'msb_artist_credit_name_matchable', 'user_name') \
                              .agg(func.count('mb_artist_credit_id').alias('count'))

    window = Window.partitionBy('user_name').orderBy(col('count').desc())

    top_artists_df = df.withColumn('rank', row_number().over(window)) \
                       .where(col('rank') <= top_artist_limit) \
                       .select('mb_artist_credit_id', 'msb_artist_credit_name_matchable', 'user_name')

    return top_artists_df 
Example #2
Source File: groupby.py    From koalas with Apache License 2.0
def count(self):
        """
        Compute count of group, excluding missing values.

        See Also
        --------
        databricks.koalas.Series.groupby
        databricks.koalas.DataFrame.groupby

        Examples
        --------
        >>> df = ks.DataFrame({'A': [1, 1, 2, 1, 2],
        ...                    'B': [np.nan, 2, 3, 4, 5],
        ...                    'C': [1, 2, 1, 1, 2]}, columns=['A', 'B', 'C'])
        >>> df.groupby('A').count().sort_index()  # doctest: +NORMALIZE_WHITESPACE
            B  C
        A
        1  2  3
        2  2  2
        """
        return self._reduce_for_stat_function(F.count, only_numeric=False)

    # TODO: We should fix See Also when Series implementation is finished. 
Example #3
Source File: create_dataframes.py    From listenbrainz-server with GNU General Public License v2.0
def get_users_dataframe(mapped_listens_df, metadata):
    """ Prepare users dataframe

        Args:
            mapped_listens_df (dataframe): listens mapped with msid_mbid_mapping.

        Returns:
            users_df : Dataframe containing user names and user ids.
    """
    # We use a window function to assign a rank to each distinct user_name.
    # Note that if user_names were not distinct, ranks would repeat and give unexpected results.
    user_window = Window.orderBy('user_name')
    users_df = mapped_listens_df.select('user_name').distinct() \
                                .withColumn('user_id', rank().over(user_window))

    metadata['users_count'] = users_df.count()
    save_dataframe(users_df, path.USERS_DATAFRAME_PATH)
    return users_df 
Example #4
Source File: create_dataframes.py    From listenbrainz-server with GNU General Public License v2.0
def get_recordings_df(mapped_listens_df, metadata):
    """ Prepare recordings dataframe.

        Args:
            mapped_listens_df (dataframe): listens mapped with msid_mbid_mapping.

        Returns:
            recordings_df: Dataframe containing distinct recordings and corresponding
                mbids and names.
    """
    recording_window = Window.orderBy('mb_recording_mbid')

    recordings_df = mapped_listens_df.select('mb_artist_credit_id',
                                             'mb_artist_credit_mbids',
                                             'mb_recording_mbid',
                                             'mb_release_mbid',
                                             'msb_artist_credit_name_matchable',
                                             'track_name') \
                                     .distinct() \
                                     .withColumn('recording_id', rank().over(recording_window))

    metadata['recordings_count'] = recordings_df.count()
    save_dataframe(recordings_df, path.RECORDINGS_DATAFRAME_PATH)
    return recordings_df 
Example #5
Source File: plot.py    From koalas with Apache License 2.0
def _make_plot(self):
        # 'num_colors' requires calculating `shape`, which has to count all rows.
        # Use 1 for now to save the computation.
        colors = self._get_colors(num_colors=1)
        stacking_id = self._get_stacking_id()

        sdf = self.data._internal.spark_frame

        for i, label in enumerate(self.data._internal.column_labels):
            # 'y' is a Spark DataFrame that selects one column.
            y = sdf.select(self.data._internal.spark_column_for(label))
            ax = self._get_ax(i)

            kwds = self.kwds.copy()

            label = pprint_thing(label if len(label) > 1 else label[0])
            kwds["label"] = label

            style, kwds = self._apply_style_colors(colors, kwds, i, label)
            if style is not None:
                kwds["style"] = style

            kwds = self._make_plot_keywords(kwds, y)
            artists = self._plot(ax, y, column_num=i, stacking_id=stacking_id, **kwds)
            self._add_legend_handle(artists[0], label, index=i) 
Example #6
Source File: create_dataframes.py    From listenbrainz-server with GNU General Public License v2.0
def get_playcounts_df(listens_df, recordings_df, users_df, metadata):
    """ Prepare playcounts dataframe.

        Args:
            listens_df : Dataframe containing recording_mbids corresponding to a user.
            recordings_df : Dataframe containing distinct recordings and corresponding
                                       mbids and names.
            users_df : Dataframe containing user names and user ids.

        Returns:
            playcounts_df: Dataframe containing play(listen) counts of users.
    """
    # listens_df is joined with users_df on user_name.
    # The output is then joined with recordings_df on mb_recording_mbid.
    # The final step uses groupBy, which creates groups on user_id and recording_id and counts the number of recording_ids.
    # The final dataframe gives the number of times each user has listened to a particular track.
    playcounts_df = listens_df.join(users_df, 'user_name', 'inner') \
                              .join(recordings_df, 'mb_recording_mbid', 'inner') \
                              .groupBy('user_id', 'recording_id') \
                              .agg(func.count('recording_id').alias('count'))

    metadata['playcounts_count'] = playcounts_df.count()
    save_dataframe(playcounts_df, path.PLAYCOUNTS_DATAFRAME_PATH)
    return playcounts_df 
Example #7
Source File: window.py    From koalas with Apache License 2.0
def __init__(self, kdf_or_kser, window, min_periods=None):
        from databricks.koalas import DataFrame, Series

        if window < 0:
            raise ValueError("window must be >= 0")
        if (min_periods is not None) and (min_periods < 0):
            raise ValueError("min_periods must be >= 0")
        if min_periods is None:
            # TODO: 'min_periods' is not equivalent in pandas because it does not count NA as
            #  a value.
            min_periods = window

        if not isinstance(kdf_or_kser, (DataFrame, Series)):
            raise TypeError(
                "kdf_or_kser must be a series or dataframe; however, got: %s" % type(kdf_or_kser)
            )

        window = Window.orderBy(NATURAL_ORDER_COLUMN_NAME).rowsBetween(
            Window.currentRow - (window - 1), Window.currentRow
        )

        super(Rolling, self).__init__(kdf_or_kser, window, min_periods) 
Example #8
Source File: series.py    From koalas with Apache License 2.0
def count(self):
        """
        Return number of non-NA/null observations in the Series.

        Returns
        -------
        nobs : int

        Examples
        --------
        Constructing DataFrame from a dictionary:

        >>> df = ks.DataFrame({"Person":
        ...                    ["John", "Myla", "Lewis", "John", "Myla"],
        ...                    "Age": [24., np.nan, 21., 33, 26]})

        Notice the uncounted NA values:

        >>> df['Person'].count()
        5

        >>> df['Age'].count()
        4
        """
        return self._reduce_for_stat_function(Frame._count_expr, name="count") 
Example #9
Source File: dataframe.py    From sparklingpandas with Apache License 2.0
def stats(self, columns):
        """Compute the stats for each column provided in columns.
        Parameters
        ----------
        columns : list of str, contains all columns to compute stats on.
        """
        assert (not isinstance(columns, basestring)), "columns should be a " \
                                                      "list of strs,  " \
                                                      "not a str!"
        assert isinstance(columns, list), "columns should be a list!"

        from pyspark.sql import functions as F
        functions = [F.min, F.max, F.avg, F.count]
        aggs = list(
            self._flatmap(lambda column: map(lambda f: f(column), functions),
                          columns))
        return PStats(self.from_schema_rdd(self._schema_rdd.agg(*aggs))) 
Example #10
Source File: groupby.py    From sparklingpandas with Apache License 2.0
def count(self):
        """Compute the number of elements in each group."""
        if self._can_use_new_school():
            self._prep_spark_sql_groupby()
            import pyspark.sql.functions as func
            return self._use_aggregation(func.count)
        self._prep_pandas_groupby()
        myargs = self._myargs
        mykwargs = self._mykwargs

        def create_combiner(x):
            return x.groupby(*myargs, **mykwargs).count()

        def merge_value(x, y):
            return x.append(create_combiner(y)).count()

        def merge_combiner(x, y):
            return x.append(y).count(level=0)

        rddOfCounts = self._sortIfNeeded(self._distributedRDD.combineByKey(
            create_combiner,
            merge_value,
            merge_combiner)).values()
        return DataFrame.fromDataFrameRDD(rddOfCounts, self.sql_ctx) 
Example #11
Source File: indexes.py    From koalas with Apache License 2.0
def _summary(self, name=None):
        """
        Return a summarized representation.

        Parameters
        ----------
        name : str
            name to use in the summary representation

        Returns
        -------
        String with a summarized representation of the index
        """
        head, tail, total_count = self._internal.spark_frame.select(
            F.first(self.spark.column), F.last(self.spark.column), F.count(F.expr("*"))
        ).first()

        if total_count > 0:
            index_summary = ", %s to %s" % (pprint_thing(head), pprint_thing(tail))
        else:
            index_summary = ""

        if name is None:
            name = type(self).__name__
        return "%s: %s entries%s" % (name, total_count, index_summary) 
Example #12
Source File: helpers.py    From SMV with Apache License 2.0
def smvDupeCheck(self, keys, n=10000):
        """For a given list of potential keys, check for duplicated records with the number of duplications and all the columns.

            Null values are allowed in the potential keys, so duplication on Null valued keys will also be reported.

            Args:
                keys (list(string)): the key column list which the duplicate check applied
                n (integer): number of rows from input data for checking duplications, defaults to 10000

            Returns:
                (DataFrame): the key columns + a "_N" column + the remaining columns, for records whose key values are duplicated,
                    where "_N" holds the count of duplications of that record's key values
        """
        dfTopN = self.df.limit(n).cache()

        res = dfTopN.groupBy(*keys)\
            .agg(F.count(F.lit(1)).alias('_N'))\
            .where(F.col('_N') > 1)\
            .smvJoinByKey(dfTopN, keys, 'inner', True)\
            .orderBy(*keys)

        dfTopN.unpersist()
        return res 
Example #13
Source File: helpers.py    From SMV with Apache License 2.0
def smvSelectPlus(self, *cols):
        """Selects all the current columns in current DataFrame plus the supplied expressions

            The new columns are added to the end of the current column list.

            Args:
                cols (\*Column): expressions to add to the DataFrame

            Example:
                >>> df.smvSelectPlus((col("price") * col("count")).alias("amt"))

            Returns:
                (DataFrame): the resulting DataFrame with the supplied columns appended
        """
        jdf = self._jDfHelper.smvSelectPlus(_to_seq(cols, _jcol))
        return DataFrame(jdf, self._sql_ctx) 
Example #14
Source File: helpers.py    From SMV with Apache License 2.0
def smvPivotSum(self, pivotCols, valueCols, baseOutput):
        """Perform SmvPivot, then sum the results.
            Please refer to smvPivot's documentation for context and details of the SmvPivot operation.

            Args:
                pivotCols (list(list(str))): list of lists of column names to pivot
                valueCols (list(string)): names of value columns to sum
                baseOutput (list(str)): expected names of the pivoted columns

            Examples:
                For example, given a DataFrame df that represents the table

                +-----+-------+---------+-------+
                | id  | month | product | count |
                +=====+=======+=========+=======+
                | 1   | 5/14  |   A     |   100 |
                +-----+-------+---------+-------+
                | 1   | 6/14  |   B     |   200 |
                +-----+-------+---------+-------+
                | 1   | 5/14  |   B     |   300 |
                +-----+-------+---------+-------+

                we can use

                >>> df.smvGroupBy("id").smvPivotSum([["month", "product"]], ["count"], ["5_14_A", "5_14_B", "6_14_A", "6_14_B"])

                to produce the following output

                +-----+--------------+--------------+--------------+--------------+
                | id  | count_5_14_A | count_5_14_B | count_6_14_A | count_6_14_B |
                +=====+==============+==============+==============+==============+
                | 1   | 100          | 300          | NULL         | 200          |
                +-----+--------------+--------------+--------------+--------------+

            Returns:
                (DataFrame): result of pivot sum
        """
        return DataFrame(self.sgd.smvPivotSum(smv_copy_array(self.df._sc, *pivotCols), smv_copy_array(self.df._sc, *valueCols), smv_copy_array(self.df._sc, *baseOutput)), self.df.sql_ctx) 
Example #15
Source File: sample_scaffolds.py    From reinvent-scaffold-decorator with MIT License
def _join_results(self, scaffolds_df):

        def _read_rows(row):
            idx, _, dec = row.split("\t")
            return ps.Row(id=idx, decoration_smi=dec)

        sampled_df = SPARK.createDataFrame(SC.textFile(self._tmp_path(
            "sampled_decorations"), self.num_partitions).map(_read_rows))

        if self.decorator_type == "single":
            processed_df = self._join_results_single(scaffolds_df, sampled_df)
        elif self.decorator_type == "multi":
            processed_df = self._join_results_multi(scaffolds_df, sampled_df)
        else:
            raise ValueError("decorator_type has an invalid value '{}'".format(self.decorator_type))

        return processed_df\
            .where("smiles IS NOT NULL")\
            .groupBy("smiles")\
            .agg(
                psf.first("scaffold").alias("scaffold"),
                psf.first("decorations").alias("decorations"),
                psf.count("smiles").alias("count")) 
Example #16
Source File: base.py    From koalas with Apache License 2.0
def _nunique(self, dropna=True, approx=False, rsd=0.05):
        colname = self._internal.data_spark_column_names[0]
        count_fn = partial(F.approx_count_distinct, rsd=rsd) if approx else F.countDistinct
        if dropna:
            return count_fn(self.spark.column).alias(colname)
        else:
            return (
                count_fn(self.spark.column)
                + F.when(
                    F.count(F.when(self.spark.column.isNull(), 1).otherwise(None)) >= 1, 1
                ).otherwise(0)
            ).alias(colname) 
Example #17
Source File: indexes.py    From koalas with Apache License 2.0
def _get_level_number(self, level):
        """
        Return the level number if a valid level is given.
        """
        count = self.names.count(level)
        if (count > 1) and not isinstance(level, int):
            raise ValueError("The name %s occurs multiple times, use a level number" % level)
        if level in self.names:
            level = self.names.index(level)
        elif isinstance(level, int):
            nlevels = self.nlevels
            if level >= nlevels:
                raise IndexError(
                    "Too many levels: Index has only %d "
                    "levels, %d is not a valid level number" % (nlevels, level)
                )
            if level < 0:
                if (level + nlevels) < 0:
                    raise IndexError(
                        "Too many levels: Index has only %d levels, "
                        "not %d" % (nlevels, level + 1)
                    )
                level = level + nlevels
        else:
            raise KeyError("Level %s not found" % str(level))

        return level 
Example #18
Source File: compiler.py    From ibis with Apache License 2.0
def _handle_shift_operation(t, expr, scope, fn, *, window, **kwargs):
    op = expr.op()

    src_column = t.translate(op.arg, scope)
    default = op.default.op().value if op.default is not None else op.default
    offset = op.offset.op().value if op.offset is not None else op.offset

    if offset:
        return fn(src_column, count=offset, default=default).over(window)
    else:
        return fn(src_column, default=default).over(window) 
Example #19
Source File: plot.py    From koalas with Apache License 2.0
def _compute_hist(sdf, bins):
        # 'data' is a Spark DataFrame that selects one column.
        assert isinstance(bins, (np.ndarray, np.generic))

        colname = sdf.columns[-1]

        bucket_name = "__{}_bucket".format(colname)
        # creates a Bucketizer to get corresponding bin of each value
        bucketizer = Bucketizer(
            splits=bins, inputCol=colname, outputCol=bucket_name, handleInvalid="skip"
        )
        # after bucketing values, groups and counts them
        result = (
            bucketizer.transform(sdf)
            .select(bucket_name)
            .groupby(bucket_name)
            .agg(F.count("*").alias("count"))
            .toPandas()
            .sort_values(by=bucket_name)
        )

        # generates a pandas DF with one row for each bin
        # we need this as some of the bins may be empty
        indexes = pd.DataFrame({bucket_name: np.arange(0, len(bins) - 1), "bucket": bins[:-1]})
        # merges the bins with counts on it and fills remaining ones with zeros
        pdf = indexes.merge(result, how="left", on=[bucket_name]).fillna(0)[["count"]]
        pdf.columns = [bucket_name]

        return pdf[bucket_name] 
Example #20
Source File: series.py    From koalas with Apache License 2.0
def is_unique(self):
        """
        Return boolean if values in the object are unique

        Returns
        -------
        is_unique : boolean

        >>> ks.Series([1, 2, 3]).is_unique
        True
        >>> ks.Series([1, 2, 2]).is_unique
        False
        >>> ks.Series([1, 2, 3, None]).is_unique
        True
        """
        scol = self.spark.column

        # Here we check:
        #   1. whether the distinct count (ignoring nulls) equals the count of non-null values, and
        #   2. whether there is at most one null value, i.e. null contributes at most one distinct value.
        #
        # This workaround is in order to calculate the distinct count including nulls in
        # single pass. Note that COUNT(DISTINCT expr) in Spark is designed to ignore nulls.
        return self._internal.spark_frame.select(
            (F.count(scol) == F.countDistinct(scol))
            & (F.count(F.when(scol.isNull(), 1).otherwise(None)) <= 1)
        ).collect()[0][0] 
Example #21
Source File: groupby.py    From sparklingpandas with Apache License 2.0
def __len__(self):
        """Number of groups."""
        # TODO: use Spark SQL
        self._prep_pandas_groupby()
        return self._mergedRDD.count() 
Example #22
Source File: groupby.py    From sparklingpandas with Apache License 2.0
def ngroups(self):
        """Number of groups."""
        if self._can_use_new_school():
            return self._grouped_spark_sql.count()
        self._prep_pandas_groupby()
        return self._mergedRDD.count() 
Example #23
Source File: test_landfill_sampler.py    From python_mozetl with MIT License
def test_transform_contains_at_most_n_documents(generate_data):
    n = 3
    params = [
        # (namespace, doc_type, doc_version, n_documents)
        ("custom", "main", 4, n + 1),
        ("custom", "main", 3, n - 1),
        ("custom", "crash", 4, n),
    ]
    snippets = []
    for ns, dt, dv, n in params:
        # generate n different generic uri's for each of the parameters
        snippets += [{"uri": build_generic_uri(ns, dt, dv, uid)} for uid in range(n)]

    df = sampler.transform(generate_data(snippets), n)

    # assert there are at most n documents
    res = (
        df.groupBy("namespace", "doc_type", "doc_version")
        .agg(F.count("*").alias("n_documents"))
        .withColumn("at_most_n", F.col("n_documents") <= n)
        .collect()
    )

    # check that the different combinations are the same as the generated ones
    assert len(res) == len(params)
    # check that all of the combinations have at most n documents
    assert all([row.at_most_n for row in res]) 
Example #24
Source File: sample_scaffolds.py    From reinvent-scaffold-decorator with MIT License
def _initialize_results(self, scaffolds):
        data = [ps.Row(smiles=scaffold, scaffold=scaffold,
                       decorations={}, count=1) for scaffold in scaffolds]
        data_schema = pst.StructType([
            pst.StructField("smiles", pst.StringType()),
            pst.StructField("scaffold", pst.StringType()),
            pst.StructField("decorations", pst.MapType(pst.IntegerType(), pst.StringType())),
            pst.StructField("count", pst.IntegerType())
        ])
        return SPARK.createDataFrame(data, schema=data_schema) 
Example #25
Source File: create_dataframes.py    From listenbrainz-server with GNU General Public License v2.0
def get_listens_df(mapped_listens_df, metadata):
    """ Prepare listens dataframe.

        Args:
            mapped_listens_df (dataframe): listens mapped with msid_mbid_mapping.

        Returns:
            listens_df : Dataframe containing recording_mbids corresponding to a user.
    """
    listens_df = mapped_listens_df.select('mb_recording_mbid', 'user_name')
    metadata['listens_count'] = listens_df.count()
    return listens_df 
Example #26
Source File: compiler.py    From ibis with Apache License 2.0
def compile_count(t, expr, scope, context=None, **kwargs):
    return compile_aggregator(t, expr, scope, F.count, context, **kwargs) 
Example #27
Source File: testDataFrameHelper.py    From SMV with Apache License 2.0
def test_smvRenameField_preserve_meta_for_renamed_fields(self):
        df = self.createDF("a:Integer; b:String", "1,abc;1,def;2,ghij")
        desc = "c description"
        res1 = df.groupBy(col("a")).agg(count(col("a")).alias("c"))\
                 .smvDesc(("c", desc))
        self.assertEqual(res1.smvGetDesc(), [("a", ""), ("c", desc)])

        res2 = res1.smvRenameField(("c", "d"))
        self.assertEqual(res2.smvGetDesc(), [("a", ""), ("d", desc)]) 
Example #28
Source File: testDataFrameHelper.py    From SMV with Apache License 2.0
def test_smvRenameField_preserve_meta_for_unrenamed_fields(self):
        df = self.createDF("a:Integer; b:String", "1,abc;1,def;2,ghij")
        desc = "c description"
        res1 = df.groupBy(col("a")).agg(count(col("a")).alias("c"))\
                 .smvDesc(("c", desc))
        self.assertEqual(res1.smvGetDesc(), [("a", ""), ("c", desc)])

        res2 = res1.smvRenameField(("a", "d"))
        self.assertEqual(res2.smvGetDesc(), [("d", ""), ("c", desc)]) 
Example #29
Source File: testDataFrameHelper.py    From SMV with Apache License 2.0
def test_smvHistInt(self):
        df = self.createDF("k:String;v:Integer", "a,1;b,2")
        res = dfhelper(df)._smvHist("v")
        self.assertEqual(res, """Histogram of v: Numeric sort by Key
key                      count      Pct    cumCount   cumPct
1.0                          1   50.00%           1   50.00%
2.0                          1   50.00%           2  100.00%
-------------------------------------------------""") 
Example #30
Source File: testDataFrameHelper.py    From SMV with Apache License 2.0
def test_smvHistDate(self):
        df = self.createDF("k:Date;v:Integer", "2010-01-01,1;2010-01-02,2")
        res = dfhelper(df)._smvHist("k")
        self.assertEqual(res, """Histogram of k: String sort by Key
key                      count      Pct    cumCount   cumPct
2010-01-01                   1   50.00%           1   50.00%
2010-01-02                   1   50.00%           2  100.00%
-------------------------------------------------""")