Python pyspark.sql.functions.sum() Examples

The following are 20 code examples of pyspark.sql.functions.sum(). You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pyspark.sql.functions, or try the search function.
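Before the project examples, here is a minimal, self-contained sketch of the two ways F.sum() most often appears below: as a grouped aggregation and as a window function. The table and column names are invented for illustration.

from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("a", 1), ("a", 2), ("b", 3)], ["store", "amount"])

# Grouped aggregation: one output row per store.
df.groupBy("store").agg(F.sum("amount").alias("total")).show()

# Window function: a running total within each store.
w = Window.partitionBy("store").orderBy("amount")
df.withColumn("running_total", F.sum("amount").over(w)).show()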
Example #1
Source File: groupby.py    From sparklingpandas with Apache License 2.0
def sum(self):
        """Compute the sum for each group."""
        if self._can_use_new_school():
            self._prep_spark_sql_groupby()
            import pyspark.sql.functions as func
            return self._use_aggregation(func.sum)
        self._prep_pandas_groupby()
        myargs = self._myargs
        mykwargs = self._mykwargs

        def create_combiner(x):
            return x.groupby(*myargs, **mykwargs).sum()

        def merge_value(x, y):
            return pd.concat([x, create_combiner(y)])

        def merge_combiner(x, y):
            return x + y

        rddOfSum = self._sortIfNeeded(self._distributedRDD.combineByKey(
            create_combiner,
            merge_value,
            merge_combiner)).values()
        return DataFrame.fromDataFrameRDD(rddOfSum, self.sql_ctx) 
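The create_combiner/merge_value/merge_combiner trio above follows Spark's combineByKey contract. A minimal hedged sketch of that contract with plain integers (the key/value data is invented):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
rdd = spark.sparkContext.parallelize([("a", 1), ("a", 2), ("b", 3)])

# combineByKey takes three functions: build a combiner from the first value,
# fold further values into it, and merge combiners across partitions.
per_key_sums = rdd.combineByKey(
    lambda v: v,             # create_combiner
    lambda acc, v: acc + v,  # merge_value
    lambda a, b: a + b,      # merge_combiner
)
print(sorted(per_key_sums.collect()))  # [('a', 3), ('b', 3)]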
Example #2
Source File: employment.py    From SMV with Apache License 2.0
def run(self, i):
        df = i[Employment]
        return df.groupBy(F.col("ST")).agg(F.sum(F.col("EMP")).alias("EMP")) 
Example #3
Source File: employment.py    From SMV with Apache License 2.0
def run(self, i):
        df = i[_DEP_NAME_]
        return df.groupBy(F.col("ST")).agg(F.sum(F.col("EMP")).alias("EMP")) 
Example #4
Source File: employment.py    From SMV with Apache License 2.0
def run(self, i):
        df = i[inputdata.Employment]
        return df.groupBy(F.col("ST")).agg(F.sum(F.col("EMP")).alias("EMP")) 
Example #5
Source File: employment.py    From SMV with Apache License 2.0
def run(self, i):
        df = i[_DEP_NAME_]
        return df.groupBy(F.col("ST")).agg(F.sum(F.col("EMP")).alias("EMP")) 
Example #6
Source File: window.py    From koalas with Apache License 2.0
def sum(self):
        def sum(scol):
            return F.when(
                F.row_number().over(self._unbounded_window) >= self._min_periods,
                F.sum(scol).over(self._window),
            ).otherwise(F.lit(None))

        return self._apply_as_series_or_frame(sum) 
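The F.when/F.row_number gating above is what enforces min_periods. A hedged sketch of the same pattern in plain PySpark, with an invented column and an expanding window:

from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1,), (2,), (3,)], ["x"])

min_periods = 2
w = Window.orderBy("x").rowsBetween(Window.unboundedPreceding, Window.currentRow)

# Emit the running sum only once at least `min_periods` rows have been seen;
# earlier rows become null, mirroring pandas' expanding(min_periods=...).
df.withColumn(
    "expanding_sum",
    F.when(F.row_number().over(Window.orderBy("x")) >= min_periods,
           F.sum("x").over(w)).otherwise(F.lit(None)),
).show()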
Example #7
Source File: compiler.py    From ibis with Apache License 2.0
def compile_sum(t, expr, scope, context=None, **kwargs):
    return compile_aggregator(t, expr, scope, F.sum, context, **kwargs) 
Example #8
Source File: fields.py    From python_mozetl with MIT License
def agg_sum(field_name, alias=None, expression=None):
    field_alias = get_alias(field_name, alias, "sum")
    field_expression = expression
    if field_expression is None:
        field_expression = field_name
    return F.sum(field_expression).alias(field_alias) 
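A hypothetical call site for agg_sum; it assumes the module's get_alias helper (not shown here) is available, and the DataFrame df with client_id, active_hours and weight columns is invented for illustration:

import pyspark.sql.functions as F

summary = df.groupBy("client_id").agg(
    agg_sum("active_hours"),  # alias chosen by get_alias
    agg_sum("active_hours", alias="weighted_hours",
            expression=F.col("active_hours") * F.col("weight")),
)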
Example #9
Source File: sample_scaffolds.py    From reinvent-scaffold-decorator with MIT License
def run(self, initial_scaffolds):
        randomized_scaffold_udf = psf.udf(self._generate_func, pst.ArrayType(pst.StringType()))
        get_attachment_points_udf = psf.udf(usc.get_attachment_points, pst.ArrayType(pst.IntegerType()))
        remove_attachment_point_numbers_udf = psf.udf(usc.remove_attachment_point_numbers, pst.StringType())

        results_df = self._initialize_results(initial_scaffolds)
        scaffolds_df = results_df.select("smiles", "scaffold", "decorations")
        i = 0
        while scaffolds_df.count() > 0:
            # generate randomized SMILES
            self._log("info", "Starting iteration #%d.", i)
            scaffolds_df = scaffolds_df.withColumn("randomized_scaffold", randomized_scaffold_udf("smiles"))\
                .select(
                    "smiles", "scaffold", "decorations",
                    psf.explode("randomized_scaffold").alias("randomized_scaffold"))\
                .withColumn("attachment_points", get_attachment_points_udf("randomized_scaffold"))\
                .withColumn("randomized_scaffold", remove_attachment_point_numbers_udf("randomized_scaffold"))\
                .withColumn("id", psf.monotonically_increasing_id())\
                .persist()
            self._log("info", "Generated %d randomized SMILES from %d scaffolds.",
                      scaffolds_df.count(), scaffolds_df.select("smiles").distinct().count())

            # sample each randomized scaffold N times
            scaffolds = scaffolds_df.select("id", "randomized_scaffold")\
                .rdd.map(lambda row: (row["id"], row["randomized_scaffold"])).toLocalIterator()
            self._sample_and_write_scaffolds_to_disk(scaffolds, scaffolds_df.count())
            self._log("info", "Sampled %d scaffolds.", scaffolds_df.count())

            # merge decorated molecules
            joined_df = self._join_results(scaffolds_df).persist()

            if joined_df.count() > 0:
                self._log("info", "Joined %d -> %d (valid) -> %d unique sampled scaffolds",
                          scaffolds_df.count(), joined_df.agg(psf.sum("count")).head()[0], joined_df.count())

            scaffolds_df = joined_df.join(results_df, on="smiles", how="left_anti")\
                .select("smiles", "scaffold", "decorations")\
                .where("smiles LIKE '%*%'")
            self._log("info", "Obtained %d scaffolds for next iteration.", scaffolds_df.count())

            results_df = results_df.union(joined_df)\
                .groupBy("smiles")\
                .agg(
                    psf.first("scaffold").alias("scaffold"),
                    psf.first("decorations").alias("decorations"),
                    psf.sum("count").alias("count"))\
                .persist()
            i += 1

        return results_df 
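The left_anti join above is what removes already-processed molecules between iterations. A minimal hedged sketch of that join type, with invented SMILES strings:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
new = spark.createDataFrame([("C*",), ("CC*",)], ["smiles"])
seen = spark.createDataFrame([("C*",)], ["smiles"])

# left_anti keeps only the rows of `new` with no match in `seen` -- here "CC*".
new.join(seen, on="smiles", how="left_anti").show()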
Example #10
Source File: bookmark_validation.py    From python_mozetl with MIT License
def transform(spark):
    """Create the bookmark problem and summary tables."""

    query = """
    SELECT s.app_build_id,
           s.app_version,
           s.app_display_version,
           s.app_name,
           s.app_channel,
           s.uid,
           s.device_id AS device_id,
           s.submission_date_s3 AS submission_day,
           date_format(from_unixtime(s.when / 1000), 'yyyyMMdd') AS sync_day,
           s.when,
           s.status,
           e.name AS engine_name,
           e.status AS engine_status,
           e.failure_reason AS engine_failure_reason,
           e.validation.problems IS NOT NULL AS engine_has_problems,
           e.validation.version AS engine_validation_version,
           e.validation.checked AS engine_validation_checked,
           e.validation.took AS engine_validation_took,
           p.name AS engine_validation_problem_name,
           p.count AS engine_validation_problem_count
    FROM sync_summary s
    LATERAL VIEW explode(s.engines) AS e
    LATERAL VIEW OUTER explode(e.validation.problems) AS p
    WHERE s.failure_reason IS NULL
    """
    engine_validations = spark.sql(query)

    bookmark_validations = engine_validations.where(
        F.col("engine_name").isin("bookmarks", "bookmarks-buffered")
    )

    bookmark_validation_problems = bookmark_validations.where(
        F.col("engine_has_problems")
    )

    # Generate aggregates over all bookmarks
    bookmark_aggregates = (
        bookmark_validations.where(F.col("engine_validation_checked").isNotNull())
        # see bug 1410963 for submission date vs sync date
        .groupBy("submission_day").agg(
            F.countDistinct("uid", "device_id", "when").alias(
                "total_bookmark_validations"
            ),
            F.countDistinct("uid").alias("total_validated_users"),
            F.sum("engine_validation_checked").alias("total_bookmarks_checked"),
        )
    )

    bookmark_validation_problems.createOrReplaceTempView("bmk_validation_problems")
    bookmark_aggregates.createOrReplaceTempView("bmk_total_per_day") 
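The LATERAL VIEW explode in the query above has a DataFrame-API equivalent. A minimal hedged sketch with invented data:

import pyspark.sql.functions as F
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("u1", [1, 1])], ["uid", "checked"])

# F.explode yields one output row per array element, like LATERAL VIEW explode.
df.select("uid", F.explode("checked").alias("engine_validation_checked")) \
  .groupBy("uid") \
  .agg(F.sum("engine_validation_checked").alias("total_bookmarks_checked")) \
  .show()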
Example #11
Source File: addon_aggregates.py    From python_mozetl with MIT License
def aggregate_addons(df):
    """
    Aggregates add-on indicators by client, channel, version and locale.
    The result is a DataFrame with the additional aggregate columns:

    n_self_installed_addons (int)
    n_shield_addons (int)
    n_foreign_installed_addons (int)
    n_system_addons (int)
    n_web_extensions (int)
    first_addon_install_date (str %Y%m%d)
    profile_creation_date (str %Y%m%d)

    for each of the above facets.

    :param df: an exploded instance of main_summary by active_addons
               with various additional indicator columns
    :return SparkDF: an aggregated dataset with each of the above columns
    """
    addon_aggregates = (
        df.distinct()
        .groupBy("client_id", "normalized_channel", "app_version", "locale")
        .agg(
            fun.sum("is_self_install").alias("n_self_installed_addons"),
            fun.sum("is_shield_addon").alias("n_shield_addons"),
            fun.sum("is_foreign_install").alias("n_foreign_installed_addons"),
            fun.sum("is_system").alias("n_system_addons"),
            fun.sum("is_web_extension").alias("n_web_extensions"),
            fun.min(
                fun.when(
                    df.is_self_install == 1,
                    fun.date_format(
                        fun.from_unixtime(fun.col("install_day") * 60 * 60 * 24),
                        "yyyyMMdd",
                    ),
                ).otherwise(None)
            ).alias("first_addon_install_date"),
            fun.date_format(
                fun.from_unixtime(fun.min("profile_creation_date") * 60 * 60 * 24),
                "yyyyMMdd",
            ).alias("profile_creation_date"),
        )
    )
    return addon_aggregates 
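The fun.sum calls above count 0/1 indicator columns. A minimal hedged sketch of that counting-by-sum idiom, with invented data:

import pyspark.sql.functions as F
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("c1", 1), ("c1", 0), ("c2", 1)], ["client_id", "is_self_install"])

# Summing a 0/1 indicator per group gives a per-group count of matches.
df.groupBy("client_id").agg(
    F.sum("is_self_install").alias("n_self_installed_addons")).show()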
Example #12
Source File: groupby.py    From koalas with Apache License 2.0
def normalize_keyword_aggregation(kwargs):
    """
    Normalize user-provided kwargs.

    Transforms from the new ``Dict[str, NamedAgg]`` style kwargs
    to the old ``OrderedDict[str, List[scalar]]`` style.

    Parameters
    ----------
    kwargs : dict

    Returns
    -------
    aggspec : dict
        The transformed kwargs.
    columns : List[str]
        The user-provided keys.
    order : List[Tuple[str, str]]
        Pairs of the input and output column names.

    Examples
    --------
    >>> normalize_keyword_aggregation({'output': ('input', 'sum')})
    (OrderedDict([('input', ['sum'])]), ('output',), [('input', 'sum')])
    """
    # Plain dicts only preserve insertion order from Python 3.6 onwards; sort the
    # kwargs on older versions so the output is deterministic.
    PY36 = sys.version_info >= (3, 6)
    if not PY36:
        kwargs = OrderedDict(sorted(kwargs.items()))

    # TODO(Py35): When we drop python 3.5, change this to defaultdict(list)
    aggspec = OrderedDict()
    order = []
    columns, pairs = list(zip(*kwargs.items()))

    for column, aggfunc in pairs:
        if column in aggspec:
            aggspec[column].append(aggfunc)
        else:
            aggspec[column] = [aggfunc]

        order.append((column, aggfunc))
    # For MultiIndex, we need to flatten the tuple, e.g. (('y', 'A'), 'max') needs to be
    # flattened to ('y', 'A', 'max'), it won't do anything on normal Index.
    if isinstance(order[0][0], tuple):
        order = [(*levs, method) for levs, method in order]
    return aggspec, columns, order 
Example #13
Source File: groupby.py    From koalas with Apache License 2.0
def cumsum(self):
        """
        Cumulative sum for each group.

        Returns
        -------
        Series or DataFrame

        See Also
        --------
        Series.cumsum
        DataFrame.cumsum

        Examples
        --------
        >>> df = ks.DataFrame(
        ...     [[1, None, 4], [1, 0.1, 3], [1, 20.0, 2], [4, 10.0, 1]],
        ...     columns=list('ABC'))
        >>> df
           A     B  C
        0  1   NaN  4
        1  1   0.1  3
        2  1  20.0  2
        3  4  10.0  1

        By default, this iterates over rows and computes the cumulative sum in each column.

        >>> df.groupby("A").cumsum().sort_index()
              B  C
        0   NaN  4
        1   0.1  7
        2  20.1  9
        3  10.0  1

        It works similarly on a Series.

        >>> df.B.groupby(df.A).cumsum().sort_index()
        0     NaN
        1     0.1
        2    20.1
        3    10.0
        Name: B, dtype: float64

        """
        return self._apply_series_op(
            lambda sg: sg._kser._cum(F.sum, True, part_cols=sg._groupkeys_scols),
            should_resolve=True,
        ) 
Example #14
Source File: groupby.py    From koalas with Apache License 2.0
def cumprod(self):
        """
        Cumulative product for each group.

        Returns
        -------
        Series or DataFrame

        See Also
        --------
        Series.cumprod
        DataFrame.cumprod

        Examples
        --------
        >>> df = ks.DataFrame(
        ...     [[1, None, 4], [1, 0.1, 3], [1, 20.0, 2], [4, 10.0, 1]],
        ...     columns=list('ABC'))
        >>> df
           A     B  C
        0  1   NaN  4
        1  1   0.1  3
        2  1  20.0  2
        3  4  10.0  1

        By default, this iterates over rows and computes the cumulative product in each column.

        >>> df.groupby("A").cumprod().sort_index()
              B     C
        0   NaN   4.0
        1   0.1  12.0
        2   2.0  24.0
        3  10.0   1.0

        It works similarly on a Series.

        >>> df.B.groupby(df.A).cumprod().sort_index()
        0     NaN
        1     0.1
        2     2.0
        3    10.0
        Name: B, dtype: float64

        """
        return self._apply_series_op(
            lambda sg: sg._kser._cumprod(True, part_cols=sg._groupkeys_scols), should_resolve=True
        ) 
Example #15
Source File: groupby.py    From koalas with Apache License 2.0
def cummin(self):
        """
        Cumulative min for each group.

        Returns
        -------
        Series or DataFrame

        See Also
        --------
        Series.cummin
        DataFrame.cummin

        Examples
        --------
        >>> df = ks.DataFrame(
        ...     [[1, None, 4], [1, 0.1, 3], [1, 20.0, 2], [4, 10.0, 1]],
        ...     columns=list('ABC'))
        >>> df
           A     B  C
        0  1   NaN  4
        1  1   0.1  3
        2  1  20.0  2
        3  4  10.0  1

        By default, this iterates over rows and computes the cumulative minimum in each column.

        >>> df.groupby("A").cummin().sort_index()
              B  C
        0   NaN  4
        1   0.1  3
        2   0.1  2
        3  10.0  1

        It works similarly on a Series.

        >>> df.B.groupby(df.A).cummin().sort_index()
        0     NaN
        1     0.1
        2     0.1
        3    10.0
        Name: B, dtype: float64
        """
        return self._apply_series_op(
            lambda sg: sg._kser._cum(F.min, True, part_cols=sg._groupkeys_scols),
            should_resolve=True,
        ) 
Example #16
Source File: groupby.py    From koalas with Apache License 2.0
def cummax(self):
        """
        Cumulative max for each group.

        Returns
        -------
        Series or DataFrame

        See Also
        --------
        Series.cummax
        DataFrame.cummax

        Examples
        --------
        >>> df = ks.DataFrame(
        ...     [[1, None, 4], [1, 0.1, 3], [1, 20.0, 2], [4, 10.0, 1]],
        ...     columns=list('ABC'))
        >>> df
           A     B  C
        0  1   NaN  4
        1  1   0.1  3
        2  1  20.0  2
        3  4  10.0  1

        By default, this iterates over rows and computes the cumulative maximum in each column.

        >>> df.groupby("A").cummax().sort_index()
              B  C
        0   NaN  4
        1   0.1  4
        2  20.0  4
        3  10.0  1

        It works similarly on a Series.

        >>> df.C.groupby(df.A).cummax().sort_index()
        0    4
        1    4
        2    4
        3    1
        Name: C, dtype: int64

        """
        return self._apply_series_op(
            lambda sg: sg._kser._cum(F.max, True, part_cols=sg._groupkeys_scols),
            should_resolve=True,
        ) 
Example #17
Source File: generic.py    From koalas with Apache License 2.0
def sum(self, axis=None, numeric_only=True):
        """
        Return the sum of the values.

        Parameters
        ----------
        axis : {index (0), columns (1)}
            Axis for the function to be applied on.
        numeric_only : bool, default True
            Include only float, int, boolean columns. False is not supported. This parameter
            is mainly for pandas compatibility.

        Returns
        -------
        sum : scalar for a Series, and a Series for a DataFrame.

        Examples
        --------

        >>> df = ks.DataFrame({'a': [1, 2, 3, np.nan], 'b': [0.1, 0.2, 0.3, np.nan]},
        ...                   columns=['a', 'b'])

        On a DataFrame:

        >>> df.sum()
        a    6.0
        b    0.6
        Name: 0, dtype: float64

        >>> df.sum(axis=1)
        0    1.1
        1    2.2
        2    3.3
        3    0.0
        Name: 0, dtype: float64

        On a Series:

        >>> df['a'].sum()
        6.0
        """
        return self._reduce_for_stat_function(
            F.sum, name="sum", numeric_only=numeric_only, axis=axis
        ) 
Example #18
Source File: window.py    From koalas with Apache License 2.0
def sum(self):
        """
        Calculate the expanding summation of the given DataFrame or Series.

        Returns
        -------
        Series or DataFrame
            Same type as the input, with the same index, containing the
            expanding summation.

        See Also
        --------
        Series.expanding : Calling object with Series data.
        DataFrame.expanding : Calling object with DataFrames.
        Series.sum : Reducing sum for Series.
        DataFrame.sum : Reducing sum for DataFrame.

        Examples
        --------
        >>> s = ks.Series([2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5])
        >>> s.groupby(s).expanding(3).sum().sort_index()  # doctest: +NORMALIZE_WHITESPACE
        0
        2  0      NaN
           1      NaN
        3  2      NaN
           3      NaN
           4      9.0
        4  5      NaN
           6      NaN
           7     12.0
           8     16.0
        5  9      NaN
           10     NaN
        Name: 0, dtype: float64

        For DataFrame, each expanding summation is computed column-wise.

        >>> df = ks.DataFrame({"A": s.to_numpy(), "B": s.to_numpy() ** 2})
        >>> df.groupby(df.A).expanding(2).sum().sort_index()  # doctest: +NORMALIZE_WHITESPACE
                 A     B
        A
        2 0    NaN   NaN
          1    4.0   8.0
        3 2    NaN   NaN
          3    6.0  18.0
          4    9.0  27.0
        4 5    NaN   NaN
          6    8.0  32.0
          7   12.0  48.0
          8   16.0  64.0
        5 9    NaN   NaN
          10  10.0  50.0
        """
        return super(ExpandingGroupby, self).sum() 
Example #19
Source File: window.py    From koalas with Apache License 2.0
def sum(self):
        """
        Calculate the expanding summation of the given DataFrame or Series.

        .. note:: the current implementation of this API uses Spark's Window without
            specifying a partition specification. This moves all the data into a
            single partition on a single machine and can cause serious
            performance degradation. Avoid this method on very large datasets.

        Returns
        -------
        Series or DataFrame
            Same type as the input, with the same index, containing the
            expanding summation.

        See Also
        --------
        Series.expanding : Calling object with Series data.
        DataFrame.expanding : Calling object with DataFrames.
        Series.sum : Reducing sum for Series.
        DataFrame.sum : Reducing sum for DataFrame.

        Examples
        --------
        >>> s = ks.Series([1, 2, 3, 4, 5])
        >>> s
        0    1
        1    2
        2    3
        3    4
        4    5
        Name: 0, dtype: int64

        >>> s.expanding(3).sum()
        0     NaN
        1     NaN
        2     6.0
        3    10.0
        4    15.0
        Name: 0, dtype: float64

        For DataFrame, each expanding summation is computed column-wise.

        >>> df = ks.DataFrame({"A": s.to_numpy(), "B": s.to_numpy() ** 2})
        >>> df
           A   B
        0  1   1
        1  2   4
        2  3   9
        3  4  16
        4  5  25

        >>> df.expanding(3).sum()
              A     B
        0   NaN   NaN
        1   NaN   NaN
        2   6.0  14.0
        3  10.0  30.0
        4  15.0  55.0
        """
        return super(Expanding, self).sum() 
Example #20
Source File: window.py    From koalas with Apache License 2.0
def sum(self):
        """
        The rolling summation of any non-NaN observations inside the window.

        Returns
        -------
        Series or DataFrame
            Returned object type is determined by the caller of the rolling
            calculation.

        See Also
        --------
        Series.rolling : Calling object with Series data.
        DataFrame.rolling : Calling object with DataFrames.
        Series.sum : Sum of the full Series.
        DataFrame.sum : Sum of the full DataFrame.

        Examples
        --------
        >>> s = ks.Series([2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5])
        >>> s.groupby(s).rolling(3).sum().sort_index()  # doctest: +NORMALIZE_WHITESPACE
        0
        2  0      NaN
           1      NaN
        3  2      NaN
           3      NaN
           4      9.0
        4  5      NaN
           6      NaN
           7     12.0
           8     12.0
        5  9      NaN
           10     NaN
        Name: 0, dtype: float64

        For DataFrame, each rolling summation is computed column-wise.

        >>> df = ks.DataFrame({"A": s.to_numpy(), "B": s.to_numpy() ** 2})
        >>> df.groupby(df.A).rolling(2).sum().sort_index()  # doctest: +NORMALIZE_WHITESPACE
                 A     B
        A
        2 0    NaN   NaN
          1    4.0   8.0
        3 2    NaN   NaN
          3    6.0  18.0
          4    6.0  18.0
        4 5    NaN   NaN
          6    8.0  32.0
          7    8.0  32.0
          8    8.0  32.0
        5 9    NaN   NaN
          10  10.0  50.0
        """
        return super(RollingGroupby, self).sum()