Python pyspark.sql.functions.sum() Examples

The following are 20 code examples of pyspark.sql.functions.sum(). You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pyspark.sql.functions, or try the search function.
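Before the project examples, here is a minimal, self-contained sketch of the two ways F.sum() most often appears below: as a grouped aggregation and as a window function. The table and column names are invented for illustration.

from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("a", 1), ("a", 2), ("b", 3)], ["store", "amount"])

# Grouped aggregation: one output row per store.
df.groupBy("store").agg(F.sum("amount").alias("total")).show()

# Window function: a running total within each store.
w = Window.partitionBy("store").orderBy("amount")
df.withColumn("running_total", F.sum("amount").over(w)).show()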
Example #1
Source File: groupby.py    From sparklingpandas with Apache License 2.0
def sum(self):
        """Compute the sum for each group."""
        if self._can_use_new_school():
            self._prep_spark_sql_groupby()
            import pyspark.sql.functions as func
            return self._use_aggregation(func.sum)
        self._prep_pandas_groupby()
        myargs = self._myargs
        mykwargs = self._mykwargs

        def create_combiner(x):
            return x.groupby(*myargs, **mykwargs).sum()

        def merge_value(x, y):
            return pd.concat([x, create_combiner(y)])

        def merge_combiner(x, y):
            return x + y

        rddOfSum = self._sortIfNeeded(self._distributedRDD.combineByKey(
            create_combiner,
            merge_value,
            merge_combiner)).values()
        return DataFrame.fromDataFrameRDD(rddOfSum, self.sql_ctx) 
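The create_combiner/merge_value/merge_combiner trio above follows Spark's combineByKey contract. A minimal hedged sketch of that contract with plain integers (the key/value data is invented):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
rdd = spark.sparkContext.parallelize([("a", 1), ("a", 2), ("b", 3)])

# combineByKey takes three functions: build a combiner from the first value,
# fold further values into it, and merge combiners across partitions.
per_key_sums = rdd.combineByKey(
    lambda v: v,             # create_combiner
    lambda acc, v: acc + v,  # merge_value
    lambda a, b: a + b,      # merge_combiner
)
print(sorted(per_key_sums.collect()))  # [('a', 3), ('b', 3)]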
Example #2
Source File: employment.py    From SMV with Apache License 2.0
def run(self, i):
        df = i[Employment]
        return df.groupBy(F.col("ST")).agg(F.sum(F.col("EMP")).alias("EMP")) 
Example #3
Source File: employment.py    From SMV with Apache License 2.0
def run(self, i):
        df = i[_DEP_NAME_]
        return df.groupBy(F.col("ST")).agg(F.sum(F.col("EMP")).alias("EMP")) 
Example #4
Source File: employment.py    From SMV with Apache License 2.0
def run(self, i):
        df = i[inputdata.Employment]
        return df.groupBy(F.col("ST")).agg(F.sum(F.col("EMP")).alias("EMP")) 
Example #5
Source File: employment.py    From SMV with Apache License 2.0
def run(self, i):
        df = i[_DEP_NAME_]
        return df.groupBy(F.col("ST")).agg(F.sum(F.col("EMP")).alias("EMP")) 
Example #6
Source File: window.py    From koalas with Apache License 2.0
def sum(self):
        def sum(scol):
            return F.when(
                F.row_number().over(self._unbounded_window) >= self._min_periods,
                F.sum(scol).over(self._window),
            ).otherwise(F.lit(None))

        return self._apply_as_series_or_frame(sum) 
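The F.when/F.row_number gating above is what enforces min_periods. A hedged sketch of the same pattern in plain PySpark, with an invented column and an expanding window:

from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1,), (2,), (3,)], ["x"])

min_periods = 2
w = Window.orderBy("x").rowsBetween(Window.unboundedPreceding, Window.currentRow)

# Emit the running sum only once at least `min_periods` rows have been seen;
# earlier rows become null, mirroring pandas' expanding(min_periods=...).
df.withColumn(
    "expanding_sum",
    F.when(F.row_number().over(Window.orderBy("x")) >= min_periods,
           F.sum("x").over(w)).otherwise(F.lit(None)),
).show()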
Example #7
Source File: compiler.py    From ibis with Apache License 2.0
def compile_sum(t, expr, scope, context=None, **kwargs):
    return compile_aggregator(t, expr, scope, F.sum, context, **kwargs) 
Example #8
Source File: fields.py    From python_mozetl with MIT License
def agg_sum(field_name, alias=None, expression=None):
    field_alias = get_alias(field_name, alias, "sum")
    field_expression = expression
    if field_expression is None:
        field_expression = field_name
    return F.sum(field_expression).alias(field_alias) 
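A hypothetical call site for agg_sum; it assumes the module's get_alias helper (not shown here) is available, and the DataFrame df with client_id, active_hours and weight columns is invented for illustration:

import pyspark.sql.functions as F

summary = df.groupBy("client_id").agg(
    agg_sum("active_hours"),  # alias chosen by get_alias
    agg_sum("active_hours", alias="weighted_hours",
            expression=F.col("active_hours") * F.col("weight")),
)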
Example #9
Source File: sample_scaffolds.py    From reinvent-scaffold-decorator with MIT License
def run(self, initial_scaffolds):
        randomized_scaffold_udf = psf.udf(self._generate_func, pst.ArrayType(pst.StringType()))
        get_attachment_points_udf = psf.udf(usc.get_attachment_points, pst.ArrayType(pst.IntegerType()))
        remove_attachment_point_numbers_udf = psf.udf(usc.remove_attachment_point_numbers, pst.StringType())

        results_df = self._initialize_results(initial_scaffolds)
        scaffolds_df = results_df.select("smiles", "scaffold", "decorations")
        i = 0
        while scaffolds_df.count() > 0:
            # generate randomized SMILES
            self._log("info", "Starting iteration #%d.", i)
            scaffolds_df = scaffolds_df.withColumn("randomized_scaffold", randomized_scaffold_udf("smiles"))\
                .select(
                    "smiles", "scaffold", "decorations",
                    psf.explode("randomized_scaffold").alias("randomized_scaffold"))\
                .withColumn("attachment_points", get_attachment_points_udf("randomized_scaffold"))\
                .withColumn("randomized_scaffold", remove_attachment_point_numbers_udf("randomized_scaffold"))\
                .withColumn("id", psf.monotonically_increasing_id())\
                .persist()
            self._log("info", "Generated %d randomized SMILES from %d scaffolds.",
                      scaffolds_df.count(), scaffolds_df.select("smiles").distinct().count())

            # sample each randomized scaffold N times
            scaffolds = scaffolds_df.select("id", "randomized_scaffold")\
                .rdd.map(lambda row: (row["id"], row["randomized_scaffold"])).toLocalIterator()
            self._sample_and_write_scaffolds_to_disk(scaffolds, scaffolds_df.count())
            self._log("info", "Sampled %d scaffolds.", scaffolds_df.count())

            # merge decorated molecules
            joined_df = self._join_results(scaffolds_df).persist()

            if joined_df.count() > 0:
                self._log("info", "Joined %d -> %d (valid) -> %d unique sampled scaffolds",
                          scaffolds_df.count(), joined_df.agg(psf.sum("count")).head()[0], joined_df.count())

            scaffolds_df = joined_df.join(results_df, on="smiles", how="left_anti")\
                .select("smiles", "scaffold", "decorations")\
                .where("smiles LIKE '%*%'")
            self._log("info", "Obtained %d scaffolds for next iteration.", scaffolds_df.count())

            results_df = results_df.union(joined_df)\
                .groupBy("smiles")\
                .agg(
                    psf.first("scaffold").alias("scaffold"),
                    psf.first("decorations").alias("decorations"),
                    psf.sum("count").alias("count"))\
                .persist()
            i += 1

        return results_df 
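The left_anti join above is what removes already-processed molecules between iterations. A minimal hedged sketch of that join type, with invented SMILES strings:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
new = spark.createDataFrame([("C*",), ("CC*",)], ["smiles"])
seen = spark.createDataFrame([("C*",)], ["smiles"])

# left_anti keeps only the rows of `new` with no match in `seen` -- here "CC*".
new.join(seen, on="smiles", how="left_anti").show()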
Example #10
Source File: bookmark_validation.py    From python_mozetl with MIT License
def transform(spark):
    """Create the bookmark problem and summary tables."""

    query = """
    SELECT s.app_build_id,
           s.app_version,
           s.app_display_version,
           s.app_name,
           s.app_channel,
           s.uid,
           s.device_id AS device_id,
           s.submission_date_s3 AS submission_day,
           date_format(from_unixtime(s.when / 1000), 'yyyyMMdd') AS sync_day,
           s.when,
           s.status,
           e.name AS engine_name,
           e.status AS engine_status,
           e.failure_reason AS engine_failure_reason,
           e.validation.problems IS NOT NULL AS engine_has_problems,
           e.validation.version AS engine_validation_version,
           e.validation.checked AS engine_validation_checked,
           e.validation.took AS engine_validation_took,
           p.name AS engine_validation_problem_name,
           p.count AS engine_validation_problem_count
    FROM sync_summary s
    LATERAL VIEW explode(s.engines) AS e
    LATERAL VIEW OUTER explode(e.validation.problems) AS p
    WHERE s.failure_reason IS NULL
    """
    engine_validations = spark.sql(query)

    bookmark_validations = engine_validations.where(
        F.col("engine_name").isin("bookmarks", "bookmarks-buffered")
    )

    bookmark_validation_problems = bookmark_validations.where(
        F.col("engine_has_problems")
    )

    # Generate aggregates over all bookmarks
    bookmark_aggregates = (
        bookmark_validations.where(F.col("engine_validation_checked").isNotNull())
        # see bug 1410963 for submission date vs sync date
        .groupBy("submission_day").agg(
            F.countDistinct("uid", "device_id", "when").alias(
                "total_bookmark_validations"
            ),
            F.countDistinct("uid").alias("total_validated_users"),
            F.sum("engine_validation_checked").alias("total_bookmarks_checked"),
        )
    )

    bookmark_validation_problems.createOrReplaceTempView("bmk_validation_problems")
    bookmark_aggregates.createOrReplaceTempView("bmk_total_per_day") 
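The LATERAL VIEW explode in the query above has a DataFrame-API equivalent. A minimal hedged sketch with invented data:

import pyspark.sql.functions as F
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("u1", [1, 1])], ["uid", "checked"])

# F.explode yields one output row per array element, like LATERAL VIEW explode.
df.select("uid", F.explode("checked").alias("engine_validation_checked")) \
  .groupBy("uid") \
  .agg(F.sum("engine_validation_checked").alias("total_bookmarks_checked")) \
  .show()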
Example #11
Source File: addon_aggregates.py    From python_mozetl with MIT License
def aggregate_addons(df):
    """
    Aggregates add-on indicators by client, channel, version and locale.
    The result is a DataFrame with the additional aggregate columns:

    n_self_installed_addons (int)
    n_shield_addons (int)
    n_foreign_installed_addons (int)
    n_system_addons (int)
    n_web_extensions (int)
    first_addon_install_date (str %Y%m%d)
    profile_creation_date (str %Y%m%d)

    for each of the above facets.

    :param df: an exploded instance of main_summary by active_addons
               with various additional indicator columns
    :return SparkDF: an aggregated dataset with each of the above columns
    """
    addon_aggregates = (
        df.distinct()
        .groupBy("client_id", "normalized_channel", "app_version", "locale")
        .agg(
            fun.sum("is_self_install").alias("n_self_installed_addons"),
            fun.sum("is_shield_addon").alias("n_shield_addons"),
            fun.sum("is_foreign_install").alias("n_foreign_installed_addons"),
            fun.sum("is_system").alias("n_system_addons"),
            fun.sum("is_web_extension").alias("n_web_extensions"),
            fun.min(
                fun.when(
                    df.is_self_install == 1,
                    fun.date_format(
                        fun.from_unixtime(fun.col("install_day") * 60 * 60 * 24),
                        "yyyyMMdd",
                    ),
                ).otherwise(None)
            ).alias("first_addon_install_date"),
            fun.date_format(
                fun.from_unixtime(fun.min("profile_creation_date") * 60 * 60 * 24),
                "yyyyMMdd",
            ).alias("profile_creation_date"),
        )
    )
    return addon_aggregates 
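The fun.sum calls above count 0/1 indicator columns. A minimal hedged sketch of that counting-by-sum idiom, with invented data:

import pyspark.sql.functions as F
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("c1", 1), ("c1", 0), ("c2", 1)], ["client_id", "is_self_install"])

# Summing a 0/1 indicator per group gives a per-group count of matches.
df.groupBy("client_id").agg(
    F.sum("is_self_install").alias("n_self_installed_addons")).show()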
Example #12
Source File: groupby.py    From koalas with Apache License 2.0
def normalize_keyword_aggregation(kwargs):
    """
    Normalize user-provided kwargs.

    Transforms from the new ``Dict[str, NamedAgg]`` style kwargs
    to the old ``OrderedDict[str, List[scalar]]`` style.

    Parameters
    ----------
    kwargs : dict

    Returns
    -------
    aggspec : dict
        The transformed kwargs.
    columns : List[str]
        The user-provided keys.
    order : List[Tuple[str, str]]
        Pairs of the input and output column names.

    Examples
    --------
    >>> normalize_keyword_aggregation({'output': ('input', 'sum')})
    (OrderedDict([('input', ['sum'])]), ('output',), [('input', 'sum')])
    """
    # Plain dicts only preserve insertion order from Python 3.6 onwards; sort the
    # kwargs on older versions so the output is deterministic.
    PY36 = sys.version_info >= (3, 6)
    if not PY36:
        kwargs = OrderedDict(sorted(kwargs.items()))

    # TODO(Py35): When we drop python 3.5, change this to defaultdict(list)
    aggspec = OrderedDict()
    order = []
    columns, pairs = list(zip(*kwargs.items()))

    for column, aggfunc in pairs:
        if column in aggspec:
            aggspec[column].append(aggfunc)
        else:
            aggspec[column] = [aggfunc]

        order.append((column, aggfunc))
    # For MultiIndex, we need to flatten the tuple, e.g. (('y', 'A'), 'max') needs to be
    # flattened to ('y', 'A', 'max'), it won't do anything on normal Index.
    if isinstance(order[0][0], tuple):
        order = [(*levs, method) for levs, method in order]
    return aggspec, columns, order 
Example #13
Source File: groupby.py    From koalas with Apache License 2.0
def cumsum(self):
        """
        Cumulative sum for each group.

        Returns
        -------
        Series or DataFrame

        See Also
        --------
        Series.cumsum
        DataFrame.cumsum

        Examples
        --------
        >>> df = ks.DataFrame(
        ...     [[1, None, 4], [1, 0.1, 3], [1, 20.0, 2], [4, 10.0, 1]],
        ...     columns=list('ABC'))
        >>> df
           A     B  C
        0  1   NaN  4
        1  1   0.1  3
        2  1  20.0  2
        3  4  10.0  1

        By default, this iterates over rows and computes the cumulative sum in each column.

        >>> df.groupby("A").cumsum().sort_index()
              B  C
        0   NaN  4
        1   0.1  7
        2  20.1  9
        3  10.0  1

        It works similarly on a Series.

        >>> df.B.groupby(df.A).cumsum().sort_index()
        0     NaN
        1     0.1
        2    20.1
        3    10.0
        Name: B, dtype: float64

        """
        return self._apply_series_op(
            lambda sg: sg._kser._cum(F.sum, True, part_cols=sg._groupkeys_scols),
            should_resolve=True,
        ) 
Example #14
Source File: groupby.py    From koalas with Apache License 2.0
def cumprod(self):
        """
        Cumulative product for each group.

        Returns
        -------
        Series or DataFrame

        See Also
        --------
        Series.cumprod
        DataFrame.cumprod

        Examples
        --------
        >>> df = ks.DataFrame(
        ...     [[1, None, 4], [1, 0.1, 3], [1, 20.0, 2], [4, 10.0, 1]],
        ...     columns=list('ABC'))
        >>> df
           A     B  C
        0  1   NaN  4
        1  1   0.1  3
        2  1  20.0  2
        3  4  10.0  1

        By default, this iterates over rows and computes the cumulative product in each column.

        >>> df.groupby("A").cumprod().sort_index()
              B     C
        0   NaN   4.0
        1   0.1  12.0
        2   2.0  24.0
        3  10.0   1.0

        It works similarly on a Series.

        >>> df.B.groupby(df.A).cumprod().sort_index()
        0     NaN
        1     0.1
        2     2.0
        3    10.0
        Name: B, dtype: float64

        """
        return self._apply_series_op(
            lambda sg: sg._kser._cumprod(True, part_cols=sg._groupkeys_scols), should_resolve=True
        ) 
Example #15
Source File: groupby.py    From koalas with Apache License 2.0
def cummin(self):
        """
        Cumulative min for each group.

        Returns
        -------
        Series or DataFrame

        See Also
        --------
        Series.cummin
        DataFrame.cummin

        Examples
        --------
        >>> df = ks.DataFrame(
        ...     [[1, None, 4], [1, 0.1, 3], [1, 20.0, 2], [4, 10.0, 1]],
        ...     columns=list('ABC'))
        >>> df
           A     B  C
        0  1   NaN  4
        1  1   0.1  3
        2  1  20.0  2
        3  4  10.0  1

        By default, this iterates over rows and computes the cumulative minimum in each column.

        >>> df.groupby("A").cummin().sort_index()
              B  C
        0   NaN  4
        1   0.1  3
        2   0.1  2
        3  10.0  1

        It works similarly on a Series.

        >>> df.B.groupby(df.A).cummin().sort_index()
        0     NaN
        1     0.1
        2     0.1
        3    10.0
        Name: B, dtype: float64
        """
        return self._apply_series_op(
            lambda sg: sg._kser._cum(F.min, True, part_cols=sg._groupkeys_scols),
            should_resolve=True,
        ) 
Example #16
Source File: groupby.py    From koalas with Apache License 2.0
def cummax(self):
        """
        Cumulative max for each group.

        Returns
        -------
        Series or DataFrame

        See Also
        --------
        Series.cummax
        DataFrame.cummax

        Examples
        --------
        >>> df = ks.DataFrame(
        ...     [[1, None, 4], [1, 0.1, 3], [1, 20.0, 2], [4, 10.0, 1]],
        ...     columns=list('ABC'))
        >>> df
           A     B  C
        0  1   NaN  4
        1  1   0.1  3
        2  1  20.0  2
        3  4  10.0  1

        By default, this iterates over rows and computes the cumulative maximum in each column.

        >>> df.groupby("A").cummax().sort_index()
              B  C
        0   NaN  4
        1   0.1  4
        2  20.0  4
        3  10.0  1

        It works similarly on a Series.

        >>> df.C.groupby(df.A).cummax().sort_index()
        0    4
        1    4
        2    4
        3    1
        Name: C, dtype: int64

        """
        return self._apply_series_op(
            lambda sg: sg._kser._cum(F.max, True, part_cols=sg._groupkeys_scols),
            should_resolve=True,
        ) 
Example #17
Source File: generic.py    From koalas with Apache License 2.0
def sum(self, axis=None, numeric_only=True):
        """
        Return the sum of the values.

        Parameters
        ----------
        axis : {index (0), columns (1)}
            Axis for the function to be applied on.
        numeric_only : bool, default True
            Include only float, int, boolean columns. False is not supported. This parameter
            is mainly for pandas compatibility.

        Returns
        -------
        sum : scalar for a Series, and a Series for a DataFrame.

        Examples
        --------

        >>> df = ks.DataFrame({'a': [1, 2, 3, np.nan], 'b': [0.1, 0.2, 0.3, np.nan]},
        ...                   columns=['a', 'b'])

        On a DataFrame:

        >>> df.sum()
        a    6.0
        b    0.6
        Name: 0, dtype: float64

        >>> df.sum(axis=1)
        0    1.1
        1    2.2
        2    3.3
        3    0.0
        Name: 0, dtype: float64

        On a Series:

        >>> df['a'].sum()
        6.0
        """
        return self._reduce_for_stat_function(
            F.sum, name="sum", numeric_only=numeric_only, axis=axis
        ) 
Example #18
Source File: window.py    From koalas with Apache License 2.0
def sum(self):
        """
        Calculate the expanding summation of the given DataFrame or Series.

        Returns
        -------
        Series or DataFrame
            Same type as the input, with the same index, containing the
            expanding summation.

        See Also
        --------
        Series.expanding : Calling object with Series data.
        DataFrame.expanding : Calling object with DataFrames.
        Series.sum : Reducing sum for Series.
        DataFrame.sum : Reducing sum for DataFrame.

        Examples
        --------
        >>> s = ks.Series([2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5])
        >>> s.groupby(s).expanding(3).sum().sort_index()  # doctest: +NORMALIZE_WHITESPACE
        0
        2  0      NaN
           1      NaN
        3  2      NaN
           3      NaN
           4      9.0
        4  5      NaN
           6      NaN
           7     12.0
           8     16.0
        5  9      NaN
           10     NaN
        Name: 0, dtype: float64

        For DataFrame, each expanding summation is computed column-wise.

        >>> df = ks.DataFrame({"A": s.to_numpy(), "B": s.to_numpy() ** 2})
        >>> df.groupby(df.A).expanding(2).sum().sort_index()  # doctest: +NORMALIZE_WHITESPACE
                 A     B
        A
        2 0    NaN   NaN
          1    4.0   8.0
        3 2    NaN   NaN
          3    6.0  18.0
          4    9.0  27.0
        4 5    NaN   NaN
          6    8.0  32.0
          7   12.0  48.0
          8   16.0  64.0
        5 9    NaN   NaN
          10  10.0  50.0
        """
        return super(ExpandingGroupby, self).sum() 
Example #19
Source File: window.py    From koalas with Apache License 2.0
def sum(self):
        """
        Calculate the expanding summation of the given DataFrame or Series.

        .. note:: the current implementation of this API uses Spark's Window without
            specifying a partition specification. This moves all the data into a
            single partition on a single machine and can cause serious
            performance degradation. Avoid this method on very large datasets.

        Returns
        -------
        Series or DataFrame
            Same type as the input, with the same index, containing the
            expanding summation.

        See Also
        --------
        Series.expanding : Calling object with Series data.
        DataFrame.expanding : Calling object with DataFrames.
        Series.sum : Reducing sum for Series.
        DataFrame.sum : Reducing sum for DataFrame.

        Examples
        --------
        >>> s = ks.Series([1, 2, 3, 4, 5])
        >>> s
        0    1
        1    2
        2    3
        3    4
        4    5
        Name: 0, dtype: int64

        >>> s.expanding(3).sum()
        0     NaN
        1     NaN
        2     6.0
        3    10.0
        4    15.0
        Name: 0, dtype: float64

        For DataFrame, each expanding summation is computed column-wise.

        >>> df = ks.DataFrame({"A": s.to_numpy(), "B": s.to_numpy() ** 2})
        >>> df
           A   B
        0  1   1
        1  2   4
        2  3   9
        3  4  16
        4  5  25

        >>> df.expanding(3).sum()
              A     B
        0   NaN   NaN
        1   NaN   NaN
        2   6.0  14.0
        3  10.0  30.0
        4  15.0  55.0
        """
        return super(Expanding, self).sum() 
Example #20
Source File: window.py    From koalas with Apache License 2.0
def sum(self):
        """
        The rolling summation of any non-NaN observations inside the window.

        Returns
        -------
        Series or DataFrame
            Returned object type is determined by the caller of the rolling
            calculation.

        See Also
        --------
        Series.rolling : Calling object with Series data.
        DataFrame.rolling : Calling object with DataFrames.
        Series.sum : Sum of the full Series.
        DataFrame.sum : Sum of the full DataFrame.

        Examples
        --------
        >>> s = ks.Series([2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5])
        >>> s.groupby(s).rolling(3).sum().sort_index()  # doctest: +NORMALIZE_WHITESPACE
        0
        2  0      NaN
           1      NaN
        3  2      NaN
           3      NaN
           4      9.0
        4  5      NaN
           6      NaN
           7     12.0
           8     12.0
        5  9      NaN
           10     NaN
        Name: 0, dtype: float64

        For DataFrame, each rolling summation is computed column-wise.

        >>> df = ks.DataFrame({"A": s.to_numpy(), "B": s.to_numpy() ** 2})
        >>> df.groupby(df.A).rolling(2).sum().sort_index()  # doctest: +NORMALIZE_WHITESPACE
                 A     B
        A
        2 0    NaN   NaN
          1    4.0   8.0
        3 2    NaN   NaN
          3    6.0  18.0
          4    6.0  18.0
        4 5    NaN   NaN
          6    8.0  32.0
          7    8.0  32.0
          8    8.0  32.0
        5 9    NaN   NaN
          10  10.0  50.0
        """
        return super(RollingGroupby, self).sum()