Python pyspark.sql.functions.sum() Examples
The following are 20 code examples of pyspark.sql.functions.sum().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module pyspark.sql.functions, or try the search function.
Example #1
Source File: groupby.py From sparklingpandas with Apache License 2.0 | 6 votes |
def sum(self):
    """Compute the sum for each group.

    If ``self._can_use_new_school()`` is true, the aggregation is delegated
    to ``self._use_aggregation`` with ``pyspark.sql.functions.sum``.
    Otherwise every pandas frame in ``self._distributedRDD`` is grouped and
    summed locally and the partial results are merged with ``combineByKey``.
    """
    if self._can_use_new_school():
        self._prep_spark_sql_groupby()
        import pyspark.sql.functions as func
        return self._use_aggregation(func.sum)

    self._prep_pandas_groupby()
    # Copy the grouping arguments into locals that the closures below use.
    # NOTE(review): presumably this keeps ``self`` out of the closures that
    # Spark has to serialize -- confirm.
    group_args = self._myargs
    group_kwargs = self._mykwargs

    def sum_groups(frame):
        # First value seen for a key: group it and sum each group.
        return frame.groupby(*group_args, **group_kwargs).sum()

    def fold_in_frame(acc, frame):
        # A further raw frame for an existing key: append its group sums.
        return pd.concat([acc, sum_groups(frame)])

    def merge_partials(left, right):
        # Two already-combined results for the same key.
        return left + right

    combined = self._distributedRDD.combineByKey(
        sum_groups, fold_in_frame, merge_partials)
    summed_rdd = self._sortIfNeeded(combined).values()
    return DataFrame.fromDataFrameRDD(summed_rdd, self.sql_ctx)
Example #2
Source File: employment.py From SMV with Apache License 2.0 | 5 votes |
def run(self, i):
    """Return the total of ``EMP`` for each ``ST`` value of the ``Employment`` input."""
    employment = i[Employment]
    by_state = employment.groupBy(F.col("ST"))
    total_emp = F.sum(F.col("EMP")).alias("EMP")
    return by_state.agg(total_emp)
Example #3
Source File: employment.py From SMV with Apache License 2.0 | 5 votes |
def run(self, i):
    """Return the total of ``EMP`` for each ``ST`` value of the ``_DEP_NAME_`` input."""
    dep_df = i[_DEP_NAME_]
    by_state = dep_df.groupBy(F.col("ST"))
    total_emp = F.sum(F.col("EMP")).alias("EMP")
    return by_state.agg(total_emp)
Example #4
Source File: employment.py From SMV with Apache License 2.0 | 5 votes |
def run(self, i):
    """Return the total of ``EMP`` for each ``ST`` value of ``inputdata.Employment``."""
    employment = i[inputdata.Employment]
    by_state = employment.groupBy(F.col("ST"))
    total_emp = F.sum(F.col("EMP")).alias("EMP")
    return by_state.agg(total_emp)
Example #5
Source File: employment.py From SMV with Apache License 2.0 | 5 votes |
def run(self, i):
    """Return the total of ``EMP`` for each ``ST`` value of the ``_DEP_NAME_`` input."""
    dep_df = i[_DEP_NAME_]
    by_state = dep_df.groupBy(F.col("ST"))
    total_emp = F.sum(F.col("EMP")).alias("EMP")
    return by_state.agg(total_emp)
Example #6
Source File: window.py From koalas with Apache License 2.0 | 5 votes |
def sum(self):
    """Apply a windowed sum that is null until ``self._min_periods`` rows are reached.

    The per-column expression is handed to ``self._apply_as_series_or_frame``.
    """

    # NOTE(review): the helper stays named ``sum``; the code of
    # ``_apply_as_series_or_frame`` is not visible here and might look at the
    # callable's name.
    def sum(scol):
        # Row position within the unbounded window decides whether enough
        # observations exist to emit a value.
        has_min_periods = (
            F.row_number().over(self._unbounded_window) >= self._min_periods
        )
        windowed_total = F.sum(scol).over(self._window)
        return F.when(has_min_periods, windowed_total).otherwise(F.lit(None))

    return self._apply_as_series_or_frame(sum)
Example #7
Source File: compiler.py From ibis with Apache License 2.0 | 5 votes |
def compile_sum(t, expr, scope, context=None, **kwargs):
    """Compile a sum expression by delegating to ``compile_aggregator``.

    Every argument is forwarded unchanged, with ``F.sum`` supplied as the
    aggregation function. The meaning of ``t``, ``expr``, ``scope`` and
    ``context`` is defined by ``compile_aggregator``, which is not visible
    in this snippet.
    """
    return compile_aggregator(t, expr, scope, F.sum, context, **kwargs)
Example #8
Source File: fields.py From python_mozetl with MIT License | 5 votes |
def agg_sum(field_name, alias=None, expression=None):
    """Build an aliased ``F.sum`` aggregate column.

    :param field_name: name passed to ``get_alias`` and, when no
        ``expression`` is given, the thing that is summed
    :param alias: optional alias passed on to ``get_alias``
    :param expression: optional expression to sum instead of ``field_name``
    :return: ``F.sum(...)`` aliased with the result of ``get_alias``
    """
    field_alias = get_alias(field_name, alias, "sum")
    summed = field_name if expression is None else expression
    return F.sum(summed).alias(field_alias)
Example #9
Source File: sample_scaffolds.py From reinvent-scaffold-decorator with MIT License | 4 votes |
def run(self, initial_scaffolds):
    """Iteratively randomize, sample and merge scaffolds until none are left.

    Each pass of the loop explodes the current scaffolds into randomized
    variants, hands them to ``self._sample_and_write_scaffolds_to_disk``,
    joins the sampled output back via ``self._join_results`` and folds it
    into the accumulated results. The loop ends when the frame of scaffolds
    for the next iteration is empty.

    :param initial_scaffolds: passed as-is to ``self._initialize_results``.
    :return: the accumulated results DataFrame. After at least one iteration
        it is grouped by ``smiles`` with the first ``scaffold``, the first
        ``decorations`` and the summed ``count`` per group.
    """
    # Wrap the Python helpers as Spark UDFs with explicit return types.
    randomized_scaffold_udf = psf.udf(self._generate_func, pst.ArrayType(pst.StringType()))
    get_attachment_points_udf = psf.udf(usc.get_attachment_points, pst.ArrayType(pst.IntegerType()))
    remove_attachment_point_numbers_udf = psf.udf(usc.remove_attachment_point_numbers, pst.StringType())

    results_df = self._initialize_results(initial_scaffolds)
    scaffolds_df = results_df.select("smiles", "scaffold", "decorations")
    i = 0
    # NOTE(review): every ``count()`` below is a Spark action; the
    # ``persist()`` calls presumably exist so they do not recompute the
    # whole lineage -- confirm.
    while scaffolds_df.count() > 0:
        # generate randomized SMILES
        self._log("info", "Starting iteration #%d.", i)
        # One output row per randomized scaffold, tagged with its attachment
        # points and a generated id.
        scaffolds_df = scaffolds_df.withColumn("randomized_scaffold", randomized_scaffold_udf("smiles"))\
            .select(
                "smiles", "scaffold", "decorations",
                psf.explode("randomized_scaffold").alias("randomized_scaffold"))\
            .withColumn("attachment_points", get_attachment_points_udf("randomized_scaffold"))\
            .withColumn("randomized_scaffold", remove_attachment_point_numbers_udf("randomized_scaffold"))\
            .withColumn("id", psf.monotonically_increasing_id())\
            .persist()
        self._log("info", "Generated %d randomized SMILES from %d scaffolds.",
                  scaffolds_df.count(), scaffolds_df.select("smiles").distinct().count())

        # sample each randomized scaffold N times
        # (id, randomized_scaffold) pairs are streamed to the driver lazily.
        scaffolds = scaffolds_df.select("id", "randomized_scaffold")\
            .rdd.map(lambda row: (row["id"], row["randomized_scaffold"])).toLocalIterator()
        self._sample_and_write_scaffolds_to_disk(scaffolds, scaffolds_df.count())
        self._log("info", "Sampled %d scaffolds.", scaffolds_df.count())

        # merge decorated molecules
        joined_df = self._join_results(scaffolds_df).persist()
        if joined_df.count() > 0:
            self._log("info", "Joined %d -> %d (valid) -> %d unique sampled scaffolds",
                      scaffolds_df.count(), joined_df.agg(psf.sum("count")).head()[0],
                      joined_df.count())

        # Next round: joined rows whose smiles is not already in the results
        # (left_anti join) and still contains a '*' character.
        # NOTE(review): '*' presumably marks an open attachment point -- confirm.
        scaffolds_df = joined_df.join(results_df, on="smiles", how="left_anti")\
            .select("smiles", "scaffold", "decorations")\
            .where("smiles LIKE '%*%'")
        self._log("info", "Obtained %d scaffolds for next iteration.", scaffolds_df.count())

        # Fold this round into the accumulated results: one row per smiles,
        # with the counts added up.
        results_df = results_df.union(joined_df)\
            .groupBy("smiles")\
            .agg(
                psf.first("scaffold").alias("scaffold"),
                psf.first("decorations").alias("decorations"),
                psf.sum("count").alias("count"))\
            .persist()
        i += 1

    return results_df
Example #10
Source File: bookmark_validation.py From python_mozetl with MIT License | 4 votes |
def transform(spark):
    """Create the bookmark problem and summary tables.

    Runs a query over the ``sync_summary`` table that yields one row per
    sync/engine/validation-problem combination, keeps the bookmark engines,
    and registers two temporary views:

    * ``bmk_validation_problems`` -- bookmark rows where
      ``engine_has_problems`` is true.
    * ``bmk_total_per_day`` -- per ``submission_day`` totals of validations,
      validated users and bookmarks checked.

    :param spark: object whose ``sql`` method runs the query (a
        ``SparkSession`` in practice) and that has ``sync_summary`` available.
    :return: None; the results are exposed through the two temporary views.
    """
    # The date pattern uses ``yyyy`` (calendar year). The previous ``YYYY``
    # is the week-based year, which labels days around New Year with the
    # wrong year (e.g. 2018-12-31 -> "20191231").
    # The literal is assembled from adjacent strings only for readability;
    # apart from the pattern fix the resulting text is unchanged.
    query = (
        " SELECT "
        "s.app_build_id, "
        "s.app_version, "
        "s.app_display_version, "
        "s.app_name, "
        "s.app_channel, "
        "s.uid, "
        "s.device_id AS device_id, "
        "s.submission_date_s3 AS submission_day, "
        "date_format(from_unixtime(s.when / 1000), 'yyyyMMdd') AS sync_day, "
        "s.when, "
        "s.status, "
        "e.name AS engine_name, "
        "e.status AS engine_status, "
        "e.failure_reason AS engine_failure_reason, "
        "e.validation.problems IS NOT NULL AS engine_has_problems, "
        "e.validation.version AS engine_validation_version, "
        "e.validation.checked AS engine_validation_checked, "
        "e.validation.took AS engine_validation_took, "
        "p.name AS engine_validation_problem_name, "
        "p.count AS engine_validation_problem_count "
        "FROM sync_summary s "
        "LATERAL VIEW explode(s.engines) AS e "
        "LATERAL VIEW OUTER explode(e.validation.problems) AS p "
        "WHERE s.failure_reason IS NULL "
    )
    engine_validations = spark.sql(query)

    bookmark_validations = engine_validations.where(
        F.col("engine_name").isin("bookmarks", "bookmarks-buffered")
    )

    bookmark_validation_problems = bookmark_validations.where(
        F.col("engine_has_problems")
    )

    # Generate aggregates over all bookmarks
    bookmark_aggregates = (
        bookmark_validations.where(F.col("engine_validation_checked").isNotNull())
        # see bug 1410963 for submission date vs sync date
        .groupBy("submission_day")
        .agg(
            F.countDistinct("uid", "device_id", "when").alias(
                "total_bookmark_validations"
            ),
            F.countDistinct("uid").alias("total_validated_users"),
            F.sum("engine_validation_checked").alias("total_bookmarks_checked"),
        )
    )

    bookmark_validation_problems.createOrReplaceTempView("bmk_validation_problems")
    bookmark_aggregates.createOrReplaceTempView("bmk_total_per_day")
Example #11
Source File: addon_aggregates.py From python_mozetl with MIT License | 4 votes |
def aggregate_addons(df):
    """
    Aggregate add-on indicators by client, channel, version and locale.

    The result is a DataFrame with these additional aggregate columns for
    each combination of the facets above:

        n_self_installed_addons (int)
        n_shield_addons (int)
        n_foreign_installed_addons (int)
        n_system_addons (int)
        n_web_extensions (int)
        first_addon_install_date (str %Y%m%d)
        profile_creation_date (str %Y%m%d)

    :param df: an exploded instance of main_summary by active_addons
               with various additional indicator columns
    :return SparkDF: an aggregated dataset with each of the above columns
    """
    # (indicator column, output column) pairs that are simply summed.
    summed_indicators = (
        ("is_self_install", "n_self_installed_addons"),
        ("is_shield_addon", "n_shield_addons"),
        ("is_foreign_install", "n_foreign_installed_addons"),
        ("is_system", "n_system_addons"),
        ("is_web_extension", "n_web_extensions"),
    )
    indicator_sums = [
        fun.sum(source).alias(target) for source, target in summed_indicators
    ]

    # install_day is multiplied up to seconds before being formatted as a date.
    install_date = fun.date_format(
        fun.from_unixtime(fun.col("install_day") * 60 * 60 * 24),
        "yyyyMMdd",
    )
    # Only self-installed add-ons contribute to the earliest install date.
    first_install = fun.min(
        fun.when(df.is_self_install == 1, install_date).otherwise(None)
    ).alias("first_addon_install_date")

    profile_created = fun.date_format(
        fun.from_unixtime(fun.min("profile_creation_date") * 60 * 60 * 24),
        "yyyyMMdd",
    ).alias("profile_creation_date")

    grouped = df.distinct().groupBy(
        "client_id", "normalized_channel", "app_version", "locale"
    )
    addon_aggregates = grouped.agg(
        *(indicator_sums + [first_install, profile_created])
    )
    return addon_aggregates
Example #12
Source File: groupby.py From koalas with Apache License 2.0 | 4 votes |
def normalize_keyword_aggregation(kwargs):
    """
    Normalize user-provided kwargs.

    Converts the new ``Dict[str, NamedAgg]`` style kwargs into the old
    ``OrderedDict[str, List[scalar]]`` form.

    Parameters
    ----------
    kwargs : dict
        Maps each output name to an ``(input column, aggregation)`` pair.

    Returns
    -------
    aggspec : dict
        The transformed kwargs.
    columns : List[str]
        The user-provided keys.
    order : List[Tuple[str, str]]
        Pairs of the input and output column names.

    Examples
    --------
    >>> normalize_keyword_aggregation({'output': ('input', 'sum')})
    (OrderedDict([('input', ['sum'])]), ('output',), [('input', 'sum')])
    """
    # this is due to python version issue, not sure the impact on koalas
    if sys.version_info < (3, 6):
        kwargs = OrderedDict(sorted(kwargs.items()))

    # Split into output names and (input column, aggfunc) pairs; unpacking
    # raises ValueError when kwargs is empty.
    columns, pairs = zip(*kwargs.items())

    # TODO(Py35): When we drop python 3.5, change this to defaultdict(list)
    aggspec = OrderedDict()
    for column, aggfunc in pairs:
        aggspec.setdefault(column, []).append(aggfunc)
    order = [(column, aggfunc) for column, aggfunc in pairs]

    # For MultiIndex, we need to flatten the tuple, e.g. (('y', 'A'), 'max') needs to be
    # flattened to ('y', 'A', 'max'), it won't do anything on normal Index.
    first_column = order[0][0]
    if isinstance(first_column, tuple):
        order = [(*levs, method) for levs, method in order]
    return aggspec, columns, order
Example #13
Source File: groupby.py From koalas with Apache License 2.0 | 4 votes |
def cumsum(self):
    """
    Cumulative sum for each group.

    Returns
    -------
    Series or DataFrame

    See Also
    --------
    Series.cumsum
    DataFrame.cumsum

    Examples
    --------
    >>> df = ks.DataFrame(
    ...     [[1, None, 4], [1, 0.1, 3], [1, 20.0, 2], [4, 10.0, 1]],
    ...     columns=list('ABC'))
    >>> df
       A     B  C
    0  1   NaN  4
    1  1   0.1  3
    2  1  20.0  2
    3  4  10.0  1

    By default, iterates over rows and finds the sum in each column.

    >>> df.groupby("A").cumsum().sort_index()
          B  C
    0   NaN  4
    1   0.1  7
    2  20.1  9
    3  10.0  1

    It works as below in Series.

    >>> df.B.groupby(df.A).cumsum().sort_index()
    0     NaN
    1     0.1
    2    20.1
    3    10.0
    Name: B, dtype: float64
    """

    def grouped_cumsum(sg):
        # Partition the cumulative sum by the group-key columns.
        return sg._kser._cum(F.sum, True, part_cols=sg._groupkeys_scols)

    return self._apply_series_op(grouped_cumsum, should_resolve=True)
Example #14
Source File: groupby.py From koalas with Apache License 2.0 | 4 votes |
def cumprod(self):
    """
    Cumulative product for each group.

    Returns
    -------
    Series or DataFrame

    See Also
    --------
    Series.cumprod
    DataFrame.cumprod

    Examples
    --------
    >>> df = ks.DataFrame(
    ...     [[1, None, 4], [1, 0.1, 3], [1, 20.0, 2], [4, 10.0, 1]],
    ...     columns=list('ABC'))
    >>> df
       A     B  C
    0  1   NaN  4
    1  1   0.1  3
    2  1  20.0  2
    3  4  10.0  1

    By default, iterates over rows and finds the product in each column.

    >>> df.groupby("A").cumprod().sort_index()
          B     C
    0   NaN   4.0
    1   0.1  12.0
    2   2.0  24.0
    3  10.0   1.0

    It works as below in Series.

    >>> df.B.groupby(df.A).cumprod().sort_index()
    0     NaN
    1     0.1
    2     2.0
    3    10.0
    Name: B, dtype: float64
    """

    def grouped_cumprod(sg):
        # Partition the cumulative product by the group-key columns.
        return sg._kser._cumprod(True, part_cols=sg._groupkeys_scols)

    return self._apply_series_op(grouped_cumprod, should_resolve=True)
Example #15
Source File: groupby.py From koalas with Apache License 2.0 | 4 votes |
def cummin(self):
    """
    Cumulative min for each group.

    Returns
    -------
    Series or DataFrame

    See Also
    --------
    Series.cummin
    DataFrame.cummin

    Examples
    --------
    >>> df = ks.DataFrame(
    ...     [[1, None, 4], [1, 0.1, 3], [1, 20.0, 2], [4, 10.0, 1]],
    ...     columns=list('ABC'))
    >>> df
       A     B  C
    0  1   NaN  4
    1  1   0.1  3
    2  1  20.0  2
    3  4  10.0  1

    By default, iterates over rows and finds the minimum in each column.

    >>> df.groupby("A").cummin().sort_index()
          B  C
    0   NaN  4
    1   0.1  3
    2   0.1  2
    3  10.0  1

    It works as below in Series.

    >>> df.B.groupby(df.A).cummin().sort_index()
    0     NaN
    1     0.1
    2     0.1
    3    10.0
    Name: B, dtype: float64
    """

    def grouped_cummin(sg):
        # Partition the cumulative minimum by the group-key columns.
        return sg._kser._cum(F.min, True, part_cols=sg._groupkeys_scols)

    return self._apply_series_op(grouped_cummin, should_resolve=True)
Example #16
Source File: groupby.py From koalas with Apache License 2.0 | 4 votes |
def cummax(self):
    """
    Cumulative max for each group.

    Returns
    -------
    Series or DataFrame

    See Also
    --------
    Series.cummax
    DataFrame.cummax

    Examples
    --------
    >>> df = ks.DataFrame(
    ...     [[1, None, 4], [1, 0.1, 3], [1, 20.0, 2], [4, 10.0, 1]],
    ...     columns=list('ABC'))
    >>> df
       A     B  C
    0  1   NaN  4
    1  1   0.1  3
    2  1  20.0  2
    3  4  10.0  1

    By default, iterates over rows and finds the maximum in each column.

    >>> df.groupby("A").cummax().sort_index()
          B  C
    0   NaN  4
    1   0.1  4
    2  20.0  4
    3  10.0  1

    It works as below in Series.

    >>> df.C.groupby(df.A).cummax().sort_index()
    0    4
    1    4
    2    4
    3    1
    Name: C, dtype: int64
    """

    def grouped_cummax(sg):
        # Partition the cumulative maximum by the group-key columns.
        return sg._kser._cum(F.max, True, part_cols=sg._groupkeys_scols)

    return self._apply_series_op(grouped_cummax, should_resolve=True)
Example #17
Source File: generic.py From koalas with Apache License 2.0 | 4 votes |
def sum(self, axis=None, numeric_only=True):
    """
    Return the sum of the values.

    Parameters
    ----------
    axis : {index (0), columns (1)}
        Axis for the function to be applied on.
    numeric_only : bool, default True
        Include only float, int, boolean columns. False is not supported. This parameter
        is mainly for pandas compatibility.

    Returns
    -------
    sum : scalar for a Series, and a Series for a DataFrame.

    Examples
    --------

    >>> df = ks.DataFrame({'a': [1, 2, 3, np.nan], 'b': [0.1, 0.2, 0.3, np.nan]},
    ...                   columns=['a', 'b'])

    On a DataFrame:

    >>> df.sum()
    a    6.0
    b    0.6
    Name: 0, dtype: float64

    >>> df.sum(axis=1)
    0    1.1
    1    2.2
    2    3.3
    3    0.0
    Name: 0, dtype: float64

    On a Series:

    >>> df['a'].sum()
    6.0
    """
    # The reduction itself is done by the shared stat-function helper, with
    # Spark's ``F.sum`` as the aggregate.
    return self._reduce_for_stat_function(
        F.sum, name="sum", numeric_only=numeric_only, axis=axis
    )
Example #18
Source File: window.py From koalas with Apache License 2.0 | 4 votes |
def sum(self):
    """
    Calculate expanding summation of given DataFrame or Series.

    Returns
    -------
    Series or DataFrame
        Same type as the input, with the same index, containing the
        expanding summation.

    See Also
    --------
    Series.expanding : Calling object with Series data.
    DataFrame.expanding : Calling object with DataFrames.
    Series.sum : Reducing sum for Series.
    DataFrame.sum : Reducing sum for DataFrame.

    Examples
    --------
    >>> s = ks.Series([2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5])
    >>> s.groupby(s).expanding(3).sum().sort_index()  # doctest: +NORMALIZE_WHITESPACE
    0
    2  0      NaN
       1      NaN
    3  2      NaN
       3      NaN
       4      9.0
    4  5      NaN
       6      NaN
       7     12.0
       8     16.0
    5  9      NaN
       10     NaN
    Name: 0, dtype: float64

    For DataFrame, each expanding summation is computed column-wise.

    >>> df = ks.DataFrame({"A": s.to_numpy(), "B": s.to_numpy() ** 2})
    >>> df.groupby(df.A).expanding(2).sum().sort_index()  # doctest: +NORMALIZE_WHITESPACE
             A     B
    A
    2 0    NaN   NaN
      1    4.0   8.0
    3 2    NaN   NaN
      3    6.0  18.0
      4    9.0  27.0
    4 5    NaN   NaN
      6    8.0  32.0
      7   12.0  48.0
      8   16.0  64.0
    5 9    NaN   NaN
      10  10.0  50.0
    """
    # The computation is inherited; this override only supplies the
    # groupby-specific documentation and examples.
    return super(ExpandingGroupby, self).sum()
Example #19
Source File: window.py From koalas with Apache License 2.0 | 4 votes |
def sum(self):
    """
    Calculate expanding summation of given DataFrame or Series.

    .. note:: the current implementation of this API uses Spark's Window without
        specifying partition specification. This leads to move all data into
        single partition in single machine and could cause serious
        performance degradation. Avoid this method against very large dataset.

    Returns
    -------
    Series or DataFrame
        Same type as the input, with the same index, containing the
        expanding summation.

    See Also
    --------
    Series.expanding : Calling object with Series data.
    DataFrame.expanding : Calling object with DataFrames.
    Series.sum : Reducing sum for Series.
    DataFrame.sum : Reducing sum for DataFrame.

    Examples
    --------
    >>> s = ks.Series([1, 2, 3, 4, 5])
    >>> s
    0    1
    1    2
    2    3
    3    4
    4    5
    Name: 0, dtype: int64

    >>> s.expanding(3).sum()
    0     NaN
    1     NaN
    2     6.0
    3    10.0
    4    15.0
    Name: 0, dtype: float64

    For DataFrame, each expanding summation is computed column-wise.

    >>> df = ks.DataFrame({"A": s.to_numpy(), "B": s.to_numpy() ** 2})
    >>> df
       A   B
    0  1   1
    1  2   4
    2  3   9
    3  4  16
    4  5  25

    >>> df.expanding(3).sum()
          A     B
    0   NaN   NaN
    1   NaN   NaN
    2   6.0  14.0
    3  10.0  30.0
    4  15.0  55.0
    """
    # The computation is inherited; this override only supplies the
    # expanding-specific documentation and examples.
    return super(Expanding, self).sum()
Example #20
Source File: window.py From koalas with Apache License 2.0 | 4 votes |
def sum(self):
    """
    The rolling summation of any non-NaN observations inside the window.

    Returns
    -------
    Series or DataFrame
        Returned object type is determined by the caller of the rolling
        calculation.

    See Also
    --------
    Series.rolling : Calling object with Series data.
    DataFrame.rolling : Calling object with DataFrames.
    Series.sum : Sum of the full Series.
    DataFrame.sum : Sum of the full DataFrame.

    Examples
    --------
    >>> s = ks.Series([2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5])
    >>> s.groupby(s).rolling(3).sum().sort_index()  # doctest: +NORMALIZE_WHITESPACE
    0
    2  0      NaN
       1      NaN
    3  2      NaN
       3      NaN
       4      9.0
    4  5      NaN
       6      NaN
       7     12.0
       8     12.0
    5  9      NaN
       10     NaN
    Name: 0, dtype: float64

    For DataFrame, each rolling summation is computed column-wise.

    >>> df = ks.DataFrame({"A": s.to_numpy(), "B": s.to_numpy() ** 2})
    >>> df.groupby(df.A).rolling(2).sum().sort_index()  # doctest: +NORMALIZE_WHITESPACE
             A     B
    A
    2 0    NaN   NaN
      1    4.0   8.0
    3 2    NaN   NaN
      3    6.0  18.0
      4    6.0  18.0
    4 5    NaN   NaN
      6    8.0  32.0
      7    8.0  32.0
      8    8.0  32.0
    5 9    NaN   NaN
      10  10.0  50.0
    """
    # The computation is inherited; this override only supplies the
    # groupby-specific documentation and examples.
    return super(RollingGroupby, self).sum()