Python pyspark.sql.functions.max() Examples

The following are 30 code examples of pyspark.sql.functions.max(), drawn from open-source projects. The source file, project, and license for each example are listed above it. You may also want to look at the other functions and classes available in the pyspark.sql.functions module.
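Before the project examples, here is a minimal self-contained sketch of the two most common uses of pyspark.sql.functions.max(): a global aggregate and a grouped aggregate. The SparkSession setup and the key/value column names are illustrative and not taken from any project below.

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.master("local[*]").getOrCreate()
df = spark.createDataFrame([("a", 1), ("a", 3), ("b", 2)], ["key", "value"])

# Global maximum: a one-row DataFrame with a single aggregated column.
df.select(F.max("value").alias("max_value")).show()

# Per-group maximum via groupBy().agg().
df.groupBy("key").agg(F.max("value").alias("max_value")).show()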
Example #1
Source File: compiler.py    From ibis with Apache License 2.0
def compile_aggregator(t, expr, scope, fn, context=None, **kwargs):
    op = expr.op()
    src_col = t.translate(op.arg, scope)

    if getattr(op, 'where', None) is not None:
        condition = t.translate(op.where, scope)
        src_col = F.when(condition, src_col)

    col = fn(src_col)
    if context is None:
        # We are trying to compile an expr such as some_col.max()
        # to a Spark expression.
        # Here we get the root table df of that column and compile
        # the expr to:
        # df.select(max(some_col))
        return t.translate(expr.op().arg.op().table, scope).select(col)
    elif context == AggregationContext.WINDOW:
        window = kwargs['window']
        return col.over(window)
    else:
        return col 
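The filtered-aggregate trick above, wrapping the source column in F.when before aggregating, can be written directly in PySpark as a rough sketch; the DataFrame, flag condition, and column names are illustrative and not part of the ibis compiler.

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, 10), (0, 99), (1, 7)], ["flag", "value"])

# F.when() without otherwise() yields NULL for non-matching rows, and F.max
# ignores NULLs, so this behaves like max(value) restricted to flag == 1.
df.select(F.max(F.when(df["flag"] == 1, df["value"])).alias("max_flagged")).show()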
Example #2
Source File: dataframe.py    From sparklingpandas with Apache License 2.0
def stats(self, columns):
        """Compute the stats for each column provided in columns.
        Parameters
        ----------
        columns : list of str, contains all columns to compute stats on.
        """
        assert (not isinstance(columns, basestring)), "columns should be a " \
                                                      "list of strs,  " \
                                                      "not a str!"
        assert isinstance(columns, list), "columns should be a list!"

        from pyspark.sql import functions as F
        functions = [F.min, F.max, F.avg, F.count]
        aggs = list(
            self._flatmap(lambda column: map(lambda f: f(column), functions),
                          columns))
        return PStats(self.from_schema_rdd(self._schema_rdd.agg(*aggs))) 
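The same idea, building a flat list of aggregate expressions and unpacking it into a single agg() call, can be sketched in plain PySpark; the DataFrame and column names here are illustrative.

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, 2.0), (3, 4.0)], ["a", "b"])

columns = ["a", "b"]
functions = [F.min, F.max, F.avg, F.count]

# One aggregate expression per (function, column) pair, all computed in one pass.
aggs = [f(c) for c in columns for f in functions]
df.agg(*aggs).show()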
Example #3
Source File: pyspark_dist_explore.py    From pyspark_dist_explore with MIT License
def to_pandas(self, kind='hist'):
        """Returns a pandas dataframe from the Histogram object.

        This method triggers the histogram calculation in Spark if it has not been done yet.

        Args:
            :kind: (:obj:`str`, optional):
                'hist' or 'density'. When using 'hist' this returns the histogram as a
                pandas DataFrame. When using 'density' the index contains the bin centers
                and the DataFrame values are the scaled densities. Defaults to 'hist'.

        Returns:
            A pandas DataFrame from the Histogram object.
        """
        self.build()
        if kind == 'hist':
            return pd.DataFrame(self.hist_dict).set_index([self._get_col_names()])
        elif kind == 'density':
            result = pd.DataFrame(self.hist_dict).set_index([self._get_bin_centers()])
            return result.apply(lambda x: x / x.max(), axis=0) 
Example #4
Source File: groupby.py    From koalas with Apache License 2.0
def is_multi_agg_with_relabel(**kwargs):
    """
    Check whether the kwargs passed to .agg look like a multi-agg with relabeling.

    Parameters
    ----------
    **kwargs : dict

    Returns
    -------
    bool

    Examples
    --------
    >>> is_multi_agg_with_relabel(a='max')
    False
    >>> is_multi_agg_with_relabel(a_max=('a', 'max'),
    ...                            a_min=('a', 'min'))
    True
    >>> is_multi_agg_with_relabel()
    False
    """
    if not kwargs:
        return False
    return all(isinstance(v, tuple) and len(v) == 2 for v in kwargs.values()) 
Example #5
Source File: pyspark_dist_explore.py    From pyspark_dist_explore with MIT License
def to_pandas(self, kind='hist'):
        """Returns a pandas dataframe from the Histogram object.

        This method triggers the histogram calculation in Spark if it has not been done yet.

        Args:
            :kind: (:obj:`str`, optional):
                'hist' or 'density'. When using 'hist' this returns the histogram as a
                pandas DataFrame. When using 'density' the index contains the bin centers
                and the DataFrame values are the scaled densities. Defaults to 'hist'.

        Returns:
            A pandas DataFrame from the Histogram object.
        """
        self.build()
        if kind == 'hist':
            return pd.DataFrame(self.hist_dict).set_index([self._get_col_names()])
        elif kind == 'density':
            result = pd.DataFrame(self.hist_dict).set_index([self._get_bin_centers()])
            return result.apply(lambda x: x / x.max(), axis=0) 
Example #6
Source File: candidate_sets.py    From listenbrainz-server with GNU General Public License v2.0
def get_dates_to_generate_candidate_sets(mapped_df, recommendation_generation_window):
    """ Get window to fetch listens to generate candidate sets.

        Args:
            mapped_df (dataframe): listens mapped with msid_mbid_mapping. Refer to candidate_sets.py
                                   for dataframe columns.
            recommendation_generation_window (int): number of days of listening history on which recommendations are generated.

        Returns:
            from_date (datetime): Date from which to start fetching listens.
            to_date (datetime): Date up to which to fetch listens.
    """
    # get timestamp of latest listen in HDFS
    to_date = mapped_df.select(func.max('listened_at').alias('listened_at')).collect()[0].listened_at
    from_date = stats.adjust_days(to_date, recommendation_generation_window).replace(hour=0, minute=0, second=0)
    return from_date, to_date 
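The first line's pattern, pulling a single scalar (here the latest timestamp) out of an aggregate with collect(), can be sketched on a toy DataFrame as below; stats.adjust_days is project-specific, so a plain timedelta stands in for it here.

from datetime import datetime, timedelta
from pyspark.sql import SparkSession
from pyspark.sql import functions as func

spark = SparkSession.builder.getOrCreate()
listens = spark.createDataFrame(
    [(datetime(2020, 1, 1),), (datetime(2020, 1, 15),)], ["listened_at"]
)

# collect() on the single-row aggregate returns [Row(listened_at=...)];
# attribute access on that Row yields a Python datetime.
to_date = listens.select(func.max("listened_at").alias("listened_at")).collect()[0].listened_at

# e.g. a 7-day window, standing in for what adjust_days computes in the project.
from_date = (to_date - timedelta(days=7)).replace(hour=0, minute=0, second=0)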
Example #7
Source File: recommend.py    From listenbrainz-server with GNU General Public License v2.0
def get_most_recent_model_id():
    """ Get model id of recently created model.

        Returns:
            model_id (str): Model identification string.
    """
    try:
        model_metadata = utils.read_files_from_HDFS(path.MODEL_METADATA)
    except PathNotFoundException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)
    except FileNotFetchedException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)

    latest_ts = model_metadata.select(func.max('model_created').alias('model_created')).take(1)[0].model_created
    model_id = model_metadata.select('model_id') \
                             .where(col('model_created') == latest_ts).take(1)[0].model_id

    return model_id 
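As a design note, the same lookup can be done in a single pass by ordering on model_created and taking the first row; a rough sketch with an illustrative toy DataFrame follows.

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
model_metadata = spark.createDataFrame(
    [("model-a", 1), ("model-b", 2)], ["model_id", "model_created"]
)

# Order by creation time descending and take the top row instead of
# computing the max timestamp first and filtering on it afterwards.
model_id = (
    model_metadata.orderBy(F.col("model_created").desc())
    .select("model_id")
    .take(1)[0]
    .model_id
)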
Example #8
Source File: train_models.py    From listenbrainz-server with GNU General Public License v2.0
def get_latest_dataframe_id(dataframe_metadata_df):
    """ Get dataframe id of dataframe on which model has been trained.

        Args:
            dataframe_metadata_df (dataframe): Refer to listenbrainz_spark.schema.dataframe_metadata_schema

        Returns:
            dataframe id
    """
    # get timestamp of recently saved dataframe.
    timestamp = dataframe_metadata_df.select(func.max('dataframe_created').alias('recent_dataframe_timestamp')).take(1)[0]
    # get dataframe id corresponding to most recent timestamp.
    df = dataframe_metadata_df.select('dataframe_id') \
                              .where(func.col('dataframe_created') == timestamp.recent_dataframe_timestamp).take(1)[0]

    return df.dataframe_id 
Example #9
Source File: groupby.py    From sparklingpandas with Apache License 2.0
def max(self):
        """Compute the max for each group."""
        if self._can_use_new_school():
            self._prep_spark_sql_groupby()
            import pyspark.sql.functions as func
            return self._use_aggregation(func.max)
        self._prep_pandas_groupby()
        myargs = self._myargs
        mykwargs = self._mykwargs

        def create_combiner(x):
            return x.groupby(*myargs, **mykwargs).max()

        def merge_value(x, y):
            return x.append(create_combiner(y)).max()

        def merge_combiner(x, y):
            return x.append(y).max(level=0)

        rddOfMax = self._sortIfNeeded(self._distributedRDD.combineByKey(
            create_combiner,
            merge_value,
            merge_combiner)).values()
        return DataFrame.fromDataFrameRDD(rddOfMax, self.sql_ctx) 
Example #10
Source File: compiler.py    From ibis with Apache License 2.0
def compile_notany(t, expr, scope, *, context=None, window=None, **kwargs):
    # The code here is a little ugly because the translations are different
    # in different contexts.
    # When translating col.notany() (context is None), we return the dataframe,
    # so we need to negate the aggregator, i.e., df.select(~F.max(col)).
    # When translating col.notany().over(w), we need to negate the result
    # after the window translation, i.e., ~(F.max(col).over(w)).

    if context is None:

        def fn(col):
            return ~(F.max(col))

        return compile_aggregator(t, expr, scope, fn, context, **kwargs)
    else:
        return ~compile_any(
            t, expr, scope, context=context, window=window, **kwargs
        ) 
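Because Spark orders booleans with False < True, F.max over a boolean column behaves like any(), and negating it gives notany(); a small illustrative sketch outside the ibis compiler:

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, False), (1, True), (2, False)], ["key", "flag"])

# max(flag) is True if any row in the group is True; ~ negates it, giving notany().
df.groupBy("key").agg((~F.max("flag")).alias("notany_flag")).show()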
Example #11
Source File: plot.py    From koalas with Apache License 2.0
def _get_ind(self, y):
        # 'y' is a Spark DataFrame that selects one column.
        if self.ind is None:
            min_val, max_val = y.select(F.min(y.columns[-1]), F.max(y.columns[-1])).first()

            sample_range = max_val - min_val
            ind = np.linspace(min_val - 0.5 * sample_range, max_val + 0.5 * sample_range, 1000,)
        elif is_integer(self.ind):
            min_val, max_val = y.select(F.min(y.columns[-1]), F.max(y.columns[-1])).first()

            sample_range = np.nanmax(y) - np.nanmin(y)
            ind = np.linspace(min_val - 0.5 * sample_range, max_val + 0.5 * sample_range, self.ind,)
        else:
            ind = self.ind
        return ind 
Example #12
Source File: dataframe.py    From sparklingpandas with Apache License 2.0
def max(self):
        return self.from_spark_rdd(self._schema_rdd.max(), self.sql_ctx) 
Example #13
Source File: plot.py    From koalas with Apache License 2.0
def _calc_whiskers(colname, outliers):
        # Computes min and max values of non-outliers - the whiskers
        minmax = (
            outliers.filter("not __{}_outlier".format(colname))
            .agg(F.min(colname).alias("min"), F.max(colname).alias("max"))
            .toPandas()
        )
        return minmax.iloc[0][["min", "max"]].values 
Example #14
Source File: series.py    From koalas with Apache License 2.0
def _reduce_for_stat_function(self, sfun, name, axis=None, numeric_only=None):
        """
        Applies sfun to the column and returns a scalar

        Parameters
        ----------
        sfun : the stats function to be used for aggregation
        name : original pandas API name.
        axis : used only for sanity check because Series only supports the index axis.
        numeric_only : not used by this implementation, but passed down by stats functions
        """
        from inspect import signature

        axis = validate_axis(axis)
        if axis == 1:
            raise ValueError("Series does not support columns axis.")
        num_args = len(signature(sfun).parameters)
        scol = self.spark.column
        spark_type = self.spark.data_type
        if isinstance(spark_type, BooleanType) and sfun.__name__ not in ("min", "max"):
            # Stat functions cannot be used with boolean values by default
            # Thus, cast to integer (true to 1 and false to 0)
            # Exclude the min and max methods though since those work with booleans
            scol = scol.cast("integer")
        if num_args == 1:
            # Only pass in the column if sfun accepts only one arg
            scol = sfun(scol)
        else:  # must be 2
            assert num_args == 2
            # Pass in both the column and its data type if sfun accepts two args
            scol = sfun(scol, spark_type)
        return unpack_scalar(self._internal.spark_frame.select(scol)) 
Example #15
Source File: fields.py    From python_mozetl with MIT License
def agg_max(field_name, alias=None):
    field_alias = get_alias(field_name, alias, "max")
    return F.max(field_name).alias(field_alias) 
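Used inside a grouped aggregation, this helper expands to roughly the following; the DataFrame, column names, and the exact alias (which depends on get_alias) are illustrative.

import pyspark.sql.functions as F
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
clients_daily = spark.createDataFrame(
    [("c1", 2.5), ("c1", 4.0), ("c2", 1.0)], ["client_id", "active_hours"]
)

# agg_max("active_hours") is essentially F.max("active_hours") with a derived alias.
clients_daily.groupBy("client_id").agg(
    F.max("active_hours").alias("max_active_hours")
).show()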
Example #16
Source File: pyspark_dist_explore.py    From pyspark_dist_explore with MIT License
def pandas_histogram(x, bins=10, range=None):
    """Returns a pandas DataFrame with histograms of the Spark DataFrame

    Bin ranges are formatted as text and put on the Index.

    Args:
        :x: (`DataFrame` or `list` of `DataFrame`)
            A DataFrame with one or more numerical columns, or a list of single numerical column DataFrames
        :bins: (`integer` or `array_like`, optional)
            If an integer is given, bins + 1 bin edges are returned, consistently with numpy.histogram() for
            numpy version >= 1.3.

            Unequally spaced bins are supported if bins is a sequence.

            Default is 10
        :range: (tuple or None, optional)
            The lower and upper range of the bins. Lower and upper outliers are ignored.
            If not provided, range is (x.min(), x.max()). Range has no effect if bins is a sequence.

            If bins is a sequence or range is specified, autoscaling is based on the specified bin range instead
            of the range of x.

            Default is None
    """
    histogram = Histogram(bins=bins, range=range)
    histogram.add_data(x)
    return histogram.to_pandas() 
Example #17
Source File: pyspark_dist_explore.py    From pyspark_dist_explore with MIT License
def _get_max_value(self):
        if self.max_value is not None:
            return self.max_value
        return max([table.select(F.max(F.col(col_name))).collect()[0][0]
                    for table, col_name in self.col_list]) 
Example #18
Source File: compiler.py    From ibis with Apache License 2.0
def compile_any(t, expr, scope, context=None, **kwargs):
    return compile_aggregator(t, expr, scope, F.max, context, **kwargs) 
Example #19
Source File: groupby.py    From koalas with Apache License 2.0
def max(self):
        """
        Compute max of group values.

        See Also
        --------
        databricks.koalas.Series.groupby
        databricks.koalas.DataFrame.groupby
        """
        return self._reduce_for_stat_function(F.max, only_numeric=False)

    # TODO: examples should be updated. 
Example #20
Source File: indexes.py    From koalas with Apache License 2.0
def max(self):
        """
        Return the maximum value of the Index.

        Returns
        -------
        scalar
            Maximum value.

        See Also
        --------
        Index.min : Return the minimum value in an Index.
        Series.max : Return the maximum value in a Series.
        DataFrame.max : Return the maximum values in a DataFrame.

        Examples
        --------
        >>> idx = pd.Index([3, 2, 1])
        >>> idx.max()
        3

        >>> idx = pd.Index(['c', 'b', 'a'])
        >>> idx.max()
        'c'

        For a MultiIndex, the maximum is determined lexicographically.

        >>> idx = ks.MultiIndex.from_tuples([('a', 'x', 1), ('b', 'y', 2)])
        >>> idx.max()
        ('b', 'y', 2)
        """
        sdf = self._internal.spark_frame
        max_row = sdf.select(F.max(F.struct(self._internal.index_spark_columns))).head()
        result = tuple(max_row[0])

        return result if len(result) > 1 else result[0] 
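The F.max(F.struct(...)) call above is what makes the MultiIndex maximum lexicographic: Spark compares struct values field by field, left to right. A minimal sketch outside koalas, with illustrative column names:

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([("a", "x", 1), ("b", "y", 2)], ["l0", "l1", "l2"])

# max over a struct compares fields left to right, i.e. lexicographically.
max_row = sdf.select(F.max(F.struct("l0", "l1", "l2"))).head()
print(tuple(max_row[0]))  # ('b', 'y', 2)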
Example #21
Source File: indexes.py    From koalas with Apache License 2.0
def min(self):
        """
        Return the minimum value of the Index.

        Returns
        -------
        scalar
            Minimum value.

        See Also
        --------
        Index.max : Return the maximum value of the object.
        Series.min : Return the minimum value in a Series.
        DataFrame.min : Return the minimum values in a DataFrame.

        Examples
        --------
        >>> idx = ks.Index([3, 2, 1])
        >>> idx.min()
        1

        >>> idx = ks.Index(['c', 'b', 'a'])
        >>> idx.min()
        'a'

        For a MultiIndex, the minimum is determined lexicographically.

        >>> idx = ks.MultiIndex.from_tuples([('a', 'x', 1), ('b', 'y', 2)])
        >>> idx.min()
        ('a', 'x', 1)
        """
        sdf = self._internal.spark_frame
        min_row = sdf.select(F.min(F.struct(self._internal.index_spark_columns))).head()
        result = tuple(min_row[0])

        return result if len(result) > 1 else result[0] 
Example #22
Source File: pyspark_dist_explore.py    From pyspark_dist_explore with MIT License
def pandas_histogram(x, bins=10, range=None):
    """Returns a pandas DataFrame with histograms of the Spark DataFrame

    Bin ranges are formatted as text and put on the Index.

    Args:
        :x: (`DataFrame` or `list` of `DataFrame`)
            A DataFrame with one or more numerical columns, or a list of single numerical column DataFrames
        :bins: (`integer` or `array_like`, optional)
            If an integer is given, bins + 1 bin edges are returned, consistently with numpy.histogram() for
            numpy version >= 1.3.

            Unequally spaced bins are supported if bins is a sequence.

            Default is 10
        :range: (tuple or None, optional)
            The lower and upper range of the bins. Lower and upper outliers are ignored.
            If not provided, range is (x.min(), x.max()). Range has no effect if bins is a sequence.

            If bins is a sequence or range is specified, autoscaling is based on the specified bin range instead
            of the range of x.

            Default is None
    """
    histogram = Histogram(bins=bins, range=range)
    histogram.add_data(x)
    return histogram.to_pandas() 
Example #23
Source File: window.py    From koalas with Apache License 2.0
def max(self):
        """
        Calculate the expanding maximum.

        .. note:: the current implementation of this API uses Spark's Window without
            specifying a partition specification. This moves all data into a single
            partition on a single machine and could cause serious performance
            degradation. Avoid this method against very large datasets.

        Returns
        -------
        Series or DataFrame
            Return type is determined by the caller.

        See Also
        --------
        Series.expanding : Calling object with Series data.
        DataFrame.expanding : Calling object with DataFrames.
        Series.max : Similar method for Series.
        DataFrame.max : Similar method for DataFrame.

        Examples
        --------
        Performing an expanding maximum with a window size of 3.

        >>> s = ks.Series([4, 3, 5, 2, 6])
        >>> s.expanding(3).max()
        0    NaN
        1    NaN
        2    5.0
        3    5.0
        4    6.0
        Name: 0, dtype: float64
        """
        return super(Expanding, self).max() 
Example #24
Source File: window.py    From koalas with Apache License 2.0
def max(self):
        def max(scol):
            return F.when(
                F.row_number().over(self._unbounded_window) >= self._min_periods,
                F.max(scol).over(self._window),
            ).otherwise(F.lit(None))

        return self._apply_as_series_or_frame(max) 
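Outside koalas, the same rolling-maximum-with-min_periods logic can be sketched directly with Spark window functions; the window bounds, ordering column, and min_periods value below are illustrative.

from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(list(enumerate([4, 3, 5, 2, 6])), ["idx", "value"])

min_periods = 3
# Rolling window over the current row and the two preceding rows.
w = Window.orderBy("idx").rowsBetween(-2, Window.currentRow)
# Unbounded window used only to count how many rows precede the current one.
w_unbounded = Window.orderBy("idx").rowsBetween(Window.unboundedPreceding, Window.currentRow)

df.select(
    "idx",
    F.when(
        F.row_number().over(w_unbounded) >= min_periods,
        F.max("value").over(w),
    ).otherwise(F.lit(None)).alias("rolling_max"),
).show()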
Example #25
Source File: solids.py    From dagster with Apache License 2.0
def make_daily_temperature_highs(_, weather_samples: DataFrame) -> DataFrame:
    '''Computes the temperature high for each day'''
    valid_date = f.to_date(weather_samples['valid']).alias('valid_date')
    return weather_samples.groupBy(valid_date).agg(f.max('tmpf').alias('max_tmpf')) 
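A minimal self-contained run of the same groupBy/agg pattern; the SparkSession setup and sample rows are illustrative and not part of the dagster pipeline.

from pyspark.sql import SparkSession
import pyspark.sql.functions as f

spark = SparkSession.builder.getOrCreate()
weather_samples = spark.createDataFrame(
    [("2019-01-01 03:00:00", 21.0), ("2019-01-01 15:00:00", 35.5), ("2019-01-02 12:00:00", 30.0)],
    ["valid", "tmpf"],
)

# to_date() truncates the timestamp string to a date, giving one group per day.
valid_date = f.to_date(weather_samples["valid"]).alias("valid_date")
weather_samples.groupBy(valid_date).agg(f.max("tmpf").alias("max_tmpf")).show()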
Example #26
Source File: test_basic.py    From ibis with Apache License 2.0
def test_aggregation(client):
    import pyspark.sql.functions as F

    table = client.table('basic_table')
    result = table.aggregate(table['id'].max()).compile()
    expected = table.compile().agg(F.max('id').alias('max'))

    tm.assert_frame_equal(result.toPandas(), expected.toPandas()) 
Example #27
Source File: compiler.py    From ibis with Apache License 2.0
def compile_max(t, expr, scope, context=None, **kwargs):
    return compile_aggregator(t, expr, scope, F.max, context, **kwargs) 
Example #28
Source File: pyspark_dist_explore.py    From pyspark_dist_explore with MIT License
def plot_density(self, ax, num=300, **kwargs):
        """Returns a density plot on an Pyplot Axes object.

        Args:
            :ax: (`Axes`)
                A matplotlib Axes object on which the density plot will be drawn
            :num: (`int`)
                The number of x values the line is plotted on. Default: 300
            :**kwargs:
                Keyword arguments that are passed on to the pyplot.plot function.
        """
        colors = []

        self.build()
        bin_centers = np.asarray(self._get_bin_centers())
        x_new = np.linspace(bin_centers.min(), bin_centers.max(), num)

        if 'color' in kwargs:
            colors = kwargs['color']
            del kwargs['color']

        power_smooth = []

        for (colname, bin_values) in self.hist_dict.items():
            normed_values, ble = np.histogram(self._get_bin_centers(),
                                              bins=self.bin_boundaries,
                                              weights=bin_values,
                                              density=True
                                              )
            interpolation_function = interp1d(bin_centers, normed_values, kind='quadratic')

            power_smooth.append(x_new)
            power_smooth.append(interpolation_function(x_new))

        lines = ax.plot(*power_smooth, **kwargs)

        for i, line in enumerate(lines):
            if len(colors) > 0:
                plt.setp(line, color=colors[i], label=list(self.hist_dict.keys())[i])
            else:
                plt.setp(line, label=list(self.hist_dict.keys())[i])

        return lines 
Example #29
Source File: pyspark_dist_explore.py    From pyspark_dist_explore with MIT License
def plot_density(self, ax, num=300, **kwargs):
        """Returns a density plot on an Pyplot Axes object.

        Args:
            :ax: (`Axes`)
                A matplotlib Axes object on which the density plot will be drawn
            :num: (`int`)
                The number of x values the line is plotted on. Default: 300
            :**kwargs:
                Keyword arguments that are passed on to the pyplot.plot function.
        """
        colors = []

        self.build()
        bin_centers = np.asarray(self._get_bin_centers())
        x_new = np.linspace(bin_centers.min(), bin_centers.max(), num)

        if 'color' in kwargs:
            colors = kwargs['color']
            del kwargs['color']

        power_smooth = []

        for (colname, bin_values) in self.hist_dict.items():
            normed_values, ble = np.histogram(self._get_bin_centers(),
                                              bins=self.bin_list,
                                              weights=bin_values,
                                              density=True
                                              )

            power_smooth.append(x_new)
            power_smooth.append(spline(bin_centers, normed_values, x_new))

        lines = ax.plot(*power_smooth, **kwargs)

        for i, line in enumerate(lines):
            if len(colors) > 0:
                plt.setp(line, color=colors[i], label=list(self.hist_dict.keys())[i])
            else:
                plt.setp(line, label=list(self.hist_dict.keys())[i])

        return lines 
Example #30
Source File: series.py    From koalas with Apache License 2.0
def _rank(self, method="average", ascending=True, part_cols=()):
        if method not in ["average", "min", "max", "first", "dense"]:
            msg = "method must be one of 'average', 'min', 'max', 'first', 'dense'"
            raise ValueError(msg)

        if len(self._internal.index_spark_column_names) > 1:
            raise ValueError("rank do not support index now")

        if ascending:
            asc_func = lambda scol: scol.asc()
        else:
            asc_func = lambda scol: scol.desc()

        if method == "first":
            window = (
                Window.orderBy(
                    asc_func(self.spark.column), asc_func(F.col(NATURAL_ORDER_COLUMN_NAME)),
                )
                .partitionBy(*part_cols)
                .rowsBetween(Window.unboundedPreceding, Window.currentRow)
            )
            scol = F.row_number().over(window)
        elif method == "dense":
            window = (
                Window.orderBy(asc_func(self.spark.column))
                .partitionBy(*part_cols)
                .rowsBetween(Window.unboundedPreceding, Window.currentRow)
            )
            scol = F.dense_rank().over(window)
        else:
            if method == "average":
                stat_func = F.mean
            elif method == "min":
                stat_func = F.min
            elif method == "max":
                stat_func = F.max
            window1 = (
                Window.orderBy(asc_func(self.spark.column))
                .partitionBy(*part_cols)
                .rowsBetween(Window.unboundedPreceding, Window.currentRow)
            )
            window2 = Window.partitionBy([self.spark.column] + list(part_cols)).rowsBetween(
                Window.unboundedPreceding, Window.unboundedFollowing
            )
            scol = stat_func(F.row_number().over(window1)).over(window2)
        kser = self._with_new_scol(scol).rename(self.name)
        return kser.astype(np.float64)