Python pyspark.sql.functions.min() Examples

The following are 30 code examples of pyspark.sql.functions.min(), drawn from open-source projects. The source file, project, and license are noted above each example. You may also want to check out the other functions and classes available in the pyspark.sql.functions module.
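Before the project examples, here is a minimal, self-contained sketch of the two most common ways to call pyspark.sql.functions.min(): as a whole-DataFrame aggregate via select(), and per group via groupBy().agg(). The SparkSession setup, the toy data, and the column names dept and salary are illustrative assumptions, not taken from any project below.

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()

# Hypothetical toy data; column names are illustrative only.
df = spark.createDataFrame(
    [("sales", 10), ("sales", 7), ("eng", 12)],
    ["dept", "salary"],
)

# Whole-DataFrame minimum of a single column.
df.select(F.min("salary").alias("min_salary")).show()

# Per-group minimum.
df.groupBy("dept").agg(F.min("salary").alias("min_salary")).show()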
Example #1
Source File: groupby.py    From sparklingpandas with Apache License 2.0
def min(self):
        """Compute the min for each group."""
        if self._can_use_new_school():
            self._prep_spark_sql_groupby()
            import pyspark.sql.functions as func
            return self._use_aggregation(func.min)
        self._prep_pandas_groupby()
        myargs = self._myargs
        mykwargs = self._mykwargs

        def create_combiner(x):
            return x.groupby(*myargs, **mykwargs).min()

        def merge_value(x, y):
            return x.append(create_combiner(y)).min()

        def merge_combiner(x, y):
            return x.append(y).min(level=0)

        rddOfMin = self._sortIfNeeded(self._distributedRDD.combineByKey(
            create_combiner,
            merge_value,
            merge_combiner)).values()
        return DataFrame.fromDataFrameRDD(rddOfMin, self.sql_ctx) 
Example #2
Source File: dataframe.py    From sparklingpandas with Apache License 2.0
def stats(self, columns):
        """Compute the stats for each column provided in columns.
        Parameters
        ----------
        columns : list of str, contains all columns to compute stats on.
        """
        assert (not isinstance(columns, basestring)), "columns should be a " \
                                                      "list of strs,  " \
                                                      "not a str!"
        assert isinstance(columns, list), "columns should be a list!"

        from pyspark.sql import functions as F
        functions = [F.min, F.max, F.avg, F.count]
        aggs = list(
            self._flatmap(lambda column: map(lambda f: f(column), functions),
                          columns))
        return PStats(self.from_schema_rdd(self._schema_rdd.agg(*aggs))) 
Example #3
Source File: plot.py    From koalas with Apache License 2.0
def get_sampled(self, data):
        from databricks.koalas import DataFrame, Series

        fraction = get_option("plotting.sample_ratio")
        if fraction is None:
            fraction = 1 / (len(data) / get_option("plotting.max_rows"))
            fraction = min(1.0, fraction)
        self.fraction = fraction

        if isinstance(data, (DataFrame, Series)):
            if isinstance(data, Series):
                data = data.to_frame()
            sampled = data._internal.resolved_copy.spark_frame.sample(fraction=self.fraction)
            return DataFrame(data._internal.with_new_sdf(sampled)).to_pandas()
        else:
            raise ValueError("Only DataFrame and Series are supported for plotting.") 
Example #4
Source File: groupby.py    From koalas with Apache License 2.0
def is_multi_agg_with_relabel(**kwargs):
    """
    Check whether the kwargs passed to .agg look like multi-agg with relabeling.

    Parameters
    ----------
    **kwargs : dict

    Returns
    -------
    bool

    Examples
    --------
    >>> is_multi_agg_with_relabel(a='max')
    False
    >>> is_multi_agg_with_relabel(a_max=('a', 'max'),
    ...                            a_min=('a', 'min'))
    True
    >>> is_multi_agg_with_relabel()
    False
    """
    if not kwargs:
        return False
    return all(isinstance(v, tuple) and len(v) == 2 for v in kwargs.values()) 
Example #5
Source File: compiler.py    From ibis with Apache License 2.0
def compile_notall(t, expr, scope, *, context=None, window=None, **kwargs):
    # See comments for ops.NotAny for the reasoning behind the if/else
    if context is None:

        def fn(col):
            return ~(F.min(col))

        return compile_aggregator(t, expr, scope, fn, context, **kwargs)
    else:
        return ~compile_all(
            t, expr, scope, context=context, window=window, **kwargs
        ) 
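A side note on Examples #5 and #15 (and the reasoning referenced in the comment above): over a boolean column, F.min() behaves like a logical ALL, because False sorts before True, so the minimum is True only when every value is True; negating it gives NOT ALL. A minimal sketch, assuming a hypothetical SparkSession and a made-up column name ok:

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()
flags = spark.createDataFrame([(True,), (True,), (False,)], ["ok"])

# F.min over a boolean column is True only if every value is True,
# which is why all()/notall() can be compiled onto F.min / ~F.min.
flags.select(
    F.min("ok").alias("all_ok"),
    (~F.min("ok")).alias("not_all_ok"),
).show()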
Example #6
Source File: pyspark_dist_explore.py    From pyspark_dist_explore with MIT License
def _get_min_value(self):
        if self.min_value is not None:
            return self.min_value
        return min([table.select(F.min(F.col(col_name))).collect()[0][0]
                    for table, col_name in self.col_list]) 
Example #7
Source File: pyspark_dist_explore.py    From pyspark_dist_explore with MIT License
def pandas_histogram(x, bins=10, range=None):
    """Returns a pandas DataFrame with histograms of the Spark DataFrame

    Bin ranges are formatted as text and put on the Index.

    Args:
        :x: (`DataFrame` or `list` of `DataFrame`)
            A DataFrame with one or more numerical columns, or a list of single numerical column DataFrames
        :bins: (`integer` or `array_like`, optional)
            If an integer is given, bins + 1 bin edges are returned, consistently with numpy.histogram() for
            numpy version >= 1.3.

            Unequally spaced bins are supported if bins is a sequence.

            Default is 10
        :range: (tuple or None, optional)
            The lower and upper range of the bins. Lower and upper outliers are ignored.
            If not provided, range is (x.min(), x.max()). Range has no effect if bins is a sequence.

            If bins is a sequence or range is specified, autoscaling is based on the specified bin range instead
            of the range of x.

            Default is None
    """
    histogram = Histogram(bins=bins, range=range)
    histogram.add_data(x)
    return histogram.to_pandas() 
Example #8
Source File: pyspark_dist_explore.py    From pyspark_dist_explore with MIT License
def _get_min_value(self):
        if self.min_value is not None:
            return self.min_value
        return min([table.select(F.min(F.col(col_name))).collect()[0][0]
                    for table, col_name in self.col_list]) 
Example #9
Source File: pyspark_dist_explore.py    From pyspark_dist_explore with MIT License
def pandas_histogram(x, bins=10, range=None):
    """Returns a pandas DataFrame with histograms of the Spark DataFrame

    Bin ranges are formatted as text and put on the Index.

    Args:
        :x: (`DataFrame` or `list` of `DataFrame`)
            A DataFrame with one or more numerical columns, or a list of single numerical column DataFrames
        :bins: (`integer` or `array_like`, optional)
            If an integer is given, bins + 1 bin edges are returned, consistently with numpy.histogram() for
            numpy version >= 1.3.

            Unequally spaced bins are supported if bins is a sequence.

            Default is 10
        :range: (tuple or None, optional)
            The lower and upper range of the bins. Lower and upper outliers are ignored.
            If not provided, range is (x.min(), x.max()). Range has no effect if bins is a sequence.

            If bins is a sequence or range is specified, autoscaling is based on the specified bin range instead
            of the range of x.

            Default is None
    """
    histogram = Histogram(bins=bins, range=range)
    histogram.add_data(x)
    return histogram.to_pandas() 
Example #10
Source File: dataframe.py    From sparklingpandas with Apache License 2.0
def min(self):
        return self.from_spark_rdd(self._schema_rdd.min(), self.sql_ctx) 
Example #11
Source File: series.py    From koalas with Apache License 2.0
def _reduce_for_stat_function(self, sfun, name, axis=None, numeric_only=None):
        """
        Applies sfun to the column and returns a scalar

        Parameters
        ----------
        sfun : the stats function to be used for aggregation
        name : original pandas API name.
        axis : used only for sanity checks because Series only supports the index axis.
        numeric_only : not used by this implementation, but passed down by stats functions
        """
        from inspect import signature

        axis = validate_axis(axis)
        if axis == 1:
            raise ValueError("Series does not support columns axis.")
        num_args = len(signature(sfun).parameters)
        scol = self.spark.column
        spark_type = self.spark.data_type
        if isinstance(spark_type, BooleanType) and sfun.__name__ not in ("min", "max"):
            # Stat functions cannot be used with boolean values by default
            # Thus, cast to integer (true to 1 and false to 0)
            # Exclude the min and max methods though since those work with booleans
            scol = scol.cast("integer")
        if num_args == 1:
            # Only pass in the column if sfun accepts only one arg
            scol = sfun(scol)
        else:  # must be 2
            assert num_args == 2
            # Pass in both the column and its data type if sfun accepts two args
            scol = sfun(scol, spark_type)
        return unpack_scalar(self._internal.spark_frame.select(scol)) 
Example #12
Source File: plot.py    From koalas with Apache License 2.0
def _get_bins(sdf, bins):
        # 'data' is a Spark DataFrame that selects all columns.
        if len(sdf.columns) > 1:
            min_col = F.least(*map(F.min, sdf))
            max_col = F.greatest(*map(F.max, sdf))
        else:
            min_col = F.min(sdf.columns[-1])
            max_col = F.max(sdf.columns[-1])
        boundaries = sdf.select(min_col, max_col).first()

        # divides the boundaries into bins
        if boundaries[0] == boundaries[1]:
            boundaries = (boundaries[0] - 0.5, boundaries[1] + 0.5)

        return np.linspace(boundaries[0], boundaries[1], bins + 1) 
Example #13
Source File: plot.py    From koalas with Apache License 2.0
def _calc_whiskers(colname, outliers):
        # Computes min and max values of non-outliers - the whiskers
        minmax = (
            outliers.filter("not __{}_outlier".format(colname))
            .agg(F.min(colname).alias("min"), F.max(colname).alias("max"))
            .toPandas()
        )
        return minmax.iloc[0][["min", "max"]].values 
Example #14
Source File: groupby.py    From koalas with Apache License 2.0
def min(self):
        """
        Compute min of group values.

        See Also
        --------
        databricks.koalas.Series.groupby
        databricks.koalas.DataFrame.groupby
        """
        return self._reduce_for_stat_function(F.min, only_numeric=False)

    # TODO: sync the doc and implement `ddof`. 
Example #15
Source File: compiler.py    From ibis with Apache License 2.0
def compile_all(t, expr, scope, context=None, **kwargs):
    return compile_aggregator(t, expr, scope, F.min, context, **kwargs) 
Example #16
Source File: compiler.py    From ibis with Apache License 2.0
def compile_min(t, expr, scope, context=None, **kwargs):
    return compile_aggregator(t, expr, scope, F.min, context, **kwargs) 
Example #17
Source File: window.py    From koalas with Apache License 2.0
def min(self):
        def min(scol):
            return F.when(
                F.row_number().over(self._unbounded_window) >= self._min_periods,
                F.min(scol).over(self._window),
            ).otherwise(F.lit(None))

        return self._apply_as_series_or_frame(min) 
Example #18
Source File: window.py    From koalas with Apache License 2.0
def min(self):
        """
        Calculate the expanding minimum.

        .. note:: the current implementation of this API uses Spark's Window without
            specifying a partition specification. This moves all data into a
            single partition on a single machine and could cause serious
            performance degradation. Avoid this method on very large datasets.

        Returns
        -------
        Series or DataFrame
            Returned object type is determined by the caller of the expanding
            calculation.

        See Also
        --------
        Series.expanding : Calling object with a Series.
        DataFrame.expanding : Calling object with a DataFrame.
        Series.min : Similar method for Series.
        DataFrame.min : Similar method for DataFrame.

        Examples
        --------
        Performing an expanding minimum with a window size of 3.

        >>> s = ks.Series([4, 3, 5, 2, 6])
        >>> s.expanding(3).min()
        0    NaN
        1    NaN
        2    3.0
        3    2.0
        4    2.0
        Name: 0, dtype: float64
        """
        return super(Expanding, self).min() 
Example #19
Source File: indexes.py    From koalas with Apache License 2.0
def min(self):
        """
        Return the minimum value of the Index.

        Returns
        -------
        scalar
            Minimum value.

        See Also
        --------
        Index.max : Return the maximum value of the object.
        Series.min : Return the minimum value in a Series.
        DataFrame.min : Return the minimum values in a DataFrame.

        Examples
        --------
        >>> idx = ks.Index([3, 2, 1])
        >>> idx.min()
        1

        >>> idx = ks.Index(['c', 'b', 'a'])
        >>> idx.min()
        'a'

        For a MultiIndex, the minimum is determined lexicographically.

        >>> idx = ks.MultiIndex.from_tuples([('a', 'x', 1), ('b', 'y', 2)])
        >>> idx.min()
        ('a', 'x', 1)
        """
        sdf = self._internal.spark_frame
        min_row = sdf.select(F.min(F.struct(self._internal.index_spark_columns))).head()
        result = tuple(min_row[0])

        return result if len(result) > 1 else result[0] 
Example #20
Source File: indexes.py    From koalas with Apache License 2.0
def max(self):
        """
        Return the maximum value of the Index.

        Returns
        -------
        scalar
            Maximum value.

        See Also
        --------
        Index.min : Return the minimum value in an Index.
        Series.max : Return the maximum value in a Series.
        DataFrame.max : Return the maximum values in a DataFrame.

        Examples
        --------
        >>> idx = ks.Index([3, 2, 1])
        >>> idx.max()
        3

        >>> idx = ks.Index(['c', 'b', 'a'])
        >>> idx.max()
        'c'

        For a MultiIndex, the maximum is determined lexicographically.

        >>> idx = ks.MultiIndex.from_tuples([('a', 'x', 1), ('b', 'y', 2)])
        >>> idx.max()
        ('b', 'y', 2)
        """
        sdf = self._internal.spark_frame
        max_row = sdf.select(F.max(F.struct(self._internal.index_spark_columns))).head()
        result = tuple(max_row[0])

        return result if len(result) > 1 else result[0] 
Example #21
Source File: series.py    From koalas with Apache License 2.0
def _rank(self, method="average", ascending=True, part_cols=()):
        if method not in ["average", "min", "max", "first", "dense"]:
            msg = "method must be one of 'average', 'min', 'max', 'first', 'dense'"
            raise ValueError(msg)

        if len(self._internal.index_spark_column_names) > 1:
            raise ValueError("rank do not support index now")

        if ascending:
            asc_func = lambda scol: scol.asc()
        else:
            asc_func = lambda scol: scol.desc()

        if method == "first":
            window = (
                Window.orderBy(
                    asc_func(self.spark.column), asc_func(F.col(NATURAL_ORDER_COLUMN_NAME)),
                )
                .partitionBy(*part_cols)
                .rowsBetween(Window.unboundedPreceding, Window.currentRow)
            )
            scol = F.row_number().over(window)
        elif method == "dense":
            window = (
                Window.orderBy(asc_func(self.spark.column))
                .partitionBy(*part_cols)
                .rowsBetween(Window.unboundedPreceding, Window.currentRow)
            )
            scol = F.dense_rank().over(window)
        else:
            if method == "average":
                stat_func = F.mean
            elif method == "min":
                stat_func = F.min
            elif method == "max":
                stat_func = F.max
            window1 = (
                Window.orderBy(asc_func(self.spark.column))
                .partitionBy(*part_cols)
                .rowsBetween(Window.unboundedPreceding, Window.currentRow)
            )
            window2 = Window.partitionBy([self.spark.column] + list(part_cols)).rowsBetween(
                Window.unboundedPreceding, Window.unboundedFollowing
            )
            scol = stat_func(F.row_number().over(window1)).over(window2)
        kser = self._with_new_scol(scol).rename(self.name)
        return kser.astype(np.float64) 
Example #22
Source File: series.py    From koalas with Apache License 2.0
def aggregate(self, func: Union[str, List[str]]):
        """Aggregate using one or more operations over the specified axis.

        Parameters
        ----------
        func : str or a list of str
            function name(s) as string to apply to the Series.

        Returns
        -------
        scalar, Series
            The return can be:
            - scalar : when Series.agg is called with single function
            - Series : when Series.agg is called with several functions

        Notes
        -----
        `agg` is an alias for `aggregate`. Use the alias.

        See Also
        --------
        Series.apply : Invoke function on a Series.
        Series.transform : Only perform transforming type operations.
        Series.groupby : Perform operations over groups.
        DataFrame.aggregate : The equivalent function for DataFrame.

        Examples
        --------
        >>> s = ks.Series([1, 2, 3, 4])
        >>> s.agg('min')
        1

        >>> s.agg(['min', 'max'])
        max    4
        min    1
        Name: 0, dtype: int64
        """
        if isinstance(func, list):
            return self.to_frame().agg(func)[self.name]
        elif isinstance(func, str):
            return getattr(self, func)()
        else:
            raise ValueError("func must be a string or list of strings") 
Example #23
Source File: generic.py    From koalas with Apache License 2.0
def min(self, axis=None, numeric_only=None):
        """
        Return the minimum of the values.

        Parameters
        ----------
        axis : {index (0), columns (1)}
            Axis for the function to be applied on.
        numeric_only : bool, default None
            If True, include only float, int, boolean columns. This parameter is mainly for
            pandas compatibility. False is supported; however, the columns should
            be all numeric or all non-numeric.

        Returns
        -------
        min : scalar for a Series, and a Series for a DataFrame.

        Examples
        --------

        >>> df = ks.DataFrame({'a': [1, 2, 3, np.nan], 'b': [0.1, 0.2, 0.3, np.nan]},
        ...                   columns=['a', 'b'])

        On a DataFrame:

        >>> df.min()
        a    1.0
        b    0.1
        Name: 0, dtype: float64

        >>> df.min(axis=1)
        0    0.1
        1    0.2
        2    0.3
        3    NaN
        Name: 0, dtype: float64

        On a Series:

        >>> df['a'].min()
        1.0
        """
        return self._reduce_for_stat_function(
            F.min, name="min", numeric_only=numeric_only, axis=axis
        ) 
Example #24
Source File: window.py    From koalas with Apache License 2.0
def min(self):
        """
        Calculate the expanding minimum.

        Returns
        -------
        Series or DataFrame
            Returned object type is determined by the caller of the expanding
            calculation.

        See Also
        --------
        Series.expanding : Calling object with a Series.
        DataFrame.expanding : Calling object with a DataFrame.
        Series.min : Similar method for Series.
        DataFrame.min : Similar method for DataFrame.

        Examples
        --------
        >>> s = ks.Series([2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5])
        >>> s.groupby(s).expanding(3).min().sort_index()  # doctest: +NORMALIZE_WHITESPACE
        0
        2  0     NaN
           1     NaN
        3  2     NaN
           3     NaN
           4     3.0
        4  5     NaN
           6     NaN
           7     4.0
           8     4.0
        5  9     NaN
           10    NaN
        Name: 0, dtype: float64

        For DataFrame, each expanding minimum is computed column-wise.

        >>> df = ks.DataFrame({"A": s.to_numpy(), "B": s.to_numpy() ** 2})
        >>> df.groupby(df.A).expanding(2).min().sort_index()  # doctest: +NORMALIZE_WHITESPACE
                A     B
        A
        2 0   NaN   NaN
          1   2.0   4.0
        3 2   NaN   NaN
          3   3.0   9.0
          4   3.0   9.0
        4 5   NaN   NaN
          6   4.0  16.0
          7   4.0  16.0
          8   4.0  16.0
        5 9   NaN   NaN
          10  5.0  25.0
        """
        return super(ExpandingGroupby, self).min() 
Example #25
Source File: addon_aggregates.py    From python_mozetl with MIT License
def aggregate_addons(df):
    """
    Aggregates add-on indicators by client, channel, version and locale.
    The result is a DataFrame with the additional aggregate columns:

    n_self_installed_addons (int)
    n_shield_addons (int)
    n_foreign_installed_addons (int)
    n_system_addons (int)
    n_web_extensions (int)
    first_addon_install_date (str %Y%m%d)
    profile_creation_date (str %Y%m%d)

    for each of the above facets.

    :param df: an exploded instance of main_summary by active_addons
               with various additional indicator columns
    :return SparkDF: an aggregated dataset with each of the above columns
    """
    addon_aggregates = (
        df.distinct()
        .groupBy("client_id", "normalized_channel", "app_version", "locale")
        .agg(
            fun.sum("is_self_install").alias("n_self_installed_addons"),
            fun.sum("is_shield_addon").alias("n_shield_addons"),
            fun.sum("is_foreign_install").alias("n_foreign_installed_addons"),
            fun.sum("is_system").alias("n_system_addons"),
            fun.sum("is_web_extension").alias("n_web_extensions"),
            fun.min(
                fun.when(
                    df.is_self_install == 1,
                    fun.date_format(
                        fun.from_unixtime(fun.col("install_day") * 60 * 60 * 24),
                        "yyyyMMdd",
                    ),
                ).otherwise(None)
            ).alias("first_addon_install_date"),
            fun.date_format(
                fun.from_unixtime(fun.min("profile_creation_date") * 60 * 60 * 24),
                "yyyyMMdd",
            ).alias("profile_creation_date"),
        )
    )
    return addon_aggregates 
Example #26
Source File: window.py    From koalas with Apache License 2.0
def min(self):
        """
        The rolling minimum of any non-NaN observations inside the window.

        Returns
        -------
        Series or DataFrame
            Returned object type is determined by the caller of the rolling
            calculation.

        See Also
        --------
        Series.rolling : Calling object with Series data.
        DataFrame.rolling : Calling object with DataFrames.
        Series.min : Min of the full Series.
        DataFrame.min : Min of the full DataFrame.

        Examples
        --------
        >>> s = ks.Series([2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5])
        >>> s.groupby(s).rolling(3).min().sort_index()  # doctest: +NORMALIZE_WHITESPACE
        0
        2  0     NaN
           1     NaN
        3  2     NaN
           3     NaN
           4     3.0
        4  5     NaN
           6     NaN
           7     4.0
           8     4.0
        5  9     NaN
           10    NaN
        Name: 0, dtype: float64

        For DataFrame, each rolling minimum is computed column-wise.

        >>> df = ks.DataFrame({"A": s.to_numpy(), "B": s.to_numpy() ** 2})
        >>> df.groupby(df.A).rolling(2).min().sort_index()  # doctest: +NORMALIZE_WHITESPACE
                A     B
        A
        2 0   NaN   NaN
          1   2.0   4.0
        3 2   NaN   NaN
          3   3.0   9.0
          4   3.0   9.0
        4 5   NaN   NaN
          6   4.0  16.0
          7   4.0  16.0
          8   4.0  16.0
        5 9   NaN   NaN
          10  5.0  25.0
        """
        return super(RollingGroupby, self).min() 
Example #27
Source File: indexes.py    From koalas with Apache License 2.0
def asof(self, label):
        """
        Return the label from the index, or, if not present, the previous one.

        Assuming that the index is sorted, return the passed index label if it
        is in the index, or return the previous index label if the passed one
        is not in the index.

        .. note:: This API is dependent on :meth:`Index.is_monotonic_increasing`
            which can be expensive.

        Parameters
        ----------
        label : object
            The label up to which the method returns the latest index label.

        Returns
        -------
        object
            The passed label if it is in the index. The previous label if the
            passed label is not in the sorted index or `NaN` if there is no
            such label.

        Examples
        --------
        `Index.asof` returns the latest index label up to the passed label.

        >>> idx = ks.Index(['2013-12-31', '2014-01-02', '2014-01-03'])
        >>> idx.asof('2014-01-01')
        '2013-12-31'

        If the label is in the index, the method returns the passed label.

        >>> idx.asof('2014-01-02')
        '2014-01-02'

        If all of the labels in the index are later than the passed label,
        NaN is returned.

        >>> idx.asof('1999-01-02')
        nan
        """
        sdf = self._internal.spark_frame
        if self.is_monotonic_increasing:
            sdf = sdf.where(self.spark.column <= label).select(F.max(self.spark.column))
        elif self.is_monotonic_decreasing:
            sdf = sdf.where(self.spark.column >= label).select(F.min(self.spark.column))
        else:
            raise ValueError("index must be monotonic increasing or decreasing")
        result = sdf.head()[0]
        return result if result is not None else np.nan 
Example #28
Source File: pyspark_dist_explore.py    From pyspark_dist_explore with MIT License
def plot_density(self, ax, num=300, **kwargs):
        """Returns a density plot on an Pyplot Axes object.

        Args:
            :ax: (`Axes`)
                A matplotlib Axes object on which the density plot will be drawn
            :num: (`int`)
                The number of x values the line is plotted on. Default: 300
            :**kwargs:
                Keyword arguments that are passed on to the pyplot.plot function.
        """
        colors = []

        self.build()
        bin_centers = np.asarray(self._get_bin_centers())
        x_new = np.linspace(bin_centers.min(), bin_centers.max(), num)

        if 'color' in kwargs:
            colors = kwargs['color']
            del kwargs['color']

        power_smooth = []

        for (colname, bin_values) in self.hist_dict.items():
            normed_values, ble = np.histogram(self._get_bin_centers(),
                                              bins=self.bin_boundaries,
                                              weights=bin_values,
                                              density=True
                                              )
            interpolation_function = interp1d(bin_centers, normed_values, kind='quadratic')

            power_smooth.append(x_new)
            power_smooth.append(interpolation_function(x_new))

        lines = ax.plot(*power_smooth, **kwargs)

        for i, line in enumerate(lines):
            if len(colors) > 0:
                plt.setp(line, color=colors[i], label=list(self.hist_dict.keys())[i])
            else:
                plt.setp(line, label=list(self.hist_dict.keys())[i])

        return lines 
Example #29
Source File: plot.py    From koalas with Apache License 2.0
def _compute_plot_data(self):
        colname = self.data.name
        data = self.data

        # Updates all props with the rc defaults from matplotlib
        self.kwds.update(KoalasBoxPlot.rc_defaults(**self.kwds))

        # Gets some important kwds
        showfliers = self.kwds.get("showfliers", False)
        whis = self.kwds.get("whis", 1.5)
        labels = self.kwds.get("labels", [colname])

        # This one is Koalas specific to control precision for approx_percentile
        precision = self.kwds.get("precision", 0.01)

        # Computes mean, median, Q1 and Q3 with approx_percentile and precision
        col_stats, col_fences = KoalasBoxPlot._compute_stats(data, colname, whis, precision)

        # Creates a column to flag rows as outliers or not
        outliers = KoalasBoxPlot._outliers(data, colname, *col_fences)

        # Computes min and max values of non-outliers - the whiskers
        whiskers = KoalasBoxPlot._calc_whiskers(colname, outliers)

        if showfliers:
            fliers = KoalasBoxPlot._get_fliers(colname, outliers)
        else:
            fliers = []

        # Builds bxpstats dict
        stats = []
        item = {
            "mean": col_stats["mean"],
            "med": col_stats["med"],
            "q1": col_stats["q1"],
            "q3": col_stats["q3"],
            "whislo": whiskers[0],
            "whishi": whiskers[1],
            "fliers": fliers,
            "label": labels[0],
        }
        stats.append(item)

        self.data = {labels[0]: stats} 
Example #30
Source File: groupby.py    From koalas with Apache License 2.0
def all(self):
        """
        Returns True if all values in the group are truthful, else False.

        See Also
        --------
        databricks.koalas.Series.groupby
        databricks.koalas.DataFrame.groupby

        Examples
        --------
        >>> df = ks.DataFrame({'A': [1, 1, 2, 2, 3, 3, 4, 4, 5, 5],
        ...                    'B': [True, True, True, False, False,
        ...                          False, None, True, None, False]},
        ...                   columns=['A', 'B'])
        >>> df
           A      B
        0  1   True
        1  1   True
        2  2   True
        3  2  False
        4  3  False
        5  3  False
        6  4   None
        7  4   True
        8  5   None
        9  5  False

        >>> df.groupby('A').all().sort_index()  # doctest: +NORMALIZE_WHITESPACE
               B
        A
        1   True
        2  False
        3  False
        4   True
        5  False
        """
        return self._reduce_for_stat_function(
            lambda col: F.min(F.coalesce(col.cast("boolean"), F.lit(True))), only_numeric=False
        )

    # TODO: skipna should be implemented.