python source code of groupby

"""Provide wrapper around the grouped result from L{DataFrame}"""
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from sparklingpandas.utils import add_pyspark_path
from sparklingpandas.dataframe import DataFrame
add_pyspark_path()
import pyspark
import pandas as pd
import numpy as np


class GroupBy:

    """An RDD with key value pairs, where each value is a Pandas DataFrame and
    the key is the result of the group. Supports many of the same operations
    as a Pandas GroupBy."""

    def __init__(self, prdd, *args, **kwargs):
        """Construct a groupby object providing the functions on top of the
        provided RDD. We keep the base RDD so if someone calls aggregate we
        do things more intelligently.
        """
        self._sort = kwargs.get("sort", True)
        self._by = kwargs.get("by", None)
        self._prdd = prdd
        self._myargs = args
        self._mykwargs = kwargs
        self.sql_ctx = prdd.sql_ctx

    def _can_use_new_school(self):
        """Determine if we can use new school grouping, depends on the
        args / kwargs"""
        # TODO: check the other components for sanity
        # and add support for doing this with a map function if possible.
        if (isinstance(self._by, basestring)):
            return True
        return False

    def _prep_spark_sql_groupby(self):
        """Used Spark SQL group approach"""
        # Strip the index info
        non_index_columns = filter(lambda x: x not in self._prdd._index_names,
                                   self._prdd._column_names())
        self._grouped_spark_sql = (self._prdd.to_spark_sql()
                                   .select(non_index_columns)
                                   .groupBy(self._by))
        self._columns = filter(lambda x: x != self._by,
                               non_index_columns)

    def _prep_pandas_groupby(self):
        """Prepare the old school pandas group by based approach."""
        myargs = self._myargs
        mykwargs = self._mykwargs

        def extract_keys(groupedFrame):
            for key, group in groupedFrame:
                yield (key, group)

        def group_and_extract(frame):
            return extract_keys(frame.groupby(*myargs, **mykwargs))

        self._baseRDD = self._prdd._rdd()
        self._distributedRDD = self._baseRDD.flatMap(group_and_extract)
        self._mergedRDD = self._sortIfNeeded(
            self._group(self._distributedRDD))

    def _sortIfNeeded(self, rdd):
        """Sort by key if we need to."""
        if self._sort:
            return rdd.sortByKey()
        else:
            return rdd

    def _group(self, rdd):
        """Group together the values with the same key."""
        return rdd.reduceByKey(lambda x, y: x.append(y))

    def __len__(self):
        """Number of groups."""
        # TODO: use Spark SQL
        self._prep_pandas_groupby()
        return self._mergedRDD.count()

    def get_group(self, name):
        """Returns a concrete DataFrame for provided group name."""
        self._prep_pandas_groupby()
        self._mergedRDD.lookup(name)

    def __iter__(self):
        """Returns an iterator of (name, dataframe) to the local machine.
        """
        self._prep_pandas_groupby()
        return self._mergedRDD.collect().__iter__()

    def collect(self):
        """Return a list of the elements. This is a SparklingPandas extension
        because Spark gives us back a list we convert to an iterator in
        __iter__ so it allows us to skip the round trip through iterators.
        """
        self._prep_pandas_groupby()
        return self._mergedRDD.collect()

    @property
    def groups(self):
        """Returns dict {group name -> group labels}."""
        self._prep_pandas_groupby()

        def extract_group_labels(frame):
            return (frame[0], frame[1].index.values)

        return self._mergedRDD.map(extract_group_labels).collectAsMap()

    @property
    def ngroups(self):
        """Number of groups."""
        if self._can_use_new_school():
            return self._grouped_spark_sql.count()
        self._prep_pandas_groupby()
        return self._mergedRDD.count()

    @property
    def indices(self):
        """Returns dict {group name -> group indices}."""
        self._prep_pandas_groupby()

        def extract_group_indices(frame):
            return (frame[0], frame[1].index)

        return self._mergedRDD.map(extract_group_indices).collectAsMap()

    def median(self):
        """Compute median of groups, excluding missing values.

        For multiple groupings, the result index will be a MultiIndex.
        """
        self._prep_pandas_groupby()
        return DataFrame.fromDataFrameRDD(
            self._regroup_mergedRDD().values().map(
                lambda x: x.median()), self.sql_ctx)

    def mean(self):
        """Compute mean of groups, excluding missing values.

        For multiple groupings, the result index will be a MultiIndex.
        """
        if self._can_use_new_school():
            self._prep_spark_sql_groupby()
            import pyspark.sql.functions as func
            return self._use_aggregation(func.mean)
        self._prep_pandas_groupby()
        return DataFrame.fromDataFrameRDD(
            self._regroup_mergedRDD().values().map(
                lambda x: x.mean()), self.sql_ctx)

    def var(self, ddof=1):
        """Compute standard deviation of groups, excluding missing values.

        For multiple groupings, the result index will be a MultiIndex.
        """
        self._prep_pandas_groupby()
        return DataFrame.fromDataFrameRDD(
            self._regroup_mergedRDD().values().map(
                lambda x: x.var(ddof=ddof)), self.sql_ctx)

    def sum(self):
        """Compute the sum for each group."""
        if self._can_use_new_school():
            self._prep_spark_sql_groupby()
            import pyspark.sql.functions as func
            return self._use_aggregation(func.sum)
        self._prep_pandas_groupby()
        myargs = self._myargs
        mykwargs = self._mykwargs

        def create_combiner(x):
            return x.groupby(*myargs, **mykwargs).sum()

        def merge_value(x, y):
            return pd.concat([x, create_combiner(y)])

        def merge_combiner(x, y):
            return x + y

        rddOfSum = self._sortIfNeeded(self._distributedRDD.combineByKey(
            create_combiner,
            merge_value,
            merge_combiner)).values()
        return DataFrame.fromDataFrameRDD(rddOfSum, self.sql_ctx)

    def _create_exprs_using_func(self, f, columns):
        """Create aggregate expressions using the provided function
        with the result coming back as the original column name."""
        expressions = map(lambda c: f(c).alias(c),
                          self._columns)
        return expressions

    def min(self):
        """Compute the min for each group."""
        if self._can_use_new_school():
            self._prep_spark_sql_groupby()
            import pyspark.sql.functions as func
            return self._use_aggregation(func.min)
        self._prep_pandas_groupby()
        myargs = self._myargs
        mykwargs = self._mykwargs

        def create_combiner(x):
            return x.groupby(*myargs, **mykwargs).min()

        def merge_value(x, y):
            return x.append(create_combiner(y)).min()

        def merge_combiner(x, y):
            return x.append(y).min(level=0)

        rddOfMin = self._sortIfNeeded(self._distributedRDD.combineByKey(
            create_combiner,
            merge_value,
            merge_combiner)).values()
        return DataFrame.fromDataFrameRDD(rddOfMin, self.sql_ctx)

    def max(self):
        """Compute the max for each group."""
        if self._can_use_new_school():
            self._prep_spark_sql_groupby()
            import pyspark.sql.functions as func
            return self._use_aggregation(func.max)
        self._prep_pandas_groupby()
        myargs = self._myargs
        mykwargs = self._mykwargs

        def create_combiner(x):
            return x.groupby(*myargs, **mykwargs).max()

        def merge_value(x, y):
            return x.append(create_combiner(y)).max()

        def merge_combiner(x, y):
            return x.append(y).max(level=0)

        rddOfMax = self._sortIfNeeded(self._distributedRDD.combineByKey(
            create_combiner,
            merge_value,
            merge_combiner)).values()
        return DataFrame.fromDataFrameRDD(rddOfMax, self.sql_ctx)

    def count(self):
        """Compute the number of elements in each group."""
        if self._can_use_new_school():
            self._prep_spark_sql_groupby()
            import pyspark.sql.functions as func
            return self._use_aggregation(func.count)
        self._prep_pandas_groupby()
        myargs = self._myargs
        mykwargs = self._mykwargs

        def create_combiner(x):
            return x.groupby(*myargs, **mykwargs).count()

        def merge_value(x, y):
            return x.append(create_combiner(y)).count()

        def merge_combiner(x, y):
            return x.append(y).count(level=0)

        rddOfCounts = self._sortIfNeeded(self._distributedRDD.combineByKey(
            create_combiner,
            merge_value,
            merge_combiner)).values()
        return DataFrame.fromDataFrameRDD(rddOfCounts, self.sql_ctx)

    def _use_aggregation(self, agg, columns=None):
        """Compute the result using the aggregation function provided.
        The aggregation name must also be provided so we can strip of the extra
        name that Spark SQL adds."""
        if not columns:
            columns = self._columns
        from pyspark.sql import functions as F
        aggs = map(lambda column: agg(column).alias(column), self._columns)
        aggRdd = self._grouped_spark_sql.agg(*aggs)
        df = DataFrame.from_schema_rdd(aggRdd, self._by)
        return df

    def first(self):
        """
        Pull out the first from each group. Note: this is different than
        Spark's first.
        """
        # If its possible to use Spark SQL grouping do it
        if self._can_use_new_school():
            self._prep_spark_sql_groupby()
            import pyspark.sql.functions as func
            return self._use_aggregation(func.first)
        myargs = self._myargs
        mykwargs = self._mykwargs
        self._prep_pandas_groupby()

        def create_combiner(x):
            return x.groupby(*myargs, **mykwargs).first()

        def merge_value(x, y):
            return create_combiner(x)

        def merge_combiner(x, y):
            return x

        rddOfFirst = self._sortIfNeeded(self._distributedRDD.combineByKey(
            create_combiner,
            merge_value,
            merge_combiner)).values()
        return DataFrame.fromDataFrameRDD(rddOfFirst, self.sql_ctx)

    def last(self):
        """Pull out the last from each group."""
        myargs = self._myargs
        mykwargs = self._mykwargs
        # If its possible to use Spark SQL grouping do it
        if self._can_use_new_school():
            self._prep_spark_sql_groupby()
            import pyspark.sql.functions as func
            return self._use_aggregation(func.last)

        def create_combiner(x):
            return x.groupby(*myargs, **mykwargs).last()

        def merge_value(x, y):
            return create_combiner(y)

        def merge_combiner(x, y):
            return y

        rddOfLast = self._sortIfNeeded(self._distributedRDD.combineByKey(
            create_combiner,
            merge_value,
            merge_combiner)).values()
        return DataFrame.fromDataFrameRDD(rddOfLast, self.sql_ctx)

    def _regroup_mergedRDD(self):
        """A common pattern is we want to call groupby again on the dataframes
        so we can use the groupby functions.
        """
        myargs = self._myargs
        mykwargs = self._mykwargs
        self._prep_pandas_groupby()

        def regroup(df):
            return df.groupby(*myargs, **mykwargs)

        return self._mergedRDD.mapValues(regroup)

    def nth(self, n, *args, **kwargs):
        """Take the nth element of each grouby."""
        # TODO: Stop collecting the entire frame for each key.
        self._prep_pandas_groupby()
        myargs = self._myargs
        mykwargs = self._mykwargs
        nthRDD = self._regroup_mergedRDD().mapValues(
            lambda r: r.nth(
                n, *args, **kwargs)).values()
        return DataFrame.fromDataFrameRDD(nthRDD, self.sql_ctx)

    def aggregate(self, f):
        """Apply the aggregation function.
        Note: This implementation does note take advantage of partial
        aggregation unless we have one of the special cases.
        Currently the only special case is Series.kurtosis - and even
        that doesn't properly do partial aggregations, but we can improve
        it to do this eventually!
        """
        if self._can_use_new_school() and f == pd.Series.kurtosis:
            self._prep_spark_sql_groupby()
            import custom_functions as CF
            return self._use_aggregation(CF.kurtosis)
        else:
            self._prep_pandas_groupby()
            return DataFrame.fromDataFrameRDD(
                self._regroup_mergedRDD().values().map(
                    lambda g: g.aggregate(f)), self.sql_ctx)

    def agg(self, f):
        return self.aggregate(f)

    def apply(self, func, *args, **kwargs):
        """Apply the provided function and combine the results together in the
        same way as apply from groupby in pandas.

        This returns a DataFrame.
        """
        self._prep_pandas_groupby()

        def key_by_index(data):
            """Key each row by its index.
            """
            # TODO: Is there a better way to do this?
            for key, row in data.iterrows():
                yield (key, pd.DataFrame.from_dict(
                    dict([(key, row)]), orient='index'))

        myargs = self._myargs
        mykwargs = self._mykwargs
        regroupedRDD = self._distributedRDD.mapValues(
            lambda data: data.groupby(*myargs, **mykwargs))
        appliedRDD = regroupedRDD.map(
            lambda key_data: key_data[1].apply(func, *args, **kwargs))
        reKeyedRDD = appliedRDD.flatMap(key_by_index)
        dataframe = self._sortIfNeeded(reKeyedRDD).values()
        return DataFrame.fromDataFrameRDD(dataframe, self.sql_ctx)