Python pandas.factorize() Examples

The following are 30 code examples of pandas.factorize(), collected from open-source projects. The source file, project, and license are noted above each example.
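As a quick orientation before the project code: pd.factorize() returns a pair, an integer code for every element (in order of first appearance by default) and the array of distinct values the codes index into. A minimal sketch, with outputs shown as comments:

import pandas as pd

codes, uniques = pd.factorize(["b", "b", "a", "c", "b"])
print(codes)    # [0 0 1 2 0] -- codes in order of first appearance
print(uniques)  # ['b' 'a' 'c'] -- the distinct values the codes index into

# sort=True orders the uniques and renumbers the codes to match
codes, uniques = pd.factorize(["b", "b", "a", "c", "b"], sort=True)
print(codes)    # [1 1 0 2 1]
print(uniques)  # ['a' 'b' 'c']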
Example #1
Source File: scatter.py    From scprep with GNU General Public License v3.0
def c_discrete(self):
        """Discretized form of c

        If c is discrete, this converts it to
        integers from 0 to `n_c_unique` - 1
        """
        if self._c_discrete is None:
            if isinstance(self._cmap, dict):
                self._labels = np.array(
                    [k for k in self._cmap.keys() if k in self.c_unique]
                )
                self._c_discrete = np.zeros_like(self._c, dtype=int)
                for i, label in enumerate(self._labels):
                    self._c_discrete[self._c == label] = i
            else:
                self._c_discrete = np.zeros_like(self._c, dtype=int)
                self._c_discrete[self._mask], self._labels = pd.factorize(
                    self._c_masked, sort=True
                )
        return self._c_discrete 
Example #2
Source File: sandwich_covariance.py    From vnpy_crypto with MIT License
def group_sums(x, group):
    '''sum x for each group, simple bincount version, again

    group : array, integer
        assumed to be consecutive integers

    no dtype checking because I want to raise in that case

    uses loop over columns of x

    #TODO: remove this, already copied to tools/grouputils
    '''

    #TODO: transpose return in group_sum, need test coverage first

    # re-label groups or bincount takes too much memory
    if np.max(group) > 2 * x.shape[0]:
        group = pd.factorize(group)[0]

    return np.array([np.bincount(group, weights=x[:, col])
                            for col in range(x.shape[1])]) 
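The pd.factorize guard above matters because np.bincount allocates an output array of length max(group) + 1, so sparse integer ids (database keys, hashes) can blow up memory; factorizing first compacts them to 0..n_groups-1. A small sketch of the same trick in isolation:

import numpy as np
import pandas as pd

group = np.array([10_000_000, 3, 10_000_000, 3])
x = np.ones(4)

# np.bincount(group, weights=x) would allocate ~10 million slots;
# re-labelling keeps it at one slot per distinct group.
codes = pd.factorize(group)[0]        # [0, 1, 0, 1]
print(np.bincount(codes, weights=x))  # [2. 2.]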
Example #3
Source File: feature_engineering_titanic.py    From Deep-Learning-By-Example with MIT License
def process_cabin():
    # referring to the global variable that contains the titanic examples
    global df_titanic_data

    # replacing the missing values in the Cabin variable with "U0"
    df_titanic_data['Cabin'][df_titanic_data.Cabin.isnull()] = 'U0'

    # the cabin number is a sequence of alphanumeric characters, so we are going to create some features
    # from the alphabetical part of it
    df_titanic_data['CabinLetter'] = df_titanic_data['Cabin'].map(lambda l: get_cabin_letter(l))
    df_titanic_data['CabinLetter'] = pd.factorize(df_titanic_data['CabinLetter'])[0]

    # binarizing the cabin letters features
    if keep_binary:
        cletters = pd.get_dummies(df_titanic_data['CabinLetter']).rename(columns=lambda x: 'CabinLetter_' + str(x))
        df_titanic_data = pd.concat([df_titanic_data, cletters], axis=1)

    # creating features from the numerical side of the cabin
    df_titanic_data['CabinNumber'] = df_titanic_data['Cabin'].map(lambda x: get_cabin_num(x)).astype(int) + 1

    # scaling the feature
    if keep_scaled:
        scaler_processing = preprocessing.StandardScaler()
        df_titanic_data['CabinNumber_scaled'] = scaler_processing.fit_transform(
            df_titanic_data.CabinNumber.reshape(-1, 1))
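The factorize-then-get_dummies pattern used here (and in several examples below) converts a string column to integer codes and then one-hot encodes those codes. A standalone sketch of the idea, with an illustrative column name:

import pandas as pd

df = pd.DataFrame({'CabinLetter': ['C', 'E', 'C', 'U']})
df['CabinLetter'] = pd.factorize(df['CabinLetter'])[0]   # [0, 1, 0, 2]
dummies = pd.get_dummies(df['CabinLetter']).rename(
    columns=lambda x: 'CabinLetter_' + str(x))
df = pd.concat([df, dummies], axis=1)
print(df.columns.tolist())
# ['CabinLetter', 'CabinLetter_0', 'CabinLetter_1', 'CabinLetter_2']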
Example #4
Source File: test_algos.py    From vnpy_crypto with MIT License
def test_mixed(self):

        # doc example reshaping.rst
        x = Series(['A', 'A', np.nan, 'B', 3.14, np.inf])
        labels, uniques = algos.factorize(x)

        exp = np.array([0, 0, -1, 1, 2, 3], dtype=np.intp)
        tm.assert_numpy_array_equal(labels, exp)
        exp = Index(['A', 'B', 3.14, np.inf])
        tm.assert_index_equal(uniques, exp)

        labels, uniques = algos.factorize(x, sort=True)
        exp = np.array([2, 2, -1, 3, 0, 1], dtype=np.intp)
        tm.assert_numpy_array_equal(labels, exp)
        exp = Index([3.14, np.inf, 'A', 'B'])
        tm.assert_index_equal(uniques, exp) 
Example #5
Source File: test_algos.py    From recruit with Apache License 2.0
def test_mixed(self):

        # doc example reshaping.rst
        x = Series(['A', 'A', np.nan, 'B', 3.14, np.inf])
        labels, uniques = algos.factorize(x)

        exp = np.array([0, 0, -1, 1, 2, 3], dtype=np.intp)
        tm.assert_numpy_array_equal(labels, exp)
        exp = Index(['A', 'B', 3.14, np.inf])
        tm.assert_index_equal(uniques, exp)

        labels, uniques = algos.factorize(x, sort=True)
        exp = np.array([2, 2, -1, 3, 0, 1], dtype=np.intp)
        tm.assert_numpy_array_equal(labels, exp)
        exp = Index([3.14, np.inf, 'A', 'B'])
        tm.assert_index_equal(uniques, exp) 
Example #6
Source File: feature_engineering_titanic.py    From Deep-Learning-By-Example with MIT License
def process_embarked():
    global df_titanic_data

    # replacing the missing values with the most common value in the variable
    df_titanic_data.Embarked[df_titanic_data.Embarked.isnull()] = df_titanic_data.Embarked.dropna().mode().values

    # converting the values into numbers
    df_titanic_data['Embarked'] = pd.factorize(df_titanic_data['Embarked'])[0]

    # binarizing the constructed features
    if keep_binary:
        df_titanic_data = pd.concat([df_titanic_data, pd.get_dummies(df_titanic_data['Embarked']).rename(
            columns=lambda x: 'Embarked_' + str(x))], axis=1)



# Define a helper function that can use RandomForestClassifier for handling the missing values of the age variable 
Example #7
Source File: test_algos.py    From vnpy_crypto with MIT License
def test_uint64_factorize(self):
        data = np.array([2**63, 1, 2**63], dtype=np.uint64)
        exp_labels = np.array([0, 1, 0], dtype=np.intp)
        exp_uniques = np.array([2**63, 1], dtype=np.uint64)

        labels, uniques = algos.factorize(data)
        tm.assert_numpy_array_equal(labels, exp_labels)
        tm.assert_numpy_array_equal(uniques, exp_uniques)

        data = np.array([2**63, -1, 2**63], dtype=object)
        exp_labels = np.array([0, 1, 0], dtype=np.intp)
        exp_uniques = np.array([2**63, -1], dtype=object)

        labels, uniques = algos.factorize(data)
        tm.assert_numpy_array_equal(labels, exp_labels)
        tm.assert_numpy_array_equal(uniques, exp_uniques) 
Example #8
Source File: test_algos.py    From predictive-maintenance-using-machine-learning with Apache License 2.0
def test_mixed(self):

        # doc example reshaping.rst
        x = Series(['A', 'A', np.nan, 'B', 3.14, np.inf])
        labels, uniques = algos.factorize(x)

        exp = np.array([0, 0, -1, 1, 2, 3], dtype=np.intp)
        tm.assert_numpy_array_equal(labels, exp)
        exp = Index(['A', 'B', 3.14, np.inf])
        tm.assert_index_equal(uniques, exp)

        labels, uniques = algos.factorize(x, sort=True)
        exp = np.array([2, 2, -1, 3, 0, 1], dtype=np.intp)
        tm.assert_numpy_array_equal(labels, exp)
        exp = Index([3.14, np.inf, 'A', 'B'])
        tm.assert_index_equal(uniques, exp) 
Example #9
Source File: sandwich_covariance.py    From Splunking-Crime with GNU Affero General Public License v3.0
def group_sums(x, group):
    '''sum x for each group, simple bincount version, again

    group : array, integer
        assumed to be consecutive integers

    no dtype checking because I want to raise in that case

    uses loop over columns of x

    #TODO: remove this, already copied to tools/grouputils
    '''

    #TODO: transpose return in group_sum, need test coverage first

    # re-label groups or bincount takes too much memory
    if np.max(group) > 2 * x.shape[0]:
        group = pd.factorize(group)[0]

    return np.array([np.bincount(group, weights=x[:, col])
                            for col in range(x.shape[1])]) 
Example #10
Source File: test_algos.py    From vnpy_crypto with MIT License
def test_factorize_nan(self):
        # nan should map to na_sentinel, not reverse_indexer[na_sentinel]
        # rizer.factorize should not raise an exception if na_sentinel indexes
        # outside of reverse_indexer
        key = np.array([1, 2, 1, np.nan], dtype='O')
        rizer = ht.Factorizer(len(key))
        for na_sentinel in (-1, 20):
            ids = rizer.factorize(key, sort=True, na_sentinel=na_sentinel)
            expected = np.array([0, 1, 0, na_sentinel], dtype='int32')
            assert len(set(key)) == len(set(expected))
            tm.assert_numpy_array_equal(pd.isna(key),
                                        expected == na_sentinel)

        # nan still maps to na_sentinel when sort=False
        key = np.array([0, np.nan, 1], dtype='O')
        na_sentinel = -1

        # TODO(wesm): unused?
        ids = rizer.factorize(key, sort=False, na_sentinel=na_sentinel)  # noqa

        expected = np.array([2, -1, 0], dtype='int32')
        assert len(set(key)) == len(set(expected))
        tm.assert_numpy_array_equal(pd.isna(key), expected == na_sentinel) 
Example #11
Source File: test_algos.py    From predictive-maintenance-using-machine-learning with Apache License 2.0
def test_int64_factorize(self, writable):
        data = np.array([2**63 - 1, -2**63, 2**63 - 1], dtype=np.int64)
        data.setflags(write=writable)
        exp_labels = np.array([0, 1, 0], dtype=np.intp)
        exp_uniques = np.array([2**63 - 1, -2**63], dtype=np.int64)

        labels, uniques = algos.factorize(data)
        tm.assert_numpy_array_equal(labels, exp_labels)
        tm.assert_numpy_array_equal(uniques, exp_uniques) 
Example #12
Source File: test_ip.py    From cyberpandas with BSD 3-Clause "New" or "Revised" License
def test_factorize():
    arr = ip.IPArray([3, 3, 1, 2, 3, _U8_MAX + 1])
    labels, uniques = arr.factorize()
    expected_labels, expected_uniques = pd.factorize(arr.astype(object))

    assert isinstance(uniques, ip.IPArray)

    uniques = uniques.astype(object)
    tm.assert_numpy_array_equal(labels, expected_labels)
    tm.assert_numpy_array_equal(uniques, expected_uniques) 
Example #13
Source File: test_algos.py    From predictive-maintenance-using-machine-learning with Apache License 2.0
def test_object_factorize(self, writable):
        data = np.array(['a', 'c', None, np.nan, 'a', 'b', pd.NaT, 'c'],
                        dtype=object)
        data.setflags(write=writable)
        exp_labels = np.array([0, 1, -1, -1, 0, 2, -1, 1], dtype=np.intp)
        exp_uniques = np.array(['a', 'c', 'b'], dtype=object)

        labels, uniques = algos.factorize(data)
        tm.assert_numpy_array_equal(labels, exp_labels)
        tm.assert_numpy_array_equal(uniques, exp_uniques) 
Example #14
Source File: feature_engineering_titanic.py    From Deep-Learning-By-Example with MIT License
def process_fare():
    global df_titanic_data

    # handling the missing values by replacing them with the median fare
    df_titanic_data['Fare'][np.isnan(df_titanic_data['Fare'])] = df_titanic_data['Fare'].median()

    # zeros in the fare will cause some division problems, so we are going to set them to 1/10th of the lowest nonzero fare
    df_titanic_data['Fare'][np.where(df_titanic_data['Fare'] == 0)[0]] = df_titanic_data['Fare'][
                                                                             df_titanic_data['Fare'].nonzero()[
                                                                                 0]].min() / 10

    # Binarizing the features by binning them into quantiles
    df_titanic_data['Fare_bin'] = pd.qcut(df_titanic_data['Fare'], 4)

    if keep_binary:
        df_titanic_data = pd.concat(
            [df_titanic_data, pd.get_dummies(df_titanic_data['Fare_bin']).rename(columns=lambda x: 'Fare_' + str(x))],
            axis=1)

    # binning
    if keep_bins:
        df_titanic_data['Fare_bin_id'] = pd.factorize(df_titanic_data['Fare_bin'])[0] + 1

    # scaling the value
    if keep_scaled:
        scaler_processing = preprocessing.StandardScaler()
        df_titanic_data['Fare_scaled'] = scaler_processing.fit_transform(df_titanic_data.Fare.reshape(-1, 1))

    if keep_bins and keep_scaled:
        scaler_processing = preprocessing.StandardScaler()
        df_titanic_data['Fare_bin_id_scaled'] = scaler_processing.fit_transform(
            df_titanic_data.Fare_bin_id.reshape(-1, 1))

    if not keep_strings:
        df_titanic_data.drop('Fare_bin', axis=1, inplace=True)


# Helper function for constructing features from the ticket variable 
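The qcut-then-factorize pattern above turns a continuous column into small integer bin ids: qcut assigns each value to a quantile interval (a Categorical), and factorize maps each interval to an integer. A minimal sketch with made-up fares:

import pandas as pd

fares = pd.Series([7.25, 71.28, 8.05, 53.1, 8.46, 51.86, 21.07, 30.07])
fare_bin = pd.qcut(fares, 4)                 # four quantile-based intervals
fare_bin_id = pd.factorize(fare_bin)[0] + 1  # one 1-based integer id per quartile
print(fare_bin_id)                           # ids numbered by order of first appearance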
Example #15
Source File: grouputils.py    From Splunking-Crime with GNU Affero General Public License v3.0
def group_sums(x, group, use_bincount=True):
    """simple bincount version, again

    group : array, integer
        assumed to be consecutive integers

    no dtype checking because I want to raise in that case

    uses loop over columns of x

    for comparison, simple python loop
    """
    x = np.asarray(x)
    if x.ndim == 1:
        x = x[:, None]
    elif x.ndim > 2 and use_bincount:
        raise ValueError('not implemented yet')

    if use_bincount:

        # re-label groups or bincount takes too much memory
        if np.max(group) > 2 * x.shape[0]:
            group = pd.factorize(group)[0]

        return np.array([np.bincount(group, weights=x[:, col])
                         for col in range(x.shape[1])])
    else:
        uniques = np.unique(group)
        result = np.zeros([len(uniques)] + list(x.shape[1:]))
        for ii, cat in enumerate(uniques):
            result[ii] = x[group == cat].sum(0)
        return result 
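A quick usage sketch for group_sums as defined above (assuming numpy and pandas are imported in the module); note that the bincount path returns shape (n_columns, n_groups), i.e. transposed relative to a (group, column) layout, which is what the #TODO in the Example #2 version of this function refers to:

import numpy as np

x = np.array([[1., 10.],
              [2., 20.],
              [3., 30.]])
group = np.array([0, 0, 1])

print(group_sums(x, group))
# [[ 3.  3.]
#  [30. 30.]] -- row 0 sums column 0 per group, row 1 sums column 1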
Example #16
Source File: feature_transformer.py    From py_ml_utils with Apache License 2.0
def _fit_special_process(self, data, target=None):
        _, self.encoder = pd.factorize(data[self._name], sort=True) 
Example #17
Source File: feature_engineering_titanic.py    From Deep-Learning-By-Example with MIT License
def process_age():
    global df_titanic_data

    # calling the set_missing_ages helper function to use random forest regression for predicting missing values of age
    set_missing_ages()

    #     # scale the age variable by centering it around the mean with a unit variance
    #     if keep_scaled:
    #         scaler_preprocessing = preprocessing.StandardScaler()
    #         df_titanic_data['Age_scaled'] = scaler_preprocessing.fit_transform(df_titanic_data.Age.reshape(-1, 1))

    # construct a feature for children
    df_titanic_data['isChild'] = np.where(df_titanic_data.Age < 13, 1, 0)

    # bin into quartiles and create binary features
    df_titanic_data['Age_bin'] = pd.qcut(df_titanic_data['Age'], 4)

    if keep_binary:
        df_titanic_data = pd.concat(
            [df_titanic_data, pd.get_dummies(df_titanic_data['Age_bin']).rename(columns=lambda y: 'Age_' + str(y))],
            axis=1)

    if keep_bins:
        df_titanic_data['Age_bin_id'] = pd.factorize(df_titanic_data['Age_bin'])[0] + 1

    if keep_bins and keep_scaled:
        scaler_processing = preprocessing.StandardScaler()
        df_titanic_data['Age_bin_id_scaled'] = scaler_processing.fit_transform(
            df_titanic_data.Age_bin_id.reshape(-1, 1))

    if not keep_strings:
        df_titanic_data.drop('Age_bin', axis=1, inplace=True)


# Helper function for constructing features from the passengers/crew names 
Example #18
Source File: feature_engineering_titanic.py    From Deep-Learning-By-Example with MIT License
def process_name():
    global df_titanic_data

    # counting the number of words in the Name variable
    df_titanic_data['Names'] = df_titanic_data['Name'].map(lambda y: len(re.split(' ', y)))

    # Getting titles for each person
    df_titanic_data['Title'] = df_titanic_data['Name'].map(lambda y: re.compile(r", (.*?)\.").findall(y)[0])

    # handling the rarely occurring titles
    df_titanic_data['Title'][df_titanic_data.Title == 'Jonkheer'] = 'Master'
    df_titanic_data['Title'][df_titanic_data.Title.isin(['Ms', 'Mlle'])] = 'Miss'
    df_titanic_data['Title'][df_titanic_data.Title == 'Mme'] = 'Mrs'
    df_titanic_data['Title'][df_titanic_data.Title.isin(['Capt', 'Don', 'Major', 'Col', 'Sir'])] = 'Sir'
    df_titanic_data['Title'][df_titanic_data.Title.isin(['Dona', 'Lady', 'the Countess'])] = 'Lady'

    # binarizing all the features
    if keep_binary:
        df_titanic_data = pd.concat(
            [df_titanic_data, pd.get_dummies(df_titanic_data['Title']).rename(columns=lambda x: 'Title_' + str(x))],
            axis=1)

    # scaling
    if keep_scaled:
        scaler_preprocessing = preprocessing.StandardScaler()
        df_titanic_data['Names_scaled'] = scaler_preprocessing.fit_transform(df_titanic_data.Names.reshape(-1, 1))

    # binning
    if keep_bins:
        df_titanic_data['Title_id'] = pd.factorize(df_titanic_data['Title'])[0] + 1

    if keep_bins and keep_scaled:
        scaler = preprocessing.StandardScaler()
        df_titanic_data['Title_id_scaled'] = scaler.fit_transform(df_titanic_data.Title_id.reshape(-1, 1))



# Generate features from the cabin input variable 
Example #19
Source File: test_ip_pandas.py    From cyberpandas with BSD 3-Clause "New" or "Revised" License
def test_factorize():
    arr = ip.IPArray([1, 1, 10, 10])
    labels, uniques = pd.factorize(arr)

    expected_labels = np.array([0, 0, 1, 1])
    tm.assert_numpy_array_equal(labels, expected_labels)

    expected_uniques = ip.IPArray([1, 10])
    assert uniques.equals(expected_uniques) 
Example #20
Source File: test_algos.py    From predictive-maintenance-using-machine-learning with Apache License 2.0
def test_factorize_na_sentinel(self, sort, na_sentinel):
        data = np.array(['b', 'a', None, 'b'], dtype=object)
        labels, uniques = algos.factorize(data, sort=sort,
                                          na_sentinel=na_sentinel)
        if sort:
            expected_labels = np.array([1, 0, na_sentinel, 1], dtype=np.intp)
            expected_uniques = np.array(['a', 'b'], dtype=object)
        else:
            expected_labels = np.array([0, 1, na_sentinel, 0], dtype=np.intp)
            expected_uniques = np.array(['b', 'a'], dtype=object)
        tm.assert_numpy_array_equal(labels, expected_labels)
        tm.assert_numpy_array_equal(uniques, expected_uniques) 
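One caveat when running this test's idiom on current pandas: the na_sentinel parameter exercised here was deprecated in pandas 1.5 and removed in 2.0, replaced by the boolean use_na_sentinel. A hedged sketch of the modern equivalent:

import numpy as np
import pandas as pd

data = np.array(['b', 'a', None, 'b'], dtype=object)

# use_na_sentinel=True (the default): missing values code as -1
# and stay out of `uniques`.
codes, uniques = pd.factorize(data, use_na_sentinel=True)
print(codes)    # [ 0  1 -1  0]
print(uniques)  # ['b' 'a']

# use_na_sentinel=False: the missing value is treated as a regular
# value and receives its own code.
codes, uniques = pd.factorize(data, use_na_sentinel=False)
print(codes)    # [0 1 2 0]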
Example #21
Source File: test_algos.py    From predictive-maintenance-using-machine-learning with Apache License 2.0
def test_uint64_factorize(self, writable):
        data = np.array([2**64 - 1, 1, 2**64 - 1], dtype=np.uint64)
        data.setflags(write=writable)
        exp_labels = np.array([0, 1, 0], dtype=np.intp)
        exp_uniques = np.array([2**64 - 1, 1], dtype=np.uint64)

        labels, uniques = algos.factorize(data)
        tm.assert_numpy_array_equal(labels, exp_labels)
        tm.assert_numpy_array_equal(uniques, exp_uniques) 
Example #22
Source File: test_algos.py    From predictive-maintenance-using-machine-learning with Apache License 2.0
def test_float64_factorize(self, writable):
        data = np.array([1.0, 1e8, 1.0, 1e-8, 1e8, 1.0], dtype=np.float64)
        data.setflags(write=writable)
        exp_labels = np.array([0, 1, 0, 2, 1, 0], dtype=np.intp)
        exp_uniques = np.array([1.0, 1e8, 1e-8], dtype=np.float64)

        labels, uniques = algos.factorize(data)
        tm.assert_numpy_array_equal(labels, exp_labels)
        tm.assert_numpy_array_equal(uniques, exp_uniques) 
Example #23
Source File: test_algos.py    From predictive-maintenance-using-machine-learning with Apache License 2.0
def test_complex_sorting(self):
        # gh 12666 - check no segfault
        x17 = np.array([complex(i) for i in range(17)], dtype=object)

        pytest.raises(TypeError, algos.factorize, x17[::-1], sort=True) 
Example #24
Source File: test_algos.py    From predictive-maintenance-using-machine-learning with Apache License 2.0
def test_factorize_tuple_list(self, data, expected_label, expected_level):
        # GH9454
        result = pd.factorize(data)

        tm.assert_numpy_array_equal(result[0],
                                    np.array(expected_label, dtype=np.intp))

        expected_level_array = com.asarray_tuplesafe(expected_level,
                                                     dtype=object)
        tm.assert_numpy_array_equal(result[1], expected_level_array) 
Example #25
Source File: test_algos.py    From predictive-maintenance-using-machine-learning with Apache License 2.0
def test_basic(self):

        labels, uniques = algos.factorize(['a', 'b', 'b', 'a', 'a', 'c', 'c',
                                           'c'])
        tm.assert_numpy_array_equal(
            uniques, np.array(['a', 'b', 'c'], dtype=object))

        labels, uniques = algos.factorize(['a', 'b', 'b', 'a',
                                           'a', 'c', 'c', 'c'], sort=True)
        exp = np.array([0, 1, 1, 0, 0, 2, 2, 2], dtype=np.intp)
        tm.assert_numpy_array_equal(labels, exp)
        exp = np.array(['a', 'b', 'c'], dtype=object)
        tm.assert_numpy_array_equal(uniques, exp)

        labels, uniques = algos.factorize(list(reversed(range(5))))
        exp = np.array([0, 1, 2, 3, 4], dtype=np.intp)
        tm.assert_numpy_array_equal(labels, exp)
        exp = np.array([4, 3, 2, 1, 0], dtype=np.int64)
        tm.assert_numpy_array_equal(uniques, exp)

        labels, uniques = algos.factorize(list(reversed(range(5))), sort=True)

        exp = np.array([4, 3, 2, 1, 0], dtype=np.intp)
        tm.assert_numpy_array_equal(labels, exp)
        exp = np.array([0, 1, 2, 3, 4], dtype=np.int64)
        tm.assert_numpy_array_equal(uniques, exp)

        labels, uniques = algos.factorize(list(reversed(np.arange(5.))))
        exp = np.array([0, 1, 2, 3, 4], dtype=np.intp)
        tm.assert_numpy_array_equal(labels, exp)
        exp = np.array([4., 3., 2., 1., 0.], dtype=np.float64)
        tm.assert_numpy_array_equal(uniques, exp)

        labels, uniques = algos.factorize(list(reversed(np.arange(5.))),
                                          sort=True)
        exp = np.array([4, 3, 2, 1, 0], dtype=np.intp)
        tm.assert_numpy_array_equal(labels, exp)
        exp = np.array([0., 1., 2., 3., 4.], dtype=np.float64)
        tm.assert_numpy_array_equal(uniques, exp) 
Example #26
Source File: test_algos.py    From predictive-maintenance-using-machine-learning with Apache License 2.0
def test_factorized_sort_ordered():
    cat = pd.Categorical(['b', 'b', None, 'a'],
                         categories=['c', 'b', 'a'],
                         ordered=True)

    labels, uniques = pd.factorize(cat, sort=True)
    expected_labels = np.array([0, 0, -1, 1], dtype=np.intp)
    expected_uniques = pd.Categorical(['b', 'a'],
                                      categories=['c', 'b', 'a'],
                                      ordered=True)

    tm.assert_numpy_array_equal(labels, expected_labels)
    tm.assert_categorical_equal(uniques, expected_uniques) 
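Note the ordering here: with sort=True on an ordered Categorical, the uniques follow the categorical's own category order (categories=['c', 'b', 'a'] puts 'b' before 'a'), whereas the unordered variant in the next example sorts lexically to ['a', 'b'].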
Example #27
Source File: test_algos.py    From predictive-maintenance-using-machine-learning with Apache License 2.0
def test_factorized_sort():
    cat = pd.Categorical(['b', 'b', None, 'a'])
    labels, uniques = pd.factorize(cat, sort=True)
    expected_labels = np.array([1, 1, -1, 0], dtype=np.intp)
    expected_uniques = pd.Categorical(['a', 'b'])

    tm.assert_numpy_array_equal(labels, expected_labels)
    tm.assert_categorical_equal(uniques, expected_uniques) 
Example #28
Source File: groupby.py    From predictive-maintenance-using-machine-learning with Apache License 2.0
def test_groupby_extension_no_sort(self, data_for_grouping):
        df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4],
                           "B": data_for_grouping})
        result = df.groupby("B", sort=False).A.mean()
        _, index = pd.factorize(data_for_grouping, sort=False)

        index = pd.Index(index, name="B")
        expected = pd.Series([1, 3, 4], index=index, name="A")
        self.assert_series_equal(result, expected) 
Example #29
Source File: groupby.py    From predictive-maintenance-using-machine-learning with Apache License 2.0
def test_groupby_extension_agg(self, as_index, data_for_grouping):
        df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4],
                           "B": data_for_grouping})
        result = df.groupby("B", as_index=as_index).A.mean()
        _, index = pd.factorize(data_for_grouping, sort=True)

        index = pd.Index(index, name="B")
        expected = pd.Series([3, 1, 4], index=index, name="A")
        if as_index:
            self.assert_series_equal(result, expected)
        else:
            expected = expected.reset_index()
            self.assert_frame_equal(result, expected) 
Example #30
Source File: methods.py    From predictive-maintenance-using-machine-learning with Apache License 2.0
def test_factorize_empty(self, data):
        labels, uniques = pd.factorize(data[:0])
        expected_labels = np.array([], dtype=np.intp)
        expected_uniques = type(data)._from_sequence([], dtype=data[:0].dtype)

        tm.assert_numpy_array_equal(labels, expected_labels)
        self.assert_extension_array_equal(uniques, expected_uniques)