Python pandas.core.algorithms.unique() Examples

The following are 30 code examples of pandas.core.algorithms.unique(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pandas.core.algorithms , or try the search function .
Example #1
Source File: test_algos.py    From vnpy_crypto with MIT License 7 votes vote down vote up
def test_datetime64_dtype_array_returned(self):
        # GH 9431
        expected = np_array_datetime64_compat(
            ['2015-01-03T00:00:00.000000000+0000',
             '2015-01-01T00:00:00.000000000+0000'],
            dtype='M8[ns]')

        dt_index = pd.to_datetime(['2015-01-03T00:00:00.000000000+0000',
                                   '2015-01-01T00:00:00.000000000+0000',
                                   '2015-01-01T00:00:00.000000000+0000'])
        result = algos.unique(dt_index)
        tm.assert_numpy_array_equal(result, expected)
        assert result.dtype == expected.dtype

        s = Series(dt_index)
        result = algos.unique(s)
        tm.assert_numpy_array_equal(result, expected)
        assert result.dtype == expected.dtype

        arr = s.values
        result = algos.unique(arr)
        tm.assert_numpy_array_equal(result, expected)
        assert result.dtype == expected.dtype 
Example #2
Source File: frequencies.py    From recruit with Apache License 2.0 6 votes vote down vote up
def _get_wom_rule(self):
        #         wdiffs = unique(np.diff(self.index.week))
        # We also need -47, -49, -48 to catch index spanning year boundary
        #     if not lib.ismember(wdiffs, set([4, 5, -47, -49, -48])).all():
        #         return None

        weekdays = unique(self.index.weekday)
        if len(weekdays) > 1:
            return None

        week_of_months = unique((self.index.day - 1) // 7)
        # Only attempt to infer up to WOM-4. See #9425
        week_of_months = week_of_months[week_of_months < 4]
        if len(week_of_months) == 0 or len(week_of_months) > 1:
            return None

        # get which week
        week = week_of_months[0] + 1
        wd = int_to_weekday[weekdays[0]]

        return 'WOM-{week}{weekday}'.format(week=week, weekday=wd) 
Example #3
Source File: multi.py    From recruit with Apache License 2.0 6 votes vote down vote up
def _engine(self):
        # Calculate the number of bits needed to represent labels in each
        # level, as log2 of their sizes (including -1 for NaN):
        sizes = np.ceil(np.log2([len(l) + 1 for l in self.levels]))

        # Sum bit counts, starting from the _right_....
        lev_bits = np.cumsum(sizes[::-1])[::-1]

        # ... in order to obtain offsets such that sorting the combination of
        # shifted codes (one for each level, resulting in a unique integer) is
        # equivalent to sorting lexicographically the codes themselves. Notice
        # that each level needs to be shifted by the number of bits needed to
        # represent the _previous_ ones:
        offsets = np.concatenate([lev_bits[1:], [0]]).astype('uint64')

        # Check the total number of bits needed for our representation:
        if lev_bits[0] > 64:
            # The levels would overflow a 64 bit uint - use Python integers:
            return MultiIndexPyIntEngine(self.levels, self.codes, offsets)
        return MultiIndexUIntEngine(self.levels, self.codes, offsets) 
Example #4
Source File: pytables.py    From Computable with MIT License 6 votes vote down vote up
def _reindex_axis(obj, axis, labels, other=None):
    ax = obj._get_axis(axis)
    labels = _ensure_index(labels)

    # try not to reindex even if other is provided
    # if it equals our current index
    if other is not None:
        other = _ensure_index(other)
    if (other is None or labels.equals(other)) and labels.equals(ax):
        return obj

    labels = _ensure_index(labels.unique())
    if other is not None:
        labels = labels & _ensure_index(other.unique())
    if not labels.equals(ax):
        slicer = [slice(None, None)] * obj.ndim
        slicer[axis] = labels
        obj = obj.loc[tuple(slicer)]
    return obj 
Example #5
Source File: frequencies.py    From predictive-maintenance-using-machine-learning with Apache License 2.0 6 votes vote down vote up
def _get_wom_rule(self):
        #         wdiffs = unique(np.diff(self.index.week))
        # We also need -47, -49, -48 to catch index spanning year boundary
        #     if not lib.ismember(wdiffs, set([4, 5, -47, -49, -48])).all():
        #         return None

        weekdays = unique(self.index.weekday)
        if len(weekdays) > 1:
            return None

        week_of_months = unique((self.index.day - 1) // 7)
        # Only attempt to infer up to WOM-4. See #9425
        week_of_months = week_of_months[week_of_months < 4]
        if len(week_of_months) == 0 or len(week_of_months) > 1:
            return None

        # get which week
        week = week_of_months[0] + 1
        wd = int_to_weekday[weekdays[0]]

        return 'WOM-{week}{weekday}'.format(week=week, weekday=wd) 
Example #6
Source File: frequencies.py    From Computable with MIT License 6 votes vote down vote up
def _get_wom_rule(self):
#         wdiffs = unique(np.diff(self.index.week))
        #We also need -47, -49, -48 to catch index spanning year boundary
#         if not lib.ismember(wdiffs, set([4, 5, -47, -49, -48])).all():
#             return None

        weekdays = unique(self.index.weekday)
        if len(weekdays) > 1:
            return None

        week_of_months = unique((self.index.day - 1) // 7)
        if len(week_of_months) > 1:
            return None

        # get which week
        week = week_of_months[0] + 1
        wd = _weekday_rule_aliases[weekdays[0]]

        return 'WOM-%d%s' % (week, wd) 
Example #7
Source File: pytables.py    From vnpy_crypto with MIT License 6 votes vote down vote up
def _reindex_axis(obj, axis, labels, other=None):
    ax = obj._get_axis(axis)
    labels = _ensure_index(labels)

    # try not to reindex even if other is provided
    # if it equals our current index
    if other is not None:
        other = _ensure_index(other)
    if (other is None or labels.equals(other)) and labels.equals(ax):
        return obj

    labels = _ensure_index(labels.unique())
    if other is not None:
        labels = _ensure_index(other.unique()) & labels
    if not labels.equals(ax):
        slicer = [slice(None, None)] * obj.ndim
        slicer[axis] = labels
        obj = obj.loc[tuple(slicer)]
    return obj 
Example #8
Source File: test_algos.py    From predictive-maintenance-using-machine-learning with Apache License 2.0 6 votes vote down vote up
def test_datetime64_dtype_array_returned(self):
        # GH 9431
        expected = np_array_datetime64_compat(
            ['2015-01-03T00:00:00.000000000+0000',
             '2015-01-01T00:00:00.000000000+0000'],
            dtype='M8[ns]')

        dt_index = pd.to_datetime(['2015-01-03T00:00:00.000000000',
                                   '2015-01-01T00:00:00.000000000',
                                   '2015-01-01T00:00:00.000000000'])
        result = algos.unique(dt_index)
        tm.assert_numpy_array_equal(result, expected)
        assert result.dtype == expected.dtype

        s = Series(dt_index)
        result = algos.unique(s)
        tm.assert_numpy_array_equal(result, expected)
        assert result.dtype == expected.dtype

        arr = s.values
        result = algos.unique(arr)
        tm.assert_numpy_array_equal(result, expected)
        assert result.dtype == expected.dtype 
Example #9
Source File: test_algos.py    From predictive-maintenance-using-machine-learning with Apache License 2.0 6 votes vote down vote up
def test_timedelta64_dtype_array_returned(self):
        # GH 9431
        expected = np.array([31200, 45678, 10000], dtype='m8[ns]')

        td_index = pd.to_timedelta([31200, 45678, 31200, 10000, 45678])
        result = algos.unique(td_index)
        tm.assert_numpy_array_equal(result, expected)
        assert result.dtype == expected.dtype

        s = Series(td_index)
        result = algos.unique(s)
        tm.assert_numpy_array_equal(result, expected)
        assert result.dtype == expected.dtype

        arr = s.values
        result = algos.unique(arr)
        tm.assert_numpy_array_equal(result, expected)
        assert result.dtype == expected.dtype 
Example #10
Source File: pytables.py    From vnpy_crypto with MIT License 6 votes vote down vote up
def write(self, obj, **kwargs):
        super(BlockManagerFixed, self).write(obj, **kwargs)
        data = obj._data
        if not data.is_consolidated():
            data = data.consolidate()

        self.attrs.ndim = data.ndim
        for i, ax in enumerate(data.axes):
            if i == 0:
                if not ax.is_unique:
                    raise ValueError(
                        "Columns index has to be unique for fixed format")
            self.write_index('axis%d' % i, ax)

        # Supporting mixed-type DataFrame objects...nontrivial
        self.attrs.nblocks = len(data.blocks)
        for i, blk in enumerate(data.blocks):
            # I have no idea why, but writing values before items fixed #2299
            blk_items = data.items.take(blk.mgr_locs)
            self.write_array('block%d_values' % i, blk.values, items=blk_items)
            self.write_index('block%d_items' % i, blk_items) 
Example #11
Source File: pytables.py    From recruit with Apache License 2.0 6 votes vote down vote up
def _reindex_axis(obj, axis, labels, other=None):
    ax = obj._get_axis(axis)
    labels = ensure_index(labels)

    # try not to reindex even if other is provided
    # if it equals our current index
    if other is not None:
        other = ensure_index(other)
    if (other is None or labels.equals(other)) and labels.equals(ax):
        return obj

    labels = ensure_index(labels.unique())
    if other is not None:
        labels = ensure_index(other.unique()).intersection(labels, sort=False)
    if not labels.equals(ax):
        slicer = [slice(None, None)] * obj.ndim
        slicer[axis] = labels
        obj = obj.loc[tuple(slicer)]
    return obj 
Example #12
Source File: test_algos.py    From recruit with Apache License 2.0 6 votes vote down vote up
def test_timedelta64_dtype_array_returned(self):
        # GH 9431
        expected = np.array([31200, 45678, 10000], dtype='m8[ns]')

        td_index = pd.to_timedelta([31200, 45678, 31200, 10000, 45678])
        result = algos.unique(td_index)
        tm.assert_numpy_array_equal(result, expected)
        assert result.dtype == expected.dtype

        s = Series(td_index)
        result = algos.unique(s)
        tm.assert_numpy_array_equal(result, expected)
        assert result.dtype == expected.dtype

        arr = s.values
        result = algos.unique(arr)
        tm.assert_numpy_array_equal(result, expected)
        assert result.dtype == expected.dtype 
Example #13
Source File: test_algos.py    From recruit with Apache License 2.0 6 votes vote down vote up
def test_datetime64_dtype_array_returned(self):
        # GH 9431
        expected = np_array_datetime64_compat(
            ['2015-01-03T00:00:00.000000000+0000',
             '2015-01-01T00:00:00.000000000+0000'],
            dtype='M8[ns]')

        dt_index = pd.to_datetime(['2015-01-03T00:00:00.000000000',
                                   '2015-01-01T00:00:00.000000000',
                                   '2015-01-01T00:00:00.000000000'])
        result = algos.unique(dt_index)
        tm.assert_numpy_array_equal(result, expected)
        assert result.dtype == expected.dtype

        s = Series(dt_index)
        result = algos.unique(s)
        tm.assert_numpy_array_equal(result, expected)
        assert result.dtype == expected.dtype

        arr = s.values
        result = algos.unique(arr)
        tm.assert_numpy_array_equal(result, expected)
        assert result.dtype == expected.dtype 
Example #14
Source File: test_algos.py    From vnpy_crypto with MIT License 5 votes vote down vote up
def test_get_unique(self):
        s = Series([1, 2, 2**63, 2**63], dtype=np.uint64)
        exp = np.array([1, 2, 2**63], dtype=np.uint64)
        tm.assert_numpy_array_equal(s.unique(), exp) 
Example #15
Source File: test_algos.py    From predictive-maintenance-using-machine-learning with Apache License 2.0 5 votes vote down vote up
def test_hashtable_unique(self, htable, tm_dtype, writable):
        # output of maker has guaranteed unique elements
        maker = getattr(tm, 'make' + tm_dtype + 'Index')
        s = Series(maker(1000))
        if htable == ht.Float64HashTable:
            # add NaN for float column
            s.loc[500] = np.nan
        elif htable == ht.PyObjectHashTable:
            # use different NaN types for object column
            s.loc[500:502] = [np.nan, None, pd.NaT]

        # create duplicated selection
        s_duplicated = s.sample(frac=3, replace=True).reset_index(drop=True)
        s_duplicated.values.setflags(write=writable)

        # drop_duplicates has own cython code (hash_table_func_helper.pxi)
        # and is tested separately; keeps first occurrence like ht.unique()
        expected_unique = s_duplicated.drop_duplicates(keep='first').values
        result_unique = htable().unique(s_duplicated.values)
        tm.assert_numpy_array_equal(result_unique, expected_unique)

        # test return_inverse=True
        # reconstruction can only succeed if the inverse is correct
        result_unique, result_inverse = htable().unique(s_duplicated.values,
                                                        return_inverse=True)
        tm.assert_numpy_array_equal(result_unique, expected_unique)
        reconstr = result_unique[result_inverse]
        tm.assert_numpy_array_equal(reconstr, s_duplicated.values) 
Example #16
Source File: test_algos.py    From predictive-maintenance-using-machine-learning with Apache License 2.0 5 votes vote down vote up
def test_do_not_mangle_na_values(self, unique_nulls_fixture,
                                     unique_nulls_fixture2):
        # GH 22295
        if unique_nulls_fixture is unique_nulls_fixture2:
            return  # skip it, values not unique
        a = np.array([unique_nulls_fixture,
                      unique_nulls_fixture2], dtype=np.object)
        result = pd.unique(a)
        assert result.size == 2
        assert a[0] is unique_nulls_fixture
        assert a[1] is unique_nulls_fixture2 
Example #17
Source File: test_algos.py    From predictive-maintenance-using-machine-learning with Apache License 2.0 5 votes vote down vote up
def test_different_nans(self):
        # GH 21866
        # create different nans from bit-patterns:
        NAN1 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000000))[0]
        NAN2 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000001))[0]
        assert NAN1 != NAN1
        assert NAN2 != NAN2
        a = np.array([NAN1, NAN2])  # NAN1 and NAN2 are equivalent
        result = pd.unique(a)
        expected = np.array([np.nan])
        tm.assert_numpy_array_equal(result, expected) 
Example #18
Source File: test_algos.py    From predictive-maintenance-using-machine-learning with Apache License 2.0 5 votes vote down vote up
def test_signed_zero(self):
        # GH 21866
        a = np.array([-0.0, 0.0])
        result = pd.unique(a)
        expected = np.array([-0.0])  # 0.0 and -0.0 are equivalent
        tm.assert_numpy_array_equal(result, expected) 
Example #19
Source File: test_algos.py    From predictive-maintenance-using-machine-learning with Apache License 2.0 5 votes vote down vote up
def test_unique_tuples(self, arr, unique):
        # https://github.com/pandas-dev/pandas/issues/16519
        expected = np.empty(len(unique), dtype=object)
        expected[:] = unique

        result = pd.unique(arr)
        tm.assert_numpy_array_equal(result, expected) 
Example #20
Source File: test_algos.py    From vnpy_crypto with MIT License 5 votes vote down vote up
def test_obj_none_preservation(self):
        # GH 20866
        arr = np.array(['foo', None], dtype=object)
        result = pd.unique(arr)
        expected = np.array(['foo', None], dtype=object)

        tm.assert_numpy_array_equal(result, expected, strict_nan=True) 
Example #21
Source File: test_algos.py    From predictive-maintenance-using-machine-learning with Apache License 2.0 5 votes vote down vote up
def test_hashtable_factorize(self, htable, tm_dtype, writable):
        # output of maker has guaranteed unique elements
        maker = getattr(tm, 'make' + tm_dtype + 'Index')
        s = Series(maker(1000))
        if htable == ht.Float64HashTable:
            # add NaN for float column
            s.loc[500] = np.nan
        elif htable == ht.PyObjectHashTable:
            # use different NaN types for object column
            s.loc[500:502] = [np.nan, None, pd.NaT]

        # create duplicated selection
        s_duplicated = s.sample(frac=3, replace=True).reset_index(drop=True)
        s_duplicated.values.setflags(write=writable)
        na_mask = s_duplicated.isna().values

        result_unique, result_inverse = htable().factorize(s_duplicated.values)

        # drop_duplicates has own cython code (hash_table_func_helper.pxi)
        # and is tested separately; keeps first occurrence like ht.factorize()
        # since factorize removes all NaNs, we do the same here
        expected_unique = s_duplicated.dropna().drop_duplicates().values
        tm.assert_numpy_array_equal(result_unique, expected_unique)

        # reconstruction can only succeed if the inverse is correct. Since
        # factorize removes the NaNs, those have to be excluded here as well
        result_reconstruct = result_unique[result_inverse[~na_mask]]
        expected_reconstruct = s_duplicated.dropna().values
        tm.assert_numpy_array_equal(result_reconstruct, expected_reconstruct) 
Example #22
Source File: pytables.py    From Computable with MIT License 5 votes vote down vote up
def unique(self, key, column, **kwargs):
        warnings.warn("unique(key,column) is deprecated\n"
                      "use select_column(key,column).unique() instead",
                      FutureWarning)
        return self.get_storer(key).read_column(column=column,
                                                **kwargs).unique() 
Example #23
Source File: test_algos.py    From vnpy_crypto with MIT License 5 votes vote down vote up
def test_unique_tuples(self, arr, unique):
        # https://github.com/pandas-dev/pandas/issues/16519
        expected = np.empty(len(unique), dtype=object)
        expected[:] = unique

        result = pd.unique(arr)
        tm.assert_numpy_array_equal(result, expected) 
Example #24
Source File: multi.py    From vnpy_crypto with MIT License 5 votes vote down vote up
def unique(self, level=None):

        if level is None:
            return super(MultiIndex, self).unique()
        else:
            level = self._get_level_number(level)
            return self._get_level_values(level=level, unique=True) 
Example #25
Source File: frequencies.py    From Computable with MIT License 5 votes vote down vote up
def _get_annual_rule(self):
        if len(self.ydiffs) > 1:
            return None

        if len(algos.unique(self.fields['M'])) > 1:
            return None

        pos_check = self.month_position_check()
        return {'cs': 'AS', 'bs': 'BAS',
                'ce': 'A', 'be': 'BA'}.get(pos_check) 
Example #26
Source File: test_algos.py    From vnpy_crypto with MIT License 5 votes vote down vote up
def test_unique_label_indices():

    a = np.random.randint(1, 1 << 10, 1 << 15).astype('i8')

    left = unique_label_indices(a)
    right = np.unique(a, return_index=True)[1]

    tm.assert_numpy_array_equal(left, right,
                                check_dtype=False)

    a[np.random.choice(len(a), 10)] = -1
    left = unique_label_indices(a)
    right = np.unique(a, return_index=True)[1][1:]
    tm.assert_numpy_array_equal(left, right,
                                check_dtype=False) 
Example #27
Source File: multi.py    From vnpy_crypto with MIT License 5 votes vote down vote up
def _verify_integrity(self, labels=None, levels=None):
        """

        Parameters
        ----------
        labels : optional list
            Labels to check for validity. Defaults to current labels.
        levels : optional list
            Levels to check for validity. Defaults to current levels.

        Raises
        ------
        ValueError
            If length of levels and labels don't match, if any label would
            exceed level bounds, or there are any duplicate levels.
        """
        # NOTE: Currently does not check, among other things, that cached
        # nlevels matches nor that sortorder matches actually sortorder.
        labels = labels or self.labels
        levels = levels or self.levels

        if len(levels) != len(labels):
            raise ValueError("Length of levels and labels must match. NOTE:"
                             " this index is in an inconsistent state.")
        label_length = len(self.labels[0])
        for i, (level, label) in enumerate(zip(levels, labels)):
            if len(label) != label_length:
                raise ValueError("Unequal label lengths: %s" %
                                 ([len(lab) for lab in labels]))
            if len(label) and label.max() >= len(level):
                raise ValueError("On level %d, label max (%d) >= length of"
                                 " level  (%d). NOTE: this index is in an"
                                 " inconsistent state" % (i, label.max(),
                                                          len(level)))
            if not level.is_unique:
                raise ValueError("Level values must be unique: {values} on "
                                 "level {level}".format(
                                     values=[value for value in level],
                                     level=i)) 
Example #28
Source File: multi.py    From vnpy_crypto with MIT License 5 votes vote down vote up
def _get_level_values(self, level, unique=False):
        """
        Return vector of label values for requested level,
        equal to the length of the index

        **this is an internal method**

        Parameters
        ----------
        level : int level
        unique : bool, default False
            if True, drop duplicated values

        Returns
        -------
        values : ndarray
        """

        values = self.levels[level]
        labels = self.labels[level]
        if unique:
            labels = algos.unique(labels)
        filled = algos.take_1d(values._values, labels,
                               fill_value=values._na_value)
        values = values._shallow_copy(filled)
        return values 
Example #29
Source File: datetimes.py    From vnpy_crypto with MIT License 5 votes vote down vote up
def _convert_and_box_cache(arg, cache_array, box, errors, name=None):
    """
    Convert array of dates with a cache and box the result

    Parameters
    ----------
    arg : integer, float, string, datetime, list, tuple, 1-d array, Series
    cache_array : Series
        Cache of converted, unique dates
    box : boolean
        True boxes result as an Index-like, False returns an ndarray
    errors : string
        'ignore' plus box=True will convert result to Index
    name : string, default None
        Name for a DatetimeIndex

    Returns
    -------
    result : datetime of converted dates
        Returns:

        - Index-like if box=True
        - ndarray if box=False
    """
    from pandas import Series, DatetimeIndex, Index
    result = Series(arg).map(cache_array)
    if box:
        if errors == 'ignore':
            return Index(result)
        else:
            return DatetimeIndex(result, name=name)
    return result.values 
Example #30
Source File: datetimes.py    From vnpy_crypto with MIT License 5 votes vote down vote up
def _maybe_cache(arg, format, cache, tz, convert_listlike):
    """
    Create a cache of unique dates from an array of dates

    Parameters
    ----------
    arg : integer, float, string, datetime, list, tuple, 1-d array, Series
    format : string
        Strftime format to parse time
    cache : boolean
        True attempts to create a cache of converted values
    tz : string
        Timezone of the dates
    convert_listlike : function
        Conversion function to apply on dates

    Returns
    -------
    cache_array : Series
        Cache of converted, unique dates. Can be empty
    """
    from pandas import Series
    cache_array = Series()
    if cache:
        # Perform a quicker unique check
        from pandas import Index
        if not Index(arg).is_unique:
            unique_dates = algorithms.unique(arg)
            cache_dates = convert_listlike(unique_dates, True, format, tz=tz)
            cache_array = Series(cache_dates, index=unique_dates)
    return cache_array