Python pandas.util.hash_pandas_object() Examples

The following are 30 code examples of pandas.util.hash_pandas_object(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pandas.util, or try the search function.
Example #1
Source File: test_hashing.py    From twitter-stock-recommendation with MIT License 6 votes vote down vote up
def test_categorical_consistency(self):
        # GH15143
        # Check that categoricals hash consistent with their values, not codes
        # This should work for categoricals of any dtype
        for s1 in [Series(['a', 'b', 'c', 'd']),
                   Series([1000, 2000, 3000, 4000]),
                   Series(pd.date_range(0, periods=4))]:
            s2 = s1.astype('category').cat.set_categories(s1)
            s3 = s2.cat.set_categories(list(reversed(s1)))
            for categorize in [True, False]:
                # These should all hash identically
                h1 = hash_pandas_object(s1, categorize=categorize)
                h2 = hash_pandas_object(s2, categorize=categorize)
                h3 = hash_pandas_object(s3, categorize=categorize)
                tm.assert_series_equal(h1, h2)
                tm.assert_series_equal(h1, h3) 
Example #2
Source File: test_hashing.py    From elasticintel with GNU General Public License v3.0 6 votes vote down vote up
def test_categorical_consistency(self):
        # GH15143
        # Check that categoricals hash consistent with their values, not codes
        # This should work for categoricals of any dtype
        for s1 in [Series(['a', 'b', 'c', 'd']),
                   Series([1000, 2000, 3000, 4000]),
                   Series(pd.date_range(0, periods=4))]:
            s2 = s1.astype('category').cat.set_categories(s1)
            s3 = s2.cat.set_categories(list(reversed(s1)))
            for categorize in [True, False]:
                # These should all hash identically
                h1 = hash_pandas_object(s1, categorize=categorize)
                h2 = hash_pandas_object(s2, categorize=categorize)
                h3 = hash_pandas_object(s3, categorize=categorize)
                tm.assert_series_equal(h1, h2)
                tm.assert_series_equal(h1, h3) 
Example #3
Source File: test_hashing.py    From vnpy_crypto with MIT License 6 votes vote down vote up
def test_categorical_consistency(self):
        # GH15143
        # Check that categoricals hash consistent with their values, not codes
        # This should work for categoricals of any dtype
        for s1 in [Series(['a', 'b', 'c', 'd']),
                   Series([1000, 2000, 3000, 4000]),
                   Series(pd.date_range(0, periods=4))]:
            s2 = s1.astype('category').cat.set_categories(s1)
            s3 = s2.cat.set_categories(list(reversed(s1)))
            for categorize in [True, False]:
                # These should all hash identically
                h1 = hash_pandas_object(s1, categorize=categorize)
                h2 = hash_pandas_object(s2, categorize=categorize)
                h3 = hash_pandas_object(s3, categorize=categorize)
                tm.assert_series_equal(h1, h2)
                tm.assert_series_equal(h1, h3) 
Example #4
Source File: test_hashing.py    From elasticintel with GNU General Public License v3.0 5 votes vote down vote up
def test_invalid_key(self):
    """Hashing with the hash key 'foo' is expected to raise ValueError."""
    # The hash key only matters for object dtypes, hence the string Series.
    with pytest.raises(ValueError):
        hash_pandas_object(Series(list('abc')), hash_key='foo')
Example #5
Source File: test_hashing.py    From recruit with Apache License 2.0 5 votes vote down vote up
def _check_equal(obj, **kwargs):
    """
    Assert that hashing the same object twice gives equal results.

    Parameters
    ----------
    obj : object
        The object to hash.
    kwargs : kwargs
        Keyword arguments forwarded to the hashing function.
    """
    first, second = (hash_pandas_object(obj, **kwargs) for _ in range(2))
    tm.assert_series_equal(first, second)
Example #6
Source File: test_hashing.py    From predictive-maintenance-using-machine-learning with Apache License 2.0 5 votes vote down vote up
def test_invalid_key():
    """An invalid ``hash_key`` must raise ValueError with the expected message."""
    # The hash key is only relevant for object dtypes.
    expected_message = "key should be a 16-byte string encoded"

    with pytest.raises(ValueError, match=expected_message):
        hash_pandas_object(Series(list("abc")), hash_key="foo")
Example #7
Source File: view_4c_analysis_baseline.py    From aurum-datadiscovery with MIT License 5 votes vote down vote up
def identify_compatible_groups(dataframes_with_metadata):
    """Group views whose row hashes add up to the same total.

    Each view is hashed with ``hash_pandas_object(..., index=False)`` and the
    per-row hashes are summed. Two views are put in the same group when
    those sums are equal; because a sum is order-independent, the order of
    the rows does not influence the comparison.

    Parameters
    ----------
    dataframes_with_metadata : iterable of (dataframe, path, metadata)
        The views to compare. Only the dataframe and the path are used; the
        metadata element is ignored.

    Returns
    -------
    list of list
        One list of paths per group, in input order. A view that matches no
        other view forms a group of its own.
    """
    # Materialise the input so it can be walked several times even when the
    # caller passes a one-shot iterable.
    views = list(dataframes_with_metadata)

    # Hash every view exactly once. Re-hashing inside the nested loop would
    # cost a full-frame hash for every pair of views.
    totals = [
        hash_pandas_object(table, index=False).sum() for table, _, _ in views
    ]

    already_classified = set()
    compatible_groups = []

    for (_, path1, _), total1 in zip(views, totals):
        if path1 in already_classified:
            continue
        # Paths of every view compatible with this one, itself included.
        compatible_group = [path1]
        for (_, path2, _), total2 in zip(views, totals):
            # Skip the view itself and views already assigned to a group.
            if path1 == path2 or path2 in already_classified:
                continue
            if total1 == total2:
                compatible_group.append(path2)
                already_classified.add(path1)
                already_classified.add(path2)
        # Single-element groups are kept on purpose: every view must appear
        # in exactly one group, so no ``len(compatible_group) > 1`` filter.
        compatible_groups.append(compatible_group)
    return compatible_groups
Example #8
Source File: test_hashing.py    From elasticintel with GNU General Public License v3.0 5 votes vote down vote up
def test_consistency(self):
        # check that our hash doesn't change because of a mistake
        # in the actual code; this is the ground truth
        result = hash_pandas_object(Index(['foo', 'bar', 'baz']))
        expected = Series(np.array([3600424527151052760, 1374399572096150070,
                                    477881037637427054], dtype='uint64'),
                          index=['foo', 'bar', 'baz'])
        tm.assert_series_equal(result, expected) 
Example #9
Source File: test_hashing.py    From elasticintel with GNU General Public License v3.0 5 votes vote down vote up
def check_equal(self, obj, **kwargs):
    """Assert that hashing ``obj`` is repeatable.

    The check runs twice: first with ``kwargs`` as given, then again after
    any ``index`` keyword has been removed from them.
    """
    for drop_index_keyword in (False, True):
        if drop_index_keyword:
            # Second pass: discard an explicit ``index`` choice, if present.
            kwargs.pop('index', None)
        first = hash_pandas_object(obj, **kwargs)
        second = hash_pandas_object(obj, **kwargs)
        tm.assert_series_equal(first, second)
Example #10
Source File: test_hashing.py    From elasticintel with GNU General Public License v3.0 5 votes vote down vote up
def check_not_equal_with_index(self, obj):
    """Assert that including the index changes the hash of ``obj``.

    ``Index`` inputs are skipped, and for empty objects nothing is asserted.
    """
    if isinstance(obj, Index):
        return

    with_index = hash_pandas_object(obj, index=True)
    without_index = hash_pandas_object(obj, index=False)
    if len(obj):
        # At least one element must hash differently once the index is
        # part of the hash.
        assert not (with_index == without_index).all()
Example #11
Source File: test_hashing.py    From elasticintel with GNU General Public License v3.0 5 votes vote down vote up
def test_hash_tuples(self):
    """``hash_tuples`` must agree with hashing the equivalent MultiIndex."""
    tups = [(1, 'one'), (1, 'two'), (2, 'one')]

    # A list of tuples is compared against the MultiIndex built from it.
    list_hashes = hash_tuples(tups)
    expected = hash_pandas_object(MultiIndex.from_tuples(tups)).values
    tm.assert_numpy_array_equal(list_hashes, expected)

    # A single tuple must hash like the first entry of the list.
    single_hash = hash_tuples(tups[0])
    assert single_hash == expected[0]
Example #12
Source File: test_hashing.py    From elasticintel with GNU General Public License v3.0 5 votes vote down vote up
def test_multiindex_unique(self):
        mi = MultiIndex.from_tuples([(118, 472), (236, 118),
                                     (51, 204), (102, 51)])
        assert mi.is_unique
        result = hash_pandas_object(mi)
        assert result.is_unique 
Example #13
Source File: test_hashing.py    From elasticintel with GNU General Public License v3.0 5 votes vote down vote up
def test_pandas_errors(self):
    """Hashing a Timestamp or a Panel is expected to raise TypeError."""
    with pytest.raises(TypeError):
        hash_pandas_object(pd.Timestamp('20130101'))

    # Warnings raised while building the Panel are recorded, not emitted.
    with catch_warnings(record=True):
        panel = tm.makePanel()
    with pytest.raises(TypeError):
        hash_pandas_object(panel)
Example #14
Source File: test_hashing.py    From elasticintel with GNU General Public License v3.0 5 votes vote down vote up
def test_hash_keys(self):
        # using different hash keys, should have different hashes
        # for the same data

        # this only matters for object dtypes
        obj = Series(list('abc'))
        a = hash_pandas_object(obj, hash_key='9876543210123456')
        b = hash_pandas_object(obj, hash_key='9876543210123465')
        assert (a != b).all() 
Example #15
Source File: test_hashing.py    From predictive-maintenance-using-machine-learning with Apache License 2.0 5 votes vote down vote up
def test_hash_keys():
    """Hashing the same data with two different keys must never agree."""
    # Only object dtypes are affected by the hash key.
    letters = Series(list("abc"))
    hashes_by_key = [
        hash_pandas_object(letters, hash_key=key)
        for key in ("9876543210123456", "9876543210123465")
    ]

    assert (hashes_by_key[0] != hashes_by_key[1]).all()
Example #16
Source File: test_hashing.py    From elasticintel with GNU General Public License v3.0 5 votes vote down vote up
def test_deprecation():
    """Using the hashing helpers via ``pandas.tools.hashing`` must warn.

    Both the import and the call sit inside the warning check, so a
    DeprecationWarning from either of them satisfies it.
    """
    key = '9876543210123456'

    with tm.assert_produces_warning(DeprecationWarning,
                                    check_stacklevel=False):
        from pandas.tools.hashing import hash_pandas_object
        strings = Series(list('abc'))
        hash_pandas_object(strings, hash_key=key)

    with tm.assert_produces_warning(DeprecationWarning,
                                    check_stacklevel=False):
        from pandas.tools.hashing import hash_array
        numbers = np.array([1, 2, 3])
        hash_array(numbers, hash_key=key)
Example #17
Source File: test_hashing.py    From twitter-stock-recommendation with MIT License 5 votes vote down vote up
def test_consistency(self):
        # check that our hash doesn't change because of a mistake
        # in the actual code; this is the ground truth
        result = hash_pandas_object(Index(['foo', 'bar', 'baz']))
        expected = Series(np.array([3600424527151052760, 1374399572096150070,
                                    477881037637427054], dtype='uint64'),
                          index=['foo', 'bar', 'baz'])
        tm.assert_series_equal(result, expected) 
Example #18
Source File: test_hashing.py    From twitter-stock-recommendation with MIT License 5 votes vote down vote up
def check_equal(self, obj, **kwargs):
    """Assert that two hashes of ``obj`` with the same options are equal.

    Performed once with ``kwargs`` unchanged and once more after the
    ``index`` keyword, if any, has been dropped.
    """
    for second_round in (False, True):
        if second_round:
            # Remove an explicit ``index`` option before re-checking.
            kwargs.pop('index', None)
        hash_one = hash_pandas_object(obj, **kwargs)
        hash_two = hash_pandas_object(obj, **kwargs)
        tm.assert_series_equal(hash_one, hash_two)
Example #19
Source File: test_hashing.py    From twitter-stock-recommendation with MIT License 5 votes vote down vote up
def check_not_equal_with_index(self, obj):
    """Assert that hashing with and without the index gives different results.

    Nothing is checked for ``Index`` inputs, and nothing is asserted for
    empty objects.
    """
    if isinstance(obj, Index):
        return

    hashed_with = hash_pandas_object(obj, index=True)
    hashed_without = hash_pandas_object(obj, index=False)
    if len(obj):
        # The two results must not be equal element for element.
        assert not (hashed_with == hashed_without).all()
Example #20
Source File: test_hashing.py    From twitter-stock-recommendation with MIT License 5 votes vote down vote up
def test_hash_tuples(self):
    """Hashing tuples must match hashing the MultiIndex built from them."""
    tups = [(1, 'one'), (1, 'two'), (2, 'one')]

    # Whole list versus the equivalent MultiIndex.
    from_list = hash_tuples(tups)
    expected = hash_pandas_object(MultiIndex.from_tuples(tups)).values
    tm.assert_numpy_array_equal(from_list, expected)

    # One tuple on its own versus the first expected hash.
    from_single = hash_tuples(tups[0])
    assert from_single == expected[0]
Example #21
Source File: test_hashing.py    From twitter-stock-recommendation with MIT License 5 votes vote down vote up
def test_multiindex_unique(self):
        mi = MultiIndex.from_tuples([(118, 472), (236, 118),
                                     (51, 204), (102, 51)])
        assert mi.is_unique
        result = hash_pandas_object(mi)
        assert result.is_unique 
Example #22
Source File: test_hashing.py    From twitter-stock-recommendation with MIT License 5 votes vote down vote up
def test_pandas_errors(self):
    """A Timestamp and a Panel are both expected to be rejected with TypeError."""
    timestamp = pd.Timestamp('20130101')
    with pytest.raises(TypeError):
        hash_pandas_object(timestamp)

    # Record, rather than emit, any warnings from creating the Panel.
    with catch_warnings(record=True):
        panel = tm.makePanel()
    with pytest.raises(TypeError):
        hash_pandas_object(panel)
Example #23
Source File: test_hashing.py    From twitter-stock-recommendation with MIT License 5 votes vote down vote up
def test_hash_keys(self):
        # using different hash keys, should have different hashes
        # for the same data

        # this only matters for object dtypes
        obj = Series(list('abc'))
        a = hash_pandas_object(obj, hash_key='9876543210123456')
        b = hash_pandas_object(obj, hash_key='9876543210123465')
        assert (a != b).all() 
Example #24
Source File: test_hashing.py    From twitter-stock-recommendation with MIT License 5 votes vote down vote up
def test_invalid_key(self):
    """The hash key 'foo' must be rejected with ValueError."""
    # Only object dtypes use the hash key, so strings are hashed here.
    with pytest.raises(ValueError):
        hash_pandas_object(Series(list('abc')), hash_key='foo')
Example #25
Source File: helpers.py    From siuba with MIT License 5 votes vote down vote up
def load_cached_df(self, df):
    """Return ``self.load_df(df)``, reusing a cached result when one exists.

    The cache key is the SHA-256 hex digest of the values returned by
    ``hash_pandas_object(df, index=True)``.
    """
    import hashlib
    from pandas import util

    pandas_hashes = util.hash_pandas_object(df, index=True).values
    cache_key = hashlib.sha256(pandas_hashes).hexdigest()

    if cache_key not in self.cache:
        # First time this content is seen: load it and remember the result.
        loaded = self.load_df(df)
        self.cache[cache_key] = loaded
        return loaded

    return self.cache[cache_key]
Example #26
Source File: test_hashing.py    From vnpy_crypto with MIT License 5 votes vote down vote up
def test_hash_tuples(self):
    """``hash_tuples`` and the MultiIndex hash must give the same values."""
    tups = [(1, 'one'), (1, 'two'), (2, 'one')]

    # Hash the full list and compare with the MultiIndex equivalent.
    hashed_list = hash_tuples(tups)
    expected = hash_pandas_object(MultiIndex.from_tuples(tups)).values
    tm.assert_numpy_array_equal(hashed_list, expected)

    # Hash just the first tuple and compare with the first expected value.
    hashed_first = hash_tuples(tups[0])
    assert hashed_first == expected[0]
Example #27
Source File: test_hashing.py    From recruit with Apache License 2.0 5 votes vote down vote up
def _check_not_equal_with_index(obj):
    """
    Assert that an object hashes differently with and without its index.

    ``Index`` inputs are skipped, and nothing is asserted for empty objects.

    Parameters
    ----------
    obj : object
        The object to hash.
    """
    if isinstance(obj, Index):
        return

    with_index = hash_pandas_object(obj, index=True)
    without_index = hash_pandas_object(obj, index=False)

    if len(obj):
        assert not (with_index == without_index).all()
Example #28
Source File: test_hashing.py    From recruit with Apache License 2.0 5 votes vote down vote up
def test_consistency():
    """Pin the hashes of a known Index against fixed ground-truth values."""
    labels = ["foo", "bar", "baz"]
    # Ground truth: a different result means the hashing code changed its
    # output, which should never happen by mistake.
    known_hashes = np.array(
        [3600424527151052760, 1374399572096150070, 477881037637427054],
        dtype="uint64",
    )
    result = hash_pandas_object(Index(["foo", "bar", "baz"]))
    expected = Series(known_hashes, index=labels)
    tm.assert_series_equal(result, expected)
Example #29
Source File: test_hashing.py    From recruit with Apache License 2.0 5 votes vote down vote up
def test_hash_tuples():
    """``hash_tuples`` must match the hash of the equivalent MultiIndex."""
    tuples = [(1, "one"), (1, "two"), (2, "one")]

    # List of tuples versus the MultiIndex built from the same tuples.
    list_hashes = hash_tuples(tuples)
    expected = hash_pandas_object(MultiIndex.from_tuples(tuples)).values
    tm.assert_numpy_array_equal(list_hashes, expected)

    # A lone tuple must hash like the first element of the list.
    single_hash = hash_tuples(tuples[0])
    assert single_hash == expected[0]
Example #30
Source File: test_hashing.py    From recruit with Apache License 2.0 5 votes vote down vote up
def test_multiindex_unique():
    """Hashing a unique MultiIndex must yield unique hashes."""
    pairs = [(118, 472), (236, 118), (51, 204), (102, 51)]
    mi = MultiIndex.from_tuples(pairs)
    assert mi.is_unique is True

    hashed = hash_pandas_object(mi)
    assert hashed.is_unique is True