Python sklearn.feature_extraction.FeatureHasher() Examples

The following are 30 code examples of sklearn.feature_extraction.FeatureHasher(). You can vote up the examples you like or vote down the ones you don't, and you can go to the original project or source file by following the links above each example. You may also want to check out all available functions and classes of the module sklearn.feature_extraction, or try the search function.
Example #1
Source File: pe_imports_features.py    From driverlessai-recipes with Apache License 2.0 7 votes vote down vote up
def imports_features(self, lief_binary):
        """Hash a binary's imports into two fixed-width groups of features.

        The imported library names are hashed into 256 values and the
        ``library:entry`` names into 1024 values.

        Returns a dict keyed ``Imports_libraries_hash_<i>`` followed by
        ``Imports_entries_hash_<i>``.
        """
        from sklearn.feature_extraction import FeatureHasher

        # Collect the entry names of every imported library, keyed by the
        # library name exactly as reported by the binary.
        by_library = {}
        for library in lief_binary.imports:
            names = by_library.setdefault(library.name, [])
            for entry in library.entries:
                if entry.is_ordinal:
                    names.append("ordinal" + str(entry.ordinal))
                else:
                    # Cap very long names at 10000 characters.
                    names.append(entry.name[:10000])

        # Sorted, de-duplicated, lower-cased library names form one sample.
        library_names = sorted({name.lower() for name in by_library})
        library_row = FeatureHasher(256, input_type='string').transform([library_names]).toarray()[0]
        hashed = {f'Imports_libraries_hash_{idx}': value
                  for idx, value in enumerate(library_row)}

        # Sorted "library:entry" strings form the second sample.
        qualified = sorted([name.lower() + ':' + entry_name
                            for name, entry_names in by_library.items()
                            for entry_name in entry_names])
        entry_row = FeatureHasher(1024, input_type='string').transform([qualified]).toarray()[0]
        hashed.update({f'Imports_entries_hash_{idx}': value
                       for idx, value in enumerate(entry_row)})
        return hashed
Example #2
Source File: test_feature_hasher.py    From twitter-stock-recommendation with MIT License 6 votes vote down vote up
def test_hash_collisions():
    """Hash every token into one bucket and check how collisions combine."""
    tokens = list("Thequickbrownfoxjumped")
    X = [tokens]

    def single_bucket_value(alternate_sign, non_negative):
        # n_features=1 sends every token to the same output column.
        hasher = FeatureHasher(alternate_sign=alternate_sign,
                               non_negative=non_negative,
                               n_features=1, input_type='string')
        return hasher.fit_transform(X).data[0]

    # check that some of the hashed tokens are added
    # with an opposite sign and cancel out
    assert abs(single_bucket_value(True, False)) < len(tokens)

    assert abs(single_bucket_value(True, True)) < len(tokens)

    # Without sign alternation nothing cancels: the bucket holds the count.
    assert single_bucket_value(False, True) == len(tokens)
Example #3
Source File: test_feature_hasher.py    From twitter-stock-recommendation with MIT License 6 votes vote down vote up
def test_hasher_alternate_sign():
    """Check the sign of hashed values for each alternate_sign/non_negative mix."""
    X = [list("Thequickbrownfoxjumped")]

    def hashed_values(alternate_sign, non_negative):
        hasher = FeatureHasher(alternate_sign=alternate_sign,
                               non_negative=non_negative,
                               input_type='string')
        return hasher.fit_transform(X).data

    signed = hashed_values(True, False)
    assert signed.min() < 0 and signed.max() > 0

    assert hashed_values(True, True).min() > 0

    unsigned_non_negative = hashed_values(False, True)
    assert unsigned_non_negative.min() > 0
    unsigned = hashed_values(False, False)
    # With initially positive features, the non_negative option should
    # have no impact when alternate_sign=False
    assert_array_equal(unsigned_non_negative, unsigned)
Example #4
Source File: test_feature_hasher.py    From Mastering-Elasticsearch-7.0 with MIT License 6 votes vote down vote up
def test_feature_hasher_strings():
    """Hash mixed byte/Unicode tokens at several widths and check the totals."""
    # mix byte and Unicode strings; note that "foo" is a duplicate in row 0
    raw_X = [["foo", "bar", "baz", "foo".encode("ascii")],
             ["bar".encode("ascii"), "baz", "quux"]]

    for exponent in (7, 9, 11, 16, 22):
        n_features = 2 ** exponent

        # Feed a one-shot iterator rather than the list itself.
        samples = iter(raw_X)

        hasher = FeatureHasher(n_features, input_type="string",
                               alternate_sign=False)
        X = hasher.transform(samples)

        assert_equal(X.shape[0], len(raw_X))
        assert_equal(X.shape[1], n_features)

        # Row sums equal the number of tokens in each sample.
        assert_equal(X[0].sum(), 4)
        assert_equal(X[1].sum(), 3)

        assert_equal(X.nnz, 6)
Example #5
Source File: test_feature_hasher.py    From Mastering-Elasticsearch-7.0 with MIT License 6 votes vote down vote up
def test_feature_hasher_pairs_with_string_values():
    """Hash (key, value) pairs whose values mix numbers and strings."""
    hasher = FeatureHasher(n_features=16, input_type="pair")

    mixed_rows = [{"foo": 1, "bar": "a"},
                  {"baz": "abc", "quux": 4, "foo": -1}]
    first, second = hasher.transform(
        iter(row.items()) for row in mixed_rows).toarray()
    # Compare magnitudes only; the sign of each entry is not asserted.
    assert_equal([1, 1], sorted(np.abs(first[first != 0])))
    assert_equal([1, 1, 4], sorted(np.abs(second[second != 0])))

    # Two identical samples must produce identical rows.
    same_rows = [{"bax": "abc"},
                 {"bax": "abc"}]
    first, second = hasher.transform(
        iter(row.items()) for row in same_rows).toarray()
    assert_equal([1], np.abs(first[first != 0]))
    assert_equal([1], np.abs(second[second != 0]))
    assert_array_equal(first, second)
Example #6
Source File: run_model.py    From kaggle_avazu_benchmark with Apache License 2.0 6 votes vote down vote up
def main(neg_rate, submission_num, n_iter, train_path):
    """Fit a hashing + SGD pipeline on ``train_path`` and write a submission.

    Reads ids and rows from 'original_data/test', fits the pipeline on the
    training rows in batches, adjusts the final step's intercept by
    ``log(neg_rate)``, and writes the class-1 probabilities via
    ``pp.write_submission``.
    """
    test_path = 'original_data/test'

    ids = list(pp.get_int_field('id', test_path))
    clicks = pp.get_int_field('click', train_path)
    # Row generators for the training and test data.
    train = pp.data_generator(pp.clean_parse_row, train_path)
    test = pp.data_generator(pp.clean_parse_row, test_path)

    # Estimators.
    # NOTE(review): loss='log' and n_iter look like spellings from older
    # scikit-learn releases; confirm against the pinned version.
    hasher = FeatureHasher(n_features=2 ** 20, input_type='pair')
    classifier = SGDClassifier(loss='log', n_iter=1, alpha=.003, penalty='l2')

    # Fit the pipeline in batches.
    pipeline = ml.PartialFitter(
        [hasher, classifier],
        batch_size=10000,
        logging=True,
        n_iter=n_iter,
        neg_rate=neg_rate,
    )
    pipeline.partial_fit(X=train, y=clicks)

    # Correct the intercept of the last step; presumably this compensates
    # for negative down-sampling at rate neg_rate (TODO confirm).
    pipeline.steps[-1].intercept_[0] += np.log(neg_rate)

    preds = pipeline.predict_proba(newX=test)[:, 1]
    pp.write_submission(number=submission_num, ids=ids, preds=preds)
Example #7
Source File: test_feature_hasher.py    From twitter-stock-recommendation with MIT License 6 votes vote down vote up
def test_feature_hasher_pairs_with_string_values():
    """Hash (key, value) pairs whose values mix numbers and strings."""
    hasher = FeatureHasher(n_features=16, input_type="pair")

    mixed_rows = [{"foo": 1, "bar": "a"},
                  {"baz": u"abc", "quux": 4, "foo": -1}]
    first, second = hasher.transform(
        iter(row.items()) for row in mixed_rows).toarray()
    # Compare magnitudes only; the sign of each entry is not asserted.
    assert_equal([1, 1], sorted(np.abs(first[first != 0])))
    assert_equal([1, 1, 4], sorted(np.abs(second[second != 0])))

    # Two identical samples must produce identical rows.
    same_rows = [{"bax": "abc"},
                 {"bax": "abc"}]
    first, second = hasher.transform(
        iter(row.items()) for row in same_rows).toarray()
    assert_equal([1], np.abs(first[first != 0]))
    assert_equal([1], np.abs(second[second != 0]))
    assert_array_equal(first, second)
Example #8
Source File: test_feature_extraction.py    From pandas-ml with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def test_objectmapper(self):
        """The ModelFrame accessor must expose the very same objects as ``fe``."""
        df = pdml.ModelFrame([])

        # Top-level feature_extraction classes.
        for name in ("DictVectorizer", "FeatureHasher"):
            self.assertIs(getattr(df.feature_extraction, name),
                          getattr(fe, name))

        # feature_extraction.image helpers.
        for name in ("img_to_graph",
                     "grid_to_graph",
                     "extract_patches_2d",
                     "reconstruct_from_patches_2d",
                     "PatchExtractor"):
            self.assertIs(getattr(df.feature_extraction.image, name),
                          getattr(fe.image, name))

        # feature_extraction.text vectorizers and transformers.
        for name in ("CountVectorizer",
                     "HashingVectorizer",
                     "TfidfTransformer",
                     "TfidfVectorizer"):
            self.assertIs(getattr(df.feature_extraction.text, name),
                          getattr(fe.text, name))
Example #9
Source File: test_feature_hasher.py    From twitter-stock-recommendation with MIT License 6 votes vote down vote up
def test_feature_hasher_strings():
    """Hash mixed byte/Unicode tokens at several widths and check the totals."""
    # mix byte and Unicode strings; note that "foo" is a duplicate in row 0
    raw_X = [["foo", "bar", "baz", "foo".encode("ascii")],
             ["bar".encode("ascii"), "baz", "quux"]]

    for exponent in (7, 9, 11, 16, 22):
        n_features = 2 ** exponent

        # Feed a one-shot iterator rather than the list itself.
        samples = iter(raw_X)

        hasher = FeatureHasher(n_features, non_negative=True,
                               input_type="string")
        X = hasher.transform(samples)

        assert_equal(X.shape[0], len(raw_X))
        assert_equal(X.shape[1], n_features)

        # Every stored value is strictly positive.
        assert_true(np.all(X.data > 0))
        # Row sums equal the number of tokens in each sample.
        assert_equal(X[0].sum(), 4)
        assert_equal(X[1].sum(), 3)

        assert_equal(X.nnz, 6)
Example #10
Source File: pefeatures.py    From youarespecial with MIT License 5 votes vote down vote up
def __call__(self, binary):
        """Return a flat vector hashing the binary's libraries and imports.

        Library names are hashed into 256 values and "library:function"
        names into 1024 values; the two are concatenated and cast to
        ``self.dtype``.
        """
        library_names = [name.lower() for name in binary.libraries]
        # Build strings such as "kernel32.dll:CreateFileMappingA", one per
        # imported entry.
        qualified_imports = [lib.name.lower() + ':' + entry.name
                             for lib in binary.imports
                             for entry in lib.entries]

        # Hash the two groups separately: libraries alone, then the
        # fully-qualified imported function names.
        hashed_libraries = FeatureHasher(
            256, input_type="string", dtype=self.dtype
        ).transform([library_names]).toarray()
        hashed_imports = FeatureHasher(
            1024, input_type="string", dtype=self.dtype
        ).transform([qualified_imports]).toarray()

        combined = np.concatenate([hashed_libraries, hashed_imports], axis=-1)
        return combined.flatten().astype(self.dtype)
Example #11
Source File: _machine_learning.py    From qlik-py-tools with MIT License 5 votes vote down vote up
def hasher(df, col, n_features):
        """
        Hash the unique values in the specified column in the given dataframe, creating n_features

        Returns a dataframe indexed by the unique values of ``col`` with one
        column per hashed feature, named ``col`` followed by the feature
        index (e.g. ``<col>0``, ``<col>1``, ...).
        """

        unique = pd.DataFrame(df[col].unique(), columns=[col])
        fh = FeatureHasher(n_features=n_features, input_type="string")
        # With input_type="string" each sample must be an iterable of string
        # tokens. Passing the bare values makes FeatureHasher iterate every
        # value character by character (and newer scikit-learn rejects plain
        # string samples), so wrap each value as a one-token sample.
        # assumes the column holds str values -- TODO confirm for other dtypes
        hashed = fh.fit_transform([[value] for value in unique[col]])
        unique = unique.join(pd.DataFrame(hashed.toarray()).add_prefix(col))
        return unique.set_index(col)
Example #12
Source File: pefeatures.py    From youarespecial with MIT License 5 votes vote down vote up
def __call__(self, binary):
        """Hash the binary's exported function names into a flat 128-value vector."""
        hasher = FeatureHasher(128, input_type="string", dtype=self.dtype)
        # All exported names form a single sample.
        hashed = hasher.transform([binary.exported_functions])
        return hashed.toarray().flatten().astype(self.dtype)
Example #13
Source File: pefeatures.py    From youarespecial with MIT License 5 votes vote down vote up
def __call__(self, binary):
        """Return a flat vector built from the binary's header fields.

        Categorical fields are hashed into 10 values each and numeric fields
        are included as-is; the result is cast to ``self.dtype``.
        """

        def hashed(tokens):
            # Hash one sample of string tokens into a dense row of 10 values.
            return FeatureHasher(10, input_type="string", dtype=self.dtype).transform(
                [tokens]).toarray()

        def scalar(value):
            # Wrap a single number as a 1x1 nested list so it concatenates
            # alongside the hashed rows.
            return [[value]]

        pieces = [
            scalar(binary.header.time_date_stamps),
            hashed([str(binary.header.machine)]),
            hashed([str(c) for c in binary.header.characteristics_list]),
            hashed([str(binary.optional_header.subsystem)]),
            hashed([str(c) for c in binary.optional_header.dll_characteristics_lists]),
            hashed([str(binary.optional_header.magic)]),
            scalar(binary.optional_header.major_image_version),
            scalar(binary.optional_header.minor_image_version),
            scalar(binary.optional_header.major_linker_version),
            scalar(binary.optional_header.minor_linker_version),
            scalar(binary.optional_header.major_operating_system_version),
            scalar(binary.optional_header.minor_operating_system_version),
            scalar(binary.optional_header.major_subsystem_version),
            scalar(binary.optional_header.minor_subsystem_version),
            scalar(binary.optional_header.sizeof_code),
            scalar(binary.optional_header.sizeof_headers),
            scalar(binary.optional_header.sizeof_heap_commit),
        ]
        return np.concatenate(pieces, axis=-1).flatten().astype(self.dtype)
Example #14
Source File: test_feature_hasher.py    From twitter-stock-recommendation with MIT License 5 votes vote down vote up
def test_feature_hasher_dicts():
    """Dict input and the equivalent (key, value) pair input must hash alike."""
    default_hasher = FeatureHasher(n_features=16)
    # "dict" is the input type when none is given.
    assert_equal("dict", default_hasher.input_type)

    samples = [{"foo": "bar", "dada": 42, "tzara": 37},
               {"foo": "baz", "gaga": u"string1"}]
    from_dicts = FeatureHasher(n_features=16).transform(samples)

    pair_samples = (iter(sample.items()) for sample in samples)
    from_pairs = FeatureHasher(n_features=16,
                               input_type="pair").transform(pair_samples)

    assert_array_equal(from_dicts.toarray(), from_pairs.toarray())
Example #15
Source File: test_feature_hasher.py    From twitter-stock-recommendation with MIT License 5 votes vote down vote up
def test_feature_hasher_pairs():
    """Hash numeric (key, value) pairs and check the resulting magnitudes."""
    rows = [{"foo": 1, "bar": 2},
            {"baz": 3, "quux": 4, "foo": -1}]
    hasher = FeatureHasher(n_features=16, input_type="pair")

    first, second = hasher.transform(
        iter(row.items()) for row in rows).toarray()

    # Compare magnitudes only; the sign of each entry is not asserted.
    assert_equal([1, 2], sorted(np.abs(first[first != 0])))
    assert_equal([1, 3, 4], sorted(np.abs(second[second != 0])))
Example #16
Source File: test_feature_hasher.py    From twitter-stock-recommendation with MIT License 5 votes vote down vote up
def test_hash_empty_input():
    """Empty samples (list, tuple, exhausted iterator) hash to all-zero rows."""
    n_features = 16
    raw_X = [[], (), iter(range(0))]

    h = FeatureHasher(n_features=n_features, input_type="string")
    X = h.transform(raw_X)

    # Densify with toarray() rather than the ``.A`` shorthand, which newer
    # SciPy releases no longer provide on sparse matrices.
    assert_array_equal(X.toarray(), np.zeros((len(raw_X), n_features)))
Example #17
Source File: test_feature_hasher.py    From twitter-stock-recommendation with MIT License 5 votes vote down vote up
def test_hasher_set_params():
    """An invalid n_features set via set_params is only rejected by fit."""
    # Test delayed input validation in fit (useful for grid search).
    invalid_params = {"n_features": np.inf}
    fh = FeatureHasher()
    fh.set_params(**invalid_params)
    assert_raises(TypeError, fh.fit)
Example #18
Source File: test_feature_hasher.py    From twitter-stock-recommendation with MIT License 5 votes vote down vote up
def test_hasher_zeros():
    """A feature whose value is 0 must not be stored in the sparse output."""
    # Assert that no zeros are materialized in the output.
    zero_valued = [{'foo': 0}]
    hashed = FeatureHasher().transform(zero_valued)
    assert_equal(hashed.data.shape, (0,))
Example #19
Source File: test_feature_hasher.py    From twitter-stock-recommendation with MIT License 5 votes vote down vote up
def test_hasher_negative():
    """Check output signs for mixed-sign input under every flag combination."""
    X = [{"foo": 2, "bar": -4, "baz": -1}.items()]

    flag_combinations = [(False, False), (False, True),
                         (True, False), (True, True)]
    for alternate_sign, non_negative in flag_combinations:
        Xt = FeatureHasher(alternate_sign=alternate_sign,
                           non_negative=non_negative,
                           input_type="pair").fit_transform(X)
        if non_negative:
            # non_negative=True: every stored value is strictly positive.
            assert_true(Xt.data.min() > 0)
        else:
            # Otherwise both negative and positive values are present.
            assert_true(Xt.data.min() < 0 and Xt.data.max() > 0)
Example #20
Source File: test_feature_hasher.py    From Mastering-Elasticsearch-7.0 with MIT License 5 votes vote down vote up
def test_hasher_alternate_sign():
    """alternate_sign=True yields mixed signs; False yields only positives."""
    X = [list("Thequickbrownfoxjumped")]

    def hashed_values(alternate_sign):
        hasher = FeatureHasher(alternate_sign=alternate_sign,
                               input_type='string')
        return hasher.fit_transform(X).data

    signed = hashed_values(True)
    assert signed.min() < 0 and signed.max() > 0

    unsigned = hashed_values(False)
    assert unsigned.min() > 0
Example #21
Source File: pefeatures.py    From gym-malware with MIT License 5 votes vote down vote up
def __call__(self, binary):
        """Return a flat vector hashing the binary's libraries and imports.

        Library names are hashed into 256 values and "library:function"
        names into 1024 values; the two are concatenated and cast to
        ``self.dtype``.
        """
        library_names = [name.lower() for name in binary.libraries]
        # Build strings such as "kernel32.dll:CreateFileMappingA", one per
        # imported entry.
        qualified_imports = [lib.name.lower() + ':' + entry.name
                             for lib in binary.imports
                             for entry in lib.entries]

        # Hash the two groups separately: libraries alone, then the
        # fully-qualified imported function names.
        hashed_libraries = FeatureHasher(
            256, input_type="string", dtype=self.dtype
        ).transform([library_names]).toarray()
        hashed_imports = FeatureHasher(
            1024, input_type="string", dtype=self.dtype
        ).transform([qualified_imports]).toarray()

        combined = np.concatenate([hashed_libraries, hashed_imports], axis=-1)
        return combined.flatten().astype(self.dtype)
Example #22
Source File: pefeatures.py    From gym-malware with MIT License 5 votes vote down vote up
def __call__(self, binary):
        """Hash the binary's exported function names into a flat 128-value vector."""
        hasher = FeatureHasher(128, input_type="string", dtype=self.dtype)
        # All exported names form a single sample.
        hashed = hasher.transform([binary.exported_functions])
        return hashed.toarray().flatten().astype(self.dtype)
Example #23
Source File: pefeatures.py    From gym-malware with MIT License 5 votes vote down vote up
def __call__(self, binary):
        """Return a flat vector built from the binary's header fields.

        Categorical fields are hashed into 10 values each and numeric fields
        are included as-is; the result is cast to ``self.dtype``.
        """

        def hashed(tokens):
            # Hash one sample of string tokens into a dense row of 10 values.
            return FeatureHasher(10, input_type="string", dtype=self.dtype).transform(
                [tokens]).toarray()

        def scalar(value):
            # Wrap a single number as a 1x1 nested list so it concatenates
            # alongside the hashed rows.
            return [[value]]

        pieces = [
            scalar(binary.header.time_date_stamps),
            hashed([str(binary.header.machine)]),
            hashed([str(c) for c in binary.header.characteristics_list]),
            hashed([str(binary.optional_header.subsystem)]),
            hashed([str(c) for c in binary.optional_header.dll_characteristics_lists]),
            hashed([str(binary.optional_header.magic)]),
            scalar(binary.optional_header.major_image_version),
            scalar(binary.optional_header.minor_image_version),
            scalar(binary.optional_header.major_linker_version),
            scalar(binary.optional_header.minor_linker_version),
            scalar(binary.optional_header.major_operating_system_version),
            scalar(binary.optional_header.minor_operating_system_version),
            scalar(binary.optional_header.major_subsystem_version),
            scalar(binary.optional_header.minor_subsystem_version),
            scalar(binary.optional_header.sizeof_code),
            scalar(binary.optional_header.sizeof_headers),
            scalar(binary.optional_header.sizeof_heap_commit),
        ]
        return np.concatenate(pieces, axis=-1).flatten().astype(self.dtype)
Example #24
Source File: test_feature_hasher.py    From Mastering-Elasticsearch-7.0 with MIT License 5 votes vote down vote up
def test_feature_hasher_dicts():
    """Dict input and the equivalent (key, value) pair input must hash alike."""
    default_hasher = FeatureHasher(n_features=16)
    # "dict" is the input type when none is given.
    assert_equal("dict", default_hasher.input_type)

    samples = [{"foo": "bar", "dada": 42, "tzara": 37},
               {"foo": "baz", "gaga": "string1"}]
    from_dicts = FeatureHasher(n_features=16).transform(samples)

    pair_samples = (iter(sample.items()) for sample in samples)
    from_pairs = FeatureHasher(n_features=16,
                               input_type="pair").transform(pair_samples)

    assert_array_equal(from_dicts.toarray(), from_pairs.toarray())
Example #25
Source File: test_feature_hasher.py    From Mastering-Elasticsearch-7.0 with MIT License 5 votes vote down vote up
def test_feature_hasher_pairs():
    """Hash numeric (key, value) pairs and check the resulting magnitudes."""
    rows = [{"foo": 1, "bar": 2},
            {"baz": 3, "quux": 4, "foo": -1}]
    hasher = FeatureHasher(n_features=16, input_type="pair")

    first, second = hasher.transform(
        iter(row.items()) for row in rows).toarray()

    # Compare magnitudes only; the sign of each entry is not asserted.
    assert_equal([1, 2], sorted(np.abs(first[first != 0])))
    assert_equal([1, 3, 4], sorted(np.abs(second[second != 0])))
Example #26
Source File: test_feature_hasher.py    From Mastering-Elasticsearch-7.0 with MIT License 5 votes vote down vote up
def test_hash_empty_input():
    """Empty samples (list, tuple, exhausted iterator) hash to all-zero rows."""
    n_features = 16
    raw_X = [[], (), iter(range(0))]

    h = FeatureHasher(n_features=n_features, input_type="string")
    X = h.transform(raw_X)

    # Densify with toarray() rather than the ``.A`` shorthand, which newer
    # SciPy releases no longer provide on sparse matrices.
    assert_array_equal(X.toarray(), np.zeros((len(raw_X), n_features)))
Example #27
Source File: test_feature_hasher.py    From Mastering-Elasticsearch-7.0 with MIT License 5 votes vote down vote up
def test_hasher_set_params():
    """An invalid n_features set via set_params is only rejected by fit."""
    # Test delayed input validation in fit (useful for grid search).
    invalid_params = {"n_features": np.inf}
    fh = FeatureHasher()
    fh.set_params(**invalid_params)
    assert_raises(TypeError, fh.fit)
Example #28
Source File: test_feature_hasher.py    From Mastering-Elasticsearch-7.0 with MIT License 5 votes vote down vote up
def test_hasher_zeros():
    """A feature whose value is 0 must not be stored in the sparse output."""
    # Assert that no zeros are materialized in the output.
    zero_valued = [{'foo': 0}]
    hashed = FeatureHasher().transform(zero_valued)
    assert_equal(hashed.data.shape, (0,))
Example #29
Source File: test_feature_hasher.py    From Mastering-Elasticsearch-7.0 with MIT License 5 votes vote down vote up
def test_hash_collisions():
    """Hash every token into one bucket and check how collisions combine."""
    tokens = list("Thequickbrownfoxjumped")
    X = [tokens]

    def single_bucket_value(alternate_sign):
        # n_features=1 sends every token to the same output column.
        hasher = FeatureHasher(alternate_sign=alternate_sign, n_features=1,
                               input_type='string')
        return hasher.fit_transform(X).data[0]

    # check that some of the hashed tokens are added
    # with an opposite sign and cancel out
    assert abs(single_bucket_value(True)) < len(tokens)

    # Without sign alternation nothing cancels: the bucket holds the count.
    assert single_bucket_value(False) == len(tokens)
Example #30
Source File: pe_exports_features.py    From driverlessai-recipes with Apache License 2.0 5 votes vote down vote up
def exports_features(self, lief_binary):
        """Hash the exported function names of a binary into 128 features.

        Args:
            lief_binary: object exposing ``exported_functions``.
                assumes the items are strings -- TODO confirm against the
                LIEF version in use.

        Returns:
            dict mapping ``Exports_functions_hash_<i>`` (i in 0..127) to the
            hashed value, or to 0 for every key when nothing is exported.
        """
        exports = sorted(lief_binary.exported_functions)

        if not exports:
            # Nothing to hash: emit an all-zero vector with the same keys.
            return {f'Exports_functions_hash_{i}': 0 for i in range(128)}

        # Imported only when there is something to hash.
        from sklearn.feature_extraction import FeatureHasher

        # FeatureHasher expects an iterable of samples, each an iterable of
        # string tokens. All export names together form ONE sample, so wrap
        # the list; passing it bare would treat every name as its own sample
        # (hashed character by character) and keep only the first name's row.
        row = FeatureHasher(128, input_type='string').transform([exports]).toarray()[0]
        return {f'Exports_functions_hash_{i}': x for i, x in enumerate(row)}