Python sklearn.preprocessing.OneHotEncoder() Examples

The following are 30 code examples showing how to use sklearn.preprocessing.OneHotEncoder(). These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example.

You may check out the related API usage on the sidebar.

You may also want to check out all available functions/classes of the module sklearn.preprocessing, or try the search function.

Example 1
Project: KDDCup2019_admin   Author: DominickZhang   File: feature_expansion.py    License: MIT License 6 votes vote down vote up
def cat_onehot_encoder(df,y,col,selection=True):
    """Label-encode one categorical feature and expand it to a sparse one-hot matrix.

    Parameters
    ----------
    df : object exposing ``.values``
        The feature data; its values are flattened into a single column.
    y : object
        Target handed unchanged to ``train_lightgbm_for_feature_selection``.
    col : object
        Only used as a label in the printed AUC report.
    selection : bool, default True
        When exactly ``True`` the encoded feature is scored with
        ``train_lightgbm_for_feature_selection``.

    Returns
    -------
    tuple
        ``(new_feature, mlbs, models, auc_score, le)``: the CSR one-hot
        matrix (float dtype), the fitted ``OneHotEncoder``, the value(s)
        returned by the selection helper (``None`` when it was skipped) and
        the fitted ``LabelEncoder``.
    """
    from scipy.sparse import csr_matrix
    from sklearn.preprocessing import LabelEncoder

    # LabelEncoder works on 1-D input; passing a column vector only makes it
    # emit a conversion warning before flattening the data itself.
    raw_values = df.values.reshape(-1)
    le = LabelEncoder()
    codes = le.fit_transform(raw_values).reshape(-1, 1)

    # Sparse output is OneHotEncoder's default. The explicit ``sparse=True``
    # keyword was renamed to ``sparse_output`` and later removed, so it is
    # omitted to stay compatible with old and new scikit-learn releases.
    mlbs = OneHotEncoder().fit(codes)
    # csr_matrix(...) already yields CSR, no extra .tocsr() needed.
    features_tmp = csr_matrix(mlbs.transform(codes), dtype=float)

    models = None
    auc_score = None
    if selection is True:
        auc_score, models = train_lightgbm_for_feature_selection(features_tmp, y)
        print(col, "auc", auc_score)

    new_feature = features_tmp
    return new_feature,mlbs,models,auc_score,le
Example 2
Project: PyShortTextCategorization   Author: stephenhky   File: sakaguchi.py    License: MIT License 6 votes vote down vote up
def loadmodel(self, prefix):
        """ Load the model.

        Reads ``<prefix>_vocabs.gensimdict`` and ``<prefix>_config.json``,
        rebuilds the binarizer / encoders from the stored parameters, fits a
        one-hot encoder over the dictionary indices, loads the Keras model
        and marks the instance as trained.

        :param prefix: prefix of the model path
        :return: None
        :type prefix: str
        """
        self.dictionary = Dictionary.load(prefix+'_vocabs.gensimdict')
        # Use a context manager so the config file handle is closed promptly
        # instead of being left to the garbage collector.
        with open(prefix+'_config.json', 'r') as config_file:
            parameters = json.load(config_file)
        self.operation = parameters['operation']
        self.alph = parameters['alph']
        self.specialsignals = parameters['special_signals']
        self.binarizer = SCRNNBinarizer(self.alph, self.specialsignals)
        self.concatcharvec_encoder = SpellingToConcatCharVecEncoder(self.alph)
        self.batchsize = parameters['batchsize']
        self.nb_hiddenunits = parameters['nb_hiddenunits']
        # Fit the encoder on one column holding every dictionary index.
        self.onehotencoder = OneHotEncoder()
        self.onehotencoder.fit(np.arange(len(self.dictionary)).reshape((len(self.dictionary), 1)))
        self.model = kerasio.load_model(prefix)
        self.trained = True
Example 3
def get_X_y(**kwargs):
    """Read the tab-separated dataset and run it through ``preprocess``.

    The file location comes from the module-level ``info['path']``; any
    keyword arguments are forwarded to ``pd.read_csv``.  The loaded frame is
    preprocessed together with the module-level ``label_encoder`` so the
    transformation does not have to be repeated throughout the code.

    Returns whatever ``preprocess`` returns.
    """
    # ``label_encoder`` is only read here, so no ``global`` statement is needed.
    raw_frame = pd.read_csv(info['path'], sep='\t', **kwargs)
    return preprocess(raw_frame, label_encoder)

###############################################################################
# Classifier objects in |sklearn| often require :code:`y` to be integer labels.
# Additionally, |APS| requires a binary version of the labels.  For these two
# purposes, we create:
#
# * a |LabelEncoder|, that we pre-fitted on the known :code:`y` classes
# * a |OneHotEncoder|, pre-fitted on the resulting integer labels.
#
# Their |transform| methods can then be called at appropriate times.
Example 4
Project: nussl   Author: nussl   File: transforms.py    License: MIT License 6 votes vote down vote up
def __call__(self, data):
        """Attach a dense one-hot label matrix to ``data``.

        ``data['metadata']['labels']`` supplies the encoder categories.  The
        keys of ``data[self.source_key]`` are cut at the first ``'::'``,
        sorted, and one-hot encoded; the dense result is stored under
        ``data['one_hot_labels']``.

        Raises ``TransformException`` when ``'metadata'`` or its ``'labels'``
        entry is missing.  Returns the same ``data`` object.
        """
        if 'metadata' not in data:
            raise TransformException(
                f"Expected metadata in data, got {list(data.keys())}")
        if 'labels' not in data['metadata']:
            raise TransformException(
                f"Expected labels in data['metadata'], got "
                f"{list(data['metadata'].keys())}")

        encoder = OneHotEncoder(categories=[data['metadata']['labels']])

        # Keep only the part of each source key before the '::' separator.
        source_names = [key.split('::')[0] for key in data[self.source_key].keys()]
        label_rows = [[name] for name in sorted(source_names)]

        data['one_hot_labels'] = encoder.fit_transform(label_rows).toarray()
        return data
Example 5
Project: Broad-Learning-System   Author: LiangjunFeng   File: bls.py    License: MIT License 6 votes vote down vote up
def __init__(self,
                 maptimes = 10,
                 enhencetimes = 10,
                 map_function = 'linear',
                 enhence_function = 'linear',
                 batchsize = 'auto',
                 reg = 0.001):
        """Store the hyper-parameters and create the helper objects.

        Every argument is kept verbatim on the instance under the same name
        with a leading underscore.  ``W`` and ``pesuedoinverse`` start at 0,
        and a scaler, a dense one-hot encoder and two node generators (the
        second created with ``whiten=True``) are instantiated.
        """
        # Hyper-parameters, stored untouched.
        self._maptimes = maptimes
        self._enhencetimes = enhencetimes
        self._map_function = map_function
        self._enhence_function = enhence_function
        self._batchsize = batchsize
        self._reg = reg

        # Placeholders; presumably replaced during fitting -- TODO confirm.
        self.W = 0
        self.pesuedoinverse = 0

        # Helper objects.
        self.normalscaler = scaler()
        self.onehotencoder = preprocessing.OneHotEncoder(sparse = False)
        self.mapping_generator = node_generator()
        self.enhence_generator = node_generator(whiten = True)
Example 6
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_column_transformer.py    License: MIT License 6 votes vote down vote up
def test_column_transformer_list():
    """ColumnTransformer must accept a plain list of lists as input."""
    nan = float('nan')
    rows = [[1, nan, 'a'],
            [0, 0, 'b']]
    # Columns 0-1 are standardised, column 2 is one-hot encoded.
    expected = np.array([[1, nan, 1, 0],
                         [-1, 0, 0, 1]])

    transformer = ColumnTransformer([
        ('numerical', StandardScaler(), [0, 1]),
        ('categorical', OneHotEncoder(), [2]),
    ])

    one_step = transformer.fit_transform(rows)
    assert_array_equal(one_step, expected)

    two_step = transformer.fit(rows).transform(rows)
    assert_array_equal(two_step, expected)
Example 7
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_discretization.py    License: MIT License 6 votes vote down vote up
def test_encode_options():
    """Check each ``encode`` option of KBinsDiscretizer against OneHotEncoder.

    The ordinal output, re-encoded by a OneHotEncoder with matching
    categories, must equal the discretizer's own dense / sparse one-hot
    output.  Uses the module-level ``X``.
    """
    bin_counts = (2, 3, 3, 3)

    def discretize(encode):
        # Fresh discretizer per encoding, fitted and applied on X.
        return KBinsDiscretizer(n_bins=list(bin_counts),
                                encode=encode).fit(X).transform(X)

    def reference(sparse):
        # One-hot encoder whose categories mirror the bin counts.
        return OneHotEncoder(categories=[np.arange(n) for n in bin_counts],
                             sparse=sparse)

    ordinal = discretize('ordinal')

    dense = discretize('onehot-dense')
    assert not sp.issparse(dense)
    assert_array_equal(reference(False).fit_transform(ordinal), dense)

    sparse_out = discretize('onehot')
    assert sp.issparse(sparse_out)
    assert_array_equal(reference(True).fit_transform(ordinal).toarray(),
                       sparse_out.toarray())
Example 8
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_encoders.py    License: MIT License 6 votes vote down vote up
def test_one_hot_encoder_force_new_behaviour():
    """Default encoder uses legacy behaviour; ``categories='auto'`` does not."""
    # Ambiguous integer case (non-consecutive range of categories).
    fit_data = np.array([[1, 2]]).T
    unseen_data = np.array([[0, 1]]).T

    # No argument: legacy behaviour is used by default, with warnings.
    legacy_enc = OneHotEncoder()
    with ignore_warnings(category=FutureWarning):
        legacy_enc.fit(fit_data)

    encoded = legacy_enc.transform(unseen_data)
    assert_array_equal(encoded.toarray(), np.array([[0, 0], [1, 0]]))

    # Explicit 'auto': no legacy behaviour, so a value that lies within the
    # fitted range but was never seen must raise.
    strict_enc = OneHotEncoder(categories='auto')
    strict_enc.fit(fit_data)
    assert_raises(ValueError, strict_enc.transform, unseen_data)
Example 9
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_encoders.py    License: MIT License 6 votes vote down vote up
def test_one_hot_encoder_categorical_features():
    """Run ``_check_one_hot`` for several categorical masks.

    Also verifies that combining ``categories`` with ``categorical_features``
    raises ``ValueError`` on fit.
    """
    X = np.array([[3, 2, 1], [0, 1, 1]])
    X2 = np.array([[1, 1, 1]])

    # (mask, value passed as the last argument of _check_one_hot)
    cases = [
        ([True, False, False], 4),
        ([False, False, False], 3),  # edge case: all non-categorical
        ([True, True, True], 5),     # edge case: all categorical
    ]
    for cat, expected in cases:
        _check_one_hot(X, X2, cat, expected)

    # Specifying categories as well must raise an error.
    oh = OneHotEncoder(categories=[range(3)],
                       categorical_features=[True, False, False])
    assert_raises(ValueError, oh.fit, X)
Example 10
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_encoders.py    License: MIT License 6 votes vote down vote up
def test_one_hot_encoder_specified_categories(X, X2, cats, cat_dtype):
    """Manually specified categories are honoured and validated at fit time."""
    first_cats = list(cats[0])

    encoder = OneHotEncoder(categories=cats)
    expected = np.array([[1., 0., 0.],
                         [0., 1., 0.]])
    assert_array_equal(encoder.fit_transform(X).toarray(), expected)
    # Both the constructor parameter and the fitted attribute must match.
    assert list(encoder.categories[0]) == first_cats
    fitted_cats = encoder.categories_[0]
    assert fitted_cats.tolist() == first_cats
    # Manually specified categories should have the same dtype as the data
    # when coerced from lists.
    assert fitted_cats.dtype == cat_dtype

    # With manual categories, unknown values already raise when fitting ...
    strict = OneHotEncoder(categories=cats)
    with pytest.raises(ValueError, match="Found unknown categories"):
        strict.fit(X2)

    # ... unless unknown values are explicitly ignored.
    lenient = OneHotEncoder(categories=cats, handle_unknown='ignore')
    expected = np.array([[1., 0., 0.], [0., 0., 0.]])
    assert_array_equal(lenient.fit(X2).transform(X2).toarray(), expected)
Example 11
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_encoders.py    License: MIT License 6 votes vote down vote up
def test_one_hot_encoder_unsorted_categories():
    """Unsorted categories work for strings but are rejected for numbers."""
    string_data = np.array([['a', 'b']], dtype=object).T
    string_cats = ['b', 'a', 'c']

    enc = OneHotEncoder(categories=[string_cats])
    expected = np.array([[0., 1., 0.],
                         [1., 0., 0.]])
    assert_array_equal(enc.fit(string_data).transform(string_data).toarray(),
                       expected)
    assert_array_equal(enc.fit_transform(string_data).toarray(), expected)
    fitted = enc.categories_[0]
    assert fitted.tolist() == ['b', 'a', 'c']
    assert np.issubdtype(fitted.dtype, np.object_)

    # Unsorted passed categories still raise for numerical values.
    numeric_data = np.array([[1, 2]]).T
    enc = OneHotEncoder(categories=[[2, 1, 3]])
    with pytest.raises(ValueError,
                       match='Unsorted categories are not supported'):
        enc.fit_transform(numeric_data)
Example 12
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_encoders.py    License: MIT License 6 votes vote down vote up
def test_one_hot_encoder_raise_missing(X, as_data_frame, handle_unknown):
    """NaN in the input must raise in ``fit``, ``fit_transform`` and ``transform``."""
    if as_data_frame:
        pd = pytest.importorskip('pandas')
        X = pd.DataFrame(X)

    ohe = OneHotEncoder(categories='auto', handle_unknown=handle_unknown)
    nan_message = "Input contains NaN"

    for fit_method in (ohe.fit, ohe.fit_transform):
        with pytest.raises(ValueError, match=nan_message):
            fit_method(X)

    # Fit on the first row only, then transform the full input: the
    # error must still be raised.
    first_row = X.iloc[:1, :] if as_data_frame else X[:1, :]
    ohe.fit(first_row)

    with pytest.raises(ValueError, match=nan_message):
        ohe.transform(X)
Example 13
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_encoders.py    License: MIT License 6 votes vote down vote up
def test_encoder_dtypes():
    """Dtypes are preserved when the encoder determines its categories."""
    enc = OneHotEncoder(categories='auto')
    exp = np.array([[1., 0., 1., 0.], [0., 1., 0., 1.]], dtype='float64')

    def fitted_dtypes():
        # dtypes of the categories found for the two input columns
        return [enc.categories_[i].dtype for i in range(2)]

    array_inputs = [
        np.array([[1, 2], [3, 4]], dtype='int64'),
        np.array([[1, 2], [3, 4]], dtype='float64'),
        np.array([['a', 'b'], ['c', 'd']]),  # string dtype
        np.array([[1, 'a'], [3, 'b']], dtype='object'),
    ]
    for X in array_inputs:
        enc.fit(X)
        assert all(dtype == X.dtype for dtype in fitted_dtypes())
        assert_array_equal(enc.transform(X).toarray(), exp)

    # Plain list of integers -> integer categories.
    X = [[1, 2], [3, 4]]
    enc.fit(X)
    assert all(np.issubdtype(dtype, np.integer) for dtype in fitted_dtypes())
    assert_array_equal(enc.transform(X).toarray(), exp)

    # Mixed list -> object categories.
    X = [[1, 'a'], [3, 'b']]
    enc.fit(X)
    assert all(dtype == 'object' for dtype in fitted_dtypes())
    assert_array_equal(enc.transform(X).toarray(), exp)
Example 14
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_encoders.py    License: MIT License 6 votes vote down vote up
def test_encoder_dtypes_pandas():
    """Check that category dtypes follow the DataFrame column dtypes.

    Similar to ``test_categorical_encoder_dtypes``, but for dataframes.
    Skipped when pandas is not installed.
    """
    pd = pytest.importorskip('pandas')

    enc = OneHotEncoder(categories='auto')
    exp = np.array([[1., 0., 1., 0., 1., 0.],
                    [0., 1., 0., 1., 0., 1.]], dtype='float64')

    X = pd.DataFrame({'A': [1, 2], 'B': [3, 4], 'C': [5, 6]}, dtype='int64')
    enc.fit(X)
    # The frame has three columns; check all of them (the previous
    # ``range(2)`` silently skipped column 'C').
    assert all(enc.categories_[i].dtype == 'int64' for i in range(3))
    assert_array_equal(enc.transform(X).toarray(), exp)

    X = pd.DataFrame({'A': [1, 2], 'B': ['a', 'b'], 'C': [3., 4.]})
    X_type = [int, object, float]
    enc.fit(X)
    assert all(enc.categories_[i].dtype == X_type[i] for i in range(3))
    assert_array_equal(enc.transform(X).toarray(), exp)
Example 15
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_encoders.py    License: MIT License 6 votes vote down vote up
def test_one_hot_encoder_drop_manual():
    """A per-feature ``drop`` list removes exactly the requested categories."""
    cats_to_drop = ['def', 12, 3, 56]
    X = [['abc', 12, 2, 55],
         ['def', 12, 1, 55],
         ['def', 12, 3, 56]]

    enc = OneHotEncoder(drop=cats_to_drop)
    trans = enc.fit_transform(X).toarray()
    assert_array_equal(trans, [[1, 0, 1, 1],
                               [0, 1, 0, 1],
                               [0, 0, 0, 0]])

    # Look up, per feature, the category sitting at the recorded drop index.
    dropped_cats = [categories[idx]
                    for categories, idx in zip(enc.categories_,
                                               enc.drop_idx_)]
    assert_array_equal(dropped_cats, cats_to_drop)

    # The transformation must round-trip to the original input.
    original = np.array(X, dtype=object)
    assert_array_equal(original, enc.inverse_transform(trans))
Example 16
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_encoders.py    License: MIT License 6 votes vote down vote up
def test_categories(density, drop):
    """``drop`` must not change ``categories_`` and must set ``drop_idx_``."""
    X = [['c', 1, 'a'],
         ['a', 2, 'b']]

    ohe_base = OneHotEncoder(sparse=density)
    ohe_test = OneHotEncoder(sparse=density, drop=drop)
    ohe_base.fit(X)
    ohe_test.fit(X)

    assert_array_equal(ohe_base.categories_, ohe_test.categories_)

    drop_indices = ohe_test.drop_idx_
    if drop == 'first':
        assert_array_equal(drop_indices, 0)
    else:
        # Each recorded index must point at the category requested for
        # the corresponding feature.
        triples = zip(drop, drop_indices, ohe_test.categories_)
        for wanted, idx, categories in triples:
            assert categories[idx] == wanted

    assert isinstance(ohe_test.drop_idx_, np.ndarray)
    assert ohe_test.drop_idx_.dtype == np.int_
Example 17
Project: RandomForestClustering   Author: joshloyal   File: forest_embedding.py    License: MIT License 6 votes vote down vote up
def fit_transform(self, X, y=None, sample_weight=None):
        """Fit the forest on a generated discriminative dataset and embed ``X``.

        Parameters
        ----------
        X : array-like or CSC sparse matrix
            Input data, validated with ``check_array``.
        y : ignored
            Not used; the training labels come from
            ``generate_discriminative_dataset``.
        sample_weight : optional
            Forwarded to the parent class ``fit``.

        Returns
        -------
        The one-hot encoded result of ``self.apply(X)`` when
        ``self.sparse_output`` is truthy, otherwise the raw result of
        ``self.apply(X)``.
        """
        X = check_array(X, accept_sparse=['csc'], ensure_2d=False)

        if sp.issparse(X):
            # Pre-sort indices to avoid that each individual tree of the
            # ensemble sorts the indices.
            X.sort_indices()

        X_, y_ = generate_discriminative_dataset(X)

        super(RandomForestEmbedding, self).fit(X_, y_,
                                               sample_weight=sample_weight)

        # Sparse output is the encoder's default; the explicit ``sparse=True``
        # keyword was renamed/removed in newer scikit-learn releases, so it is
        # left out to work across versions.
        self.one_hot_encoder_ = OneHotEncoder()
        if self.sparse_output:
            return self.one_hot_encoder_.fit_transform(self.apply(X))
        return self.apply(X)
Example 18
Project: tick   Author: X-DataInitiative   File: features_binarizer_test.py    License: BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def test_binarizer_remove_first(self):
        """...Test binarizer fit when remove_first=True
        """
        n_cuts = 3
        # Sparse output is OneHotEncoder's default; the ``sparse=True``
        # keyword is omitted because newer scikit-learn releases removed it.
        one_hot_encoder = OneHotEncoder()
        expected_binarization = one_hot_encoder.fit_transform(
            self.default_expected_intervals)

        binarizer = FeaturesBinarizer(method='quantile', n_cuts=n_cuts,
                                      detect_column_type="auto",
                                      remove_first=True)

        binarizer.fit(self.features)
        binarized_array = binarizer.transform(self.features)
        self.assertEqual(binarized_array.__class__, csr.csr_matrix)

        # Columns 0, 4, 8 and 10 of the full encoding are the ones expected
        # to be removed when remove_first=True.
        expected_binarization_without_first = \
            np.delete(expected_binarization.toarray(), [0, 4, 8, 10], 1)

        np.testing.assert_array_equal(expected_binarization_without_first,
                                      binarized_array.toarray())
Example 19
Project: torchkit   Author: CW-Huang   File: datasets.py    License: MIT License 6 votes vote down vote up
def load_cifar10_image(root='dataset',labels=False):
    """Load the pickled CIFAR-10 arrays, downloading them first if missing.

    Parameters
    ----------
    root : str
        Base directory; the data lives in ``<root>/cifar10/cifar10.pkl``.
    labels : bool
        When truthy, the label arrays are one-hot encoded and returned too.

    Returns
    -------
    generator
        Yields the arrays cast to ``floatX``: ``tr_x, tr_y, te_x, te_y`` when
        ``labels`` is truthy, otherwise ``tr_x, te_x``.
    """
    helpers.create(root, 'cifar10')
    droot = root+'/'+'cifar10'
    filename = '{}/cifar10.pkl'.format(droot)

    if not os.path.exists(filename):
        from downloader import download_cifar10
        download_cifar10(droot)

    # Pickles are binary data: open in 'rb' and close the handle afterwards.
    # NOTE: unpickling executes arbitrary code -- only load trusted files.
    with open(filename, 'rb') as pkl_file:
        tr_x, tr_y, te_x, te_y = pickle.load(pkl_file)

    # Rescale only when the training images peak at exactly 255.
    if tr_x.max() == 255:
        tr_x = tr_x / 256.
        te_x = te_x / 256.

    if labels:
        enc = OneHotEncoder(10)
        tr_y = enc.fit_transform(tr_y).toarray().reshape(50000,10).astype(int)
        te_y = enc.fit_transform(te_y).toarray().reshape(10000,10).astype(int)
        arrays = [tr_x, tr_y, te_x, te_y]
    else:
        arrays = [tr_x, te_x]

    return (d.astype(floatX) for d in arrays)
Example 20
def test_conversion_many_columns(self):
        """Convert an encoder fitted on several columns and compare outputs.

        The converted spec is evaluated on the fixture rows and must
        reproduce the scikit-learn output without errors.
        """
        scikit_model = OneHotEncoder()
        scikit_model.fit(self.scikit_data_multiple_cols)
        spec = sklearn.convert(
            scikit_model, ["feature_1", "feature_2"], "out"
        ).get_spec()

        test_data = [
            {"feature_1": row[0], "feature_2": row[1]}
            for row in self.scikit_data_multiple_cols
        ]
        scikit_output = [
            {"out": row}
            for row in scikit_model.transform(self.scikit_data_multiple_cols).toarray()
        ]
        metrics = evaluate_transformer(spec, test_data, scikit_output)

        self.assertIsNotNone(spec)
        self.assertIsNotNone(spec.description)
        # ``assertEquals`` is a deprecated alias that was removed in
        # Python 3.12; ``assertEqual`` is the supported spelling.
        self.assertEqual(metrics["num_errors"], 0)
Example 21
def test_conversion_one_column_of_several(self):
        """Convert an encoder that one-hot encodes only column 0 and compare outputs.

        The converted spec is evaluated on the fixture rows and must
        reproduce the scikit-learn output without errors.
        """
        scikit_model = OneHotEncoder(categorical_features=[0])
        # The encoder is fitted on a copy of the fixture data.
        scikit_model.fit(copy(self.scikit_data_multiple_cols))
        spec = sklearn.convert(
            scikit_model, ["feature_1", "feature_2"], "out"
        ).get_spec()

        test_data = [
            {"feature_1": row[0], "feature_2": row[1]}
            for row in self.scikit_data_multiple_cols
        ]
        scikit_output = [
            {"out": row}
            for row in scikit_model.transform(self.scikit_data_multiple_cols).toarray()
        ]
        metrics = evaluate_transformer(spec, test_data, scikit_output)

        self.assertIsNotNone(spec)
        self.assertIsNotNone(spec.description)
        # ``assertEquals`` is a deprecated alias that was removed in
        # Python 3.12; ``assertEqual`` is the supported spelling.
        self.assertEqual(metrics["num_errors"], 0)
Example 22
def test_boston_OHE_pipeline(self):
        """Convert OneHotEncoder + Normalizer pipelines fitted on the Boston data.

        Each selection of categorical columns is fitted, converted and
        evaluated; the converted spec must produce no errors.
        """
        data = load_boston()
        column_selections = ([3], [8], [3, 8], [8, 3])

        for categorical_features in column_selections:
            # Put it in a pipeline so that we can test whether the output
            # dimension handling is correct.
            encoder = OneHotEncoder(categorical_features=categorical_features)
            model = Pipeline([("OHE", encoder), ("Normalizer", Normalizer())])
            model.fit(data.data.copy(), data.target)

            # Convert the model.
            spec = sklearn.convert(model, data.feature_names, "out").get_spec()

            input_data = [dict(zip(data.feature_names, row))
                          for row in data.data]
            transformed = model.transform(data.data.copy())
            output_data = [{"out": row} for row in transformed]

            result = evaluate_transformer(spec, input_data, output_data)

            assert result["num_errors"] == 0
Example 23
Project: Bidirectiona-LSTM-for-text-summarization-   Author: DeepsMoseli   File: word2vec.py    License: MIT License 5 votes vote down vote up
def summonehot(corpus):
    """Build one-hot vectors for the most common lower-cased tokens of ``corpus``.

    Every sentence is tokenised with ``wt``.  The vocabulary is cut to the
    ``int(unique / 1.1)`` most frequent tokens, label-encoded and then
    one-hot encoded.

    Returns ``(label_encoder, onehot_encoded, annotated)`` where
    ``annotated`` maps each cleaned, stripped token to its one-hot row.
    """
    tokens = [word.lower() for sent in corpus for word in wt(sent)]
    n_unique = len(set(tokens))
    print(n_unique, "unique characters in corpus")

    # Keep roughly the top 91% of the vocabulary by frequency.
    maxcorp = int(n_unique / 1.1)
    frequent = [word for word, _count in Counter(tokens).most_common(maxcorp)]
    allwords = list(set(frequent))
    print(len(allwords), "unique characters in corpus after max corpus cut")

    # Integer encode, then one-hot encode the single integer column.
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(allwords)
    integer_column = integer_encoded.reshape(len(integer_encoded), 1)
    onehot_encoded = OneHotEncoder(sparse=False).fit_transform(integer_column)

    # Lookup dict: cleaned token -> one-hot row.
    annotated = {}
    for row in onehot_encoded:
        token = label_encoder.inverse_transform([argmax(row)])[0]
        annotated[cleantext(token).strip()] = row
    return label_encoder,onehot_encoded,annotated
Example 24
Project: highdimensional-decision-boundary-plot   Author: tmadl   File: uci_loader.py    License: MIT License 5 votes vote down vote up
def getdataset(datasetname, onehot_encode_strings=True):
    """Fetch an OpenML dataset and return it as numeric ``(X, y)`` arrays.

    Parameters
    ----------
    datasetname : str
        Name passed to ``fetch_openml``.
    onehot_encode_strings : bool, default True
        When truthy, columns whose first value is a string are converted
        with ``tonumeric`` and then one-hot encoded.

    Returns
    -------
    tuple
        ``(X, y)``: ``X`` cast to float with NaN/inf replaced by
        ``np.nan_to_num``; ``y`` is ``tonumeric(target)`` cast to int.
    """
    # load
    dataset = fetch_openml(datasetname)
    # get X and y
    X = dshape(dataset.data)
    try:
        target = dshape(dataset.target)
    except Exception:
        # Deliberate best-effort fallback, but no longer a bare ``except``
        # that would also swallow KeyboardInterrupt / SystemExit.
        print("WARNING: No target found. Taking last column of data matrix as target")
        target = X[:, -1]
        X = X[:, :-1]
    if (
        len(target.shape) > 1 and target.shape[1] > X.shape[1]
    ):  # some mldata sets are mixed up...
        X = target
        target = dshape(dataset.data)
    if len(X.shape) == 1 or X.shape[1] <= 1:
        # Append any extra dataset entries whose length matches X's width.
        for k in dataset.keys():
            if k != "data" and k != "target" and len(dataset[k]) == X.shape[1]:
                X = np.hstack((X, dshape(dataset[k])))
    # one-hot for categorical values
    if onehot_encode_strings:
        cat_ft = []
        for i in range(X.shape[1]):
            # A column counts as categorical when its first value is a
            # (unicode) string.
            type_name = str(type(unpack(X[0, i])))
            if "str" in type_name or "unicode" in type_name:
                cat_ft.append(i)
        if cat_ft:
            for i in cat_ft:
                X[:, i] = tonumeric(X[:, i])
            # NOTE(review): ``categorical_features`` only exists in older
            # scikit-learn releases -- confirm the pinned version.
            X = OneHotEncoder(categorical_features=cat_ft).fit_transform(X)
    # if sparse, make dense
    try:
        X = X.toarray()
    except AttributeError:
        # Already a dense array: nothing to convert.
        pass
    # convert y to monotonically increasing ints
    y = tonumeric(target).astype(int)
    return np.nan_to_num(X.astype(float)), y
Example 25
Project: deepchem   Author: deepchem   File: utils.py    License: MIT License 5 votes vote down vote up
def one_hot_encode(sequences):
  """One-hot encode an array of equal-length strings over the letters 'ACGTN'.

  The characters are viewed as integers, label-encoded against the
  alphabet and one-hot encoded with five values.  The result is reshaped
  to ``(len(sequences), 1, sequence_length, 5)``, axes 2 and 3 are swapped,
  and only channels 0, 1, 2 and 4 are kept.
  """
  n_sequences = len(sequences)
  sequence_length = len(sequences[0])
  # The integer view of the characters depends on the Python version.
  if sys.version_info[0] == 2:
    integer_type = np.int8
  else:
    integer_type = np.int32

  alphabet_codes = np.array(('ACGTN',)).view(integer_type)
  label_encoder = LabelEncoder().fit(alphabet_codes)
  flat_codes = label_encoder.transform(sequences.view(integer_type))
  integer_array = flat_codes.reshape(n_sequences, sequence_length)

  onehot_encoder = OneHotEncoder(
      sparse=False, n_values=5, dtype=integer_type)
  one_hot_encoding = onehot_encoder.fit_transform(integer_array)

  stacked = one_hot_encoding.reshape(n_sequences, 1, sequence_length, 5)
  return stacked.swapaxes(2, 3)[:, :, [0, 1, 2, 4], :]
Example 26
Project: malss   Author: canard0328   File: data.py    License: MIT License 5 votes vote down vote up
def __encode(self, X):
        """One-hot encode every object-dtype column of ``X``.

        Per-column label and one-hot encoders are created and fitted the
        first time a column is seen, cached on the instance, and reused on
        later calls.

        Parameters
        ----------
        X : pandas.DataFrame
            Input frame; it is deep-copied and not modified.

        Returns
        -------
        tuple
            ``(Xenc, del_columns)``: the encoded copy, in which each object
            column is replaced by ``<column>_<class>`` indicator columns
            appended at the end, and the list of removed column labels.
        """
        Xenc = X.copy(deep=True)
        n_columns = len(Xenc.columns)

        if self._label_encoder is None or self._onehot_encoder is None:
            self._label_encoder = [None] * n_columns
            self._onehot_encoder = [None] * n_columns

        del_columns = []
        # Only the original columns are visited; the indicator columns
        # appended below sit after position ``n_columns``.
        for i in range(n_columns):
            # Positional dtype lookup, consistent with the positional
            # ``iloc[:, i]`` / ``columns[i]`` accesses below (``dtypes[i]``
            # is label-based and its positional fallback is deprecated).
            if Xenc.dtypes.iloc[i] != np.dtype('O'):
                continue

            column = Xenc.iloc[:, i]
            if self._label_encoder[i] is None:
                self._label_encoder[i] = LabelEncoder().fit(column)
            col_enc = self._label_encoder[i].transform(column).reshape(-1, 1)

            if self._onehot_encoder[i] is None:
                self._onehot_encoder[i] = OneHotEncoder(
                    categories='auto').fit(col_enc)
            col_onehot = np.array(
                self._onehot_encoder[i].transform(col_enc).todense())

            col_names = [str(Xenc.columns[i]) + '_' + c
                         for c in self._label_encoder[i].classes_]
            col_onehot = pd.DataFrame(col_onehot, columns=col_names,
                                      index=Xenc.index)
            Xenc = pd.concat([Xenc, col_onehot], axis=1)
            del_columns.append(Xenc.columns[i])

        # Drop the original object columns only after the loop so the
        # positional indices above stay valid.
        for col in del_columns:
            del Xenc[col]

        return Xenc, del_columns
Example 27
Project: AIX360   Author: IBM   File: PDASH_utils.py    License: Apache License 2.0 5 votes vote down vote up
def get_Processed_NHANES_Data(filename):
    """
    Load an NHANES XPT file and one-hot encode all of its features.

    Args:
        filename (str): NHANES XPT filename, e.g. 2_H.XPT

    Returns:
        Tuple ``(onehot_encoded, original)``: the dense one-hot encoded
        features, e.g. (5924 x 145), and the original data, e.g. (5924 x 9),
        with NaNs replaced by 0 and the first column removed.
    """
    with open(filename, 'rb') as xpt_file:
        original = xport.to_numpy(xpt_file)

    # Replace NaNs with 0s.
    nan_mask = np.isnan(original)
    original[nan_mask] = 0

    # Drop the first column (contains sequence numbers).
    original = original[:, 1:]

    # One-hot encode every remaining column/feature.
    onehot_encoded = OneHotEncoder(sparse=False).fit_transform(original)

    return (onehot_encoded, original)
Example 28
Project: evo-pawness   Author: haryoa   File: action_encoder.py    License: GNU General Public License v3.0 5 votes vote down vote up
def fit(self, list_all_action):
        """
        Fit the label encoder, so an integer can be mapped to an action key,
        then fit the one-hot encoder on the resulting integers and build the
        mirror dictionary.
        :param list_all_action: list of all possible action keys in the game
        :return:
        """
        self.le = preprocessing.LabelEncoder()
        encoded_actions = self.le.fit_transform(list_all_action)
        self.shape_all_actions = len(encoded_actions)

        # NOTE(review): the first positional argument is interpreted
        # differently across scikit-learn versions -- confirm the pinned one.
        self.onehot_encoder = OneHotEncoder(self.shape_all_actions, sparse=False)
        # The encoder is fitted on a single column of action indices.
        action_column = encoded_actions.reshape(len(encoded_actions), 1)
        self.onehot_encoder.fit(action_column)

        self.create_mirror_dict()
Example 29
Project: KDDCup2019_admin   Author: DominickZhang   File: automl.py    License: MIT License 5 votes vote down vote up
def oneHotEncodingForFastFM(X: pd.DataFrame):
    """Build a float32 CSR matrix of one-hot categoricals plus scaled numerics.

    Columns whose name starts with ``CONSTANT.NUMERICAL_PREFIX`` or
    ``CONSTANT.TIME_PREFIX`` are min-max scaled; all other columns are
    one-hot encoded.  The encoded block comes first, followed by the scaled
    numeric block.

    Parameters
    ----------
    X : pd.DataFrame
        Input table; column names are expected to be strings.

    Returns
    -------
    The horizontally stacked matrix converted with ``.tocsr()``.
    """
    dense_prefixes = (CONSTANT.NUMERICAL_PREFIX, CONSTANT.TIME_PREFIX)
    # Classify every column once instead of repeating the prefix test.
    is_dense = [c.startswith(dense_prefixes) for c in X.columns]
    numeric_cols = [c for c, dense in zip(X.columns, is_dense) if dense]
    categorical_cols = [c for c, dense in zip(X.columns, is_dense) if not dense]

    numeric_table = X[numeric_cols]
    X = X[categorical_cols]

    # Min-max scaling. NOTE: a constant column gives 0/0 here, i.e. NaN.
    col_min = numeric_table.min()
    numeric_table = (numeric_table - col_min)/(numeric_table.max() - col_min)

    # Sparse output is the encoder's default; the explicit ``sparse=True``
    # keyword was renamed/removed in newer scikit-learn releases.
    enc = OneHotEncoder(dtype=np.float32, categories="auto")

    X = enc.fit_transform(X)

    X = hstack((X, numeric_table.values), dtype=np.float32).tocsr()

    return X
Example 30
Project: spektral   Author: danielegrattarola   File: tud.py    License: MIT License 5 votes vote down vote up
def _normalize(x, norm=None):
    """
    Normalize a list of node features.

    ``norm='ohe'`` applies dense one-hot encoding, ``norm='zscore'`` applies
    standard scaling; any other value returns ``x`` unchanged.
    """
    if norm == 'ohe':
        return OneHotEncoder(sparse=False, categories='auto').fit_transform(x)
    if norm == 'zscore':
        return StandardScaler().fit_transform(x)
    return x