Python sklearn.preprocessing.LabelEncoder() Examples

The following are 30 code examples showing how to use sklearn.preprocessing.LabelEncoder(). These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example.

You may check out the related API usage on the sidebar.

You may also want to check out all available functions/classes of the module sklearn.preprocessing, or try the search function.

Example 1
Project: interpret-text   Author: interpretml   File: test_classical_explainer.py    License: MIT License 6 votes vote down vote up
def test_explain_model_local_with_predicted_label(self):
        """
        Check explain_local of the classical explainer when the predicted
        label is passed in explicitly.
        :return:
        """
        X_train, X_test, y_train, y_test = setup_mnli_test_train_split()

        # Encode the training labels and train the explainer on the encoded ones.
        encoder = LabelEncoder()
        encoded_train_labels = encoder.fit_transform(y_train)
        explainer = ClassicalTextExplainer()
        classifier, best_params = explainer.fit(X_train, encoded_train_labels)
        # Hand the fitted encoder over to the explainer's preprocessor.
        explainer.preprocessor.labelEncoder = encoder

        # Map the classifier output for DOCUMENT back to the original label.
        raw_prediction = classifier.predict(DOCUMENT)
        predicted_label = encoder.inverse_transform(raw_prediction)

        explanation = explainer.explain_local(DOCUMENT, predicted_label)
        importances = explanation.local_importance_values
        # One importance value is expected per feature.
        assert len(importances) == len(explanation.features)
Example 2
Project: razzy-spinner   Author: rafasashi   File: scikitlearn.py    License: GNU General Public License v3.0 6 votes vote down vote up
def __init__(self, estimator, dtype=float, sparse=True):
        """
        Wrap a scikit-learn classifier together with the label encoder and
        feature vectorizer it is used with.

        :param estimator: scikit-learn classifier object.

        :param dtype: data type used when building the feature array.
            scikit-learn estimators work exclusively on numeric data, so
            the default should suit almost every situation.

        :param sparse: Whether to use sparse matrices internally.
            The estimator must support them; not every scikit-learn
            classifier does (check its documentation for "sparse matrix").
            Defaults to True because most NLP problems involve sparse
            feature sets; False may require a great amount of memory.
        :type sparse: boolean.
        """
        label_encoder = LabelEncoder()
        # dtype and sparse are forwarded unchanged to the vectorizer.
        feature_vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)

        self._clf = estimator
        self._encoder = label_encoder
        self._vectorizer = feature_vectorizer
Example 3
Project: sato   Author: megagonlabs   File: datasets.py    License: Apache License 2.0 6 votes vote down vote up
def __init__(self,
                 corpus,
                 sherlock_features: List[str] = None,
                 topic_feature: str = None,
                 label_enc: LabelEncoder = None,
                 id_filter: List[str] = None,
                 max_col_count:int = None,
                 shuffle_group:str=None):
        """
        Initialise the parent dataset and prepare a seeded shuffle order.

        All arguments except ``shuffle_group`` are forwarded unchanged to the
        parent constructor. ``shuffle_group`` is stored on the instance.
        """
        super().__init__(
            corpus,
            sherlock_features,
            topic_feature,
            label_enc,
            id_filter,
            max_col_count,
        )

        # Number of entries in df_header, which the parent is expected to set.
        header_count = len(self.df_header)

        self.tempcorpus = corpus
        self.shuffle_group = shuffle_group

        # The generator is kept on the instance and seeded from SEED; the
        # initial order is a permutation of range(header_count).
        self.prng = np.random.RandomState(SEED)
        self.shuffle_order = self.prng.permutation(header_count)
Example 4
Project: sato   Author: megagonlabs   File: datasets.py    License: Apache License 2.0 6 votes vote down vote up
def __init__(self,
                 df_dict: Dict[str, pd.DataFrame]=None,
                 tensor_dict: Dict[str, torch.FloatTensor]=None,
                 labels: List[str] =[],
                 label_enc: LabelEncoder = None,
                 shuffle_group: str = None):
        """
        Initialise the parent dataset and prepare a seeded shuffle order.

        ``df_dict``, ``tensor_dict``, ``labels`` and ``label_enc`` are
        forwarded unchanged to the parent constructor; ``shuffle_group`` is
        stored on the instance.
        """
        super().__init__(df_dict, tensor_dict, labels, label_enc)

        sample_count = self.__len__()
        self.shuffle_group = shuffle_group

        # Local generator seeded from SEED; only the resulting permutation
        # of range(sample_count) is kept on the instance.
        rng = np.random.RandomState(SEED)
        self.shuffle_order = rng.permutation(sample_count)
Example 5
Project: KDDCup2019_admin   Author: DominickZhang   File: feature_expansion.py    License: MIT License 6 votes vote down vote up
def cat_onehot_encoder(df,y,col,selection=True):
    """One-hot encode a categorical feature and optionally score it.

    The values of ``df`` are flattened, label-encoded to integers and then
    one-hot encoded into a sparse CSR matrix of floats.

    :param df: object exposing ``.values`` (e.g. a pandas Series/DataFrame)
        holding the categorical values.
    :param y: target passed to ``train_lightgbm_for_feature_selection``.
    :param col: feature name, used only in the printed AUC report.
    :param selection: when exactly ``True``, train the feature-selection
        model and report its AUC.
    :return: tuple ``(new_feature, mlbs, models, auc_score, le)`` with the
        CSR feature matrix, the fitted OneHotEncoder, the models and AUC
        from feature selection (both ``None`` when skipped), and the fitted
        LabelEncoder.
    """
    from scipy.sparse import csr_matrix
    from sklearn.preprocessing import LabelEncoder

    # LabelEncoder works on 1-D input; passing a column vector only makes it
    # emit a conversion warning before flattening it anyway.
    flat_values = df.values.ravel()

    le = LabelEncoder()
    codes = le.fit_transform(flat_values).reshape(-1, 1)

    # Sparse output is the OneHotEncoder default. The explicit ``sparse=True``
    # keyword is omitted because recent scikit-learn releases no longer
    # accept it.
    mlbs = OneHotEncoder().fit(codes)
    features_tmp = csr_matrix(mlbs.transform(codes), dtype=float)

    models = None
    auc_score = None
    if selection is True:
        auc_score, models = train_lightgbm_for_feature_selection(features_tmp, y)
        print(col, "auc", auc_score)

    new_feature = features_tmp
    return new_feature,mlbs,models,auc_score,le
Example 6
Project: stock-price-prediction   Author: chinuy   File: util.py    License: MIT License 6 votes vote down vote up
def preprocessData(dataset):
    """Add a next-day Up/Down target column to a price DataFrame.

    The relative intraday change ``(Close - Open) / Open`` is labelled
    'Up' when it is >= 0 and 'Down' otherwise, integer-encoded with a
    LabelEncoder, and shifted by one row so each row carries the label of
    the following row. The last row, which has no following row, is dropped.

    Note: like the original implementation, this modifies ``dataset`` in
    place (zero ``Open`` values are replaced by 1 and an ``UpDown`` column
    is added) before returning the frame without its last row.

    :param dataset: pandas DataFrame with ``Open`` and ``Close`` columns.
    :return: DataFrame without its last row, with the ``UpDown`` column.
    """
    le = preprocessing.LabelEncoder()

    # Avoid division by zero below. A single .loc assignment replaces the
    # chained ``dataset.Open[mask] = 1`` form, which is not guaranteed to
    # write through to the frame.
    dataset.loc[dataset['Open'] == 0, 'Open'] = 1

    # Prediction target: next day Up/Down.
    threshold = 0.000
    change = (dataset['Close'] - dataset['Open']) / dataset['Open']

    # Build the string labels in one step from the numeric change. Writing
    # 'Up' into the float column first and then comparing the mixed column
    # against the threshold fails on Python 3 (str vs float comparison).
    direction = change.ge(threshold).map({True: 'Up', False: 'Down'})
    dataset['UpDown'] = le.fit_transform(direction)

    # Shift by one so that y is actually the next day's up/down.
    dataset['UpDown'] = dataset['UpDown'].shift(-1)
    # Drop the last row because it has no next-day value.
    dataset = dataset.drop(dataset.index[-1])
    return dataset
Example 7
Project: DeepResearch   Author: Hsankesara   File: prototypicalNet.py    License: MIT License 6 votes vote down vote up
def get_query_y(self, Qy, Qyc, class_label):
        """
        Returns labeled representation of classes of Query set and a list of labels.

        :param Qy: sequence of class labels in the query set.
        :param Qyc: sequence where ``Qyc[i]`` is the number of query samples
            of class ``Qy[i]``.
        :param class_label: unused by this method.
        :return: tuple ``(Query_y, Query_y_labels)``: a long tensor holding
            the integer-encoded label of every query sample (moved to the
            GPU when ``self.gpu`` is set) and the array of unique labels.
        """
        # Expand to one label per query sample: Qy[i] repeated Qyc[i] times.
        labels = []
        for i in range(len(Qy)):
            labels.extend([Qy[i]] * Qyc[i])

        # Keep the labels 1-D: LabelEncoder expects a 1-D array and warns
        # when it has to flatten a column vector.
        labels = np.array(labels)

        label_encoder = LabelEncoder()
        encoded = label_encoder.fit_transform(labels).astype(int)
        Query_y = torch.Tensor(encoded).long()
        if self.gpu:
            Query_y = Query_y.cuda()

        Query_y_labels = np.unique(labels)
        return Query_y, Query_y_labels
Example 8
Project: category_encoders   Author: scikit-learn-contrib   File: loaders.py    License: BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def get_cars_data():
    """
    Read the cars dataset and return features, integer-encoded target and
    the ordinal mapping of each feature column.

    :return: tuple ``(X, y, mapping)``: ``X`` holds every column except
        'class', ``y`` is the label-encoded 'class' column, and ``mapping``
        lists, per column, its levels paired with their ordinal rank.
    """

    def ordinal(col, *levels):
        # Pair every level with its position: [(level0, 0), (level1, 1), ...]
        return {'col': col, 'mapping': [(level, rank) for rank, level in enumerate(levels)]}

    frame = pd.read_csv('source_data/cars/car.data.txt')

    feature_columns = [name for name in frame.columns.values if name != 'class']
    X = frame.reindex(columns=feature_columns)

    target_values = frame.reindex(columns=['class']).values.reshape(-1, )
    y = preprocessing.LabelEncoder().fit_transform(target_values)

    mapping = [
        ordinal('buying', 'vhigh', 'high', 'med', 'low'),
        ordinal('maint', 'vhigh', 'high', 'med', 'low'),
        ordinal('doors', '2', '3', '4', '5more'),
        ordinal('persons', '2', '4', 'more'),
        ordinal('lug_boot', 'small', 'med', 'big'),
        ordinal('safety', 'high', 'med', 'low'),
    ]

    return X, y, mapping
Example 9
Project: category_encoders   Author: scikit-learn-contrib   File: loaders.py    License: BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def get_mushroom_data():
    """
    Read the mushroom dataset and return features and integer-encoded target.

    :return: tuple ``(X, y, mapping)``: ``X`` holds every column except
        'class', ``y`` is the label-encoded 'class' column, and ``mapping``
        is always None.
    """

    frame = pd.read_csv('source_data/mushrooms/agaricus-lepiota.csv')

    feature_columns = [name for name in frame.columns.values if name != 'class']
    X = frame.reindex(columns=feature_columns)

    target_values = frame.reindex(columns=['class']).values.reshape(-1, )
    y = preprocessing.LabelEncoder().fit_transform(target_values)

    # The data is purely categorical with no known ordering, so there is
    # no ordinal mapping to return.
    return X, y, None
Example 10
Project: category_encoders   Author: scikit-learn-contrib   File: loaders.py    License: BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def get_splice_data():
    """
    Read the splice dataset, expand the 'dna' string into one column per
    position, and return features and integer-encoded target.

    :return: tuple ``(X, y, mapping)``: ``X`` holds every column except
        'class', with 'dna' replaced by columns 'dna_0' .. 'dna_59';
        ``y`` is the label-encoded 'class' column; ``mapping`` is always None.
    """

    frame = pd.read_csv('source_data/splice/splice.csv')

    feature_columns = [name for name in frame.columns.values if name != 'class']
    X = frame.reindex(columns=feature_columns)

    # Split every stripped sequence into a list of single characters, then
    # spread the first 60 positions over individual columns.
    X['dna'] = X['dna'].map(lambda seq: list(str(seq).strip()))
    for position in range(60):
        column_name = 'dna_%d' % (position, )
        X[column_name] = X['dna'].map(lambda chars, pos=position: chars[pos])
    del X['dna']

    target_values = frame.reindex(columns=['class']).values.reshape(-1, )
    y = preprocessing.LabelEncoder().fit_transform(target_values)

    # The data is purely categorical with no known ordering, so there is
    # no ordinal mapping to return.
    return X, y, None
Example 11
def get_X_y(**kwargs):
    """Read the tab-separated data file and return preprocessed features and labels.

    Thin wrapper around pd.read_csv: the file at ``info['path']`` is loaded
    (extra keyword arguments are forwarded to pd.read_csv) and passed through
    ``preprocess`` with the module-level ``label_encoder``, so the same
    transformation is not repeated throughout the code.
    """
    # label_encoder is only read here, so no ``global`` declaration is needed.
    frame = pd.read_csv(info['path'], sep='\t', **kwargs)
    return preprocess(frame, label_encoder)

###############################################################################
# Classifier objects in |sklearn| often require :code:`y` to be integer labels.
# Additionally, |APS| requires a binary version of the labels.  For these two
# purposes, we create:
#
# * a |LabelEncoder|, that we pre-fitted on the known :code:`y` classes
# * a |OneHotEncoder|, pre-fitted on the resulting integer labels.
#
# Their |transform| methods can then be called at appropriate times. 
Example 12
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_estimator_checks.py    License: MIT License 6 votes vote down vote up
def fit(self, X, y):
        """
        Fit a deliberately faulty classifier by storing class weights as coef_.

        The class weights are computed from ``self.class_weight`` and the
        classes found in ``y``. When ``class_weight`` is "balanced" the
        weights are intentionally offset by one to simulate a bug.

        :param X: training data (not used by this method).
        :param y: target labels.
        :return: self
        """
        from sklearn.preprocessing import LabelEncoder
        from sklearn.utils import compute_class_weight

        label_encoder = LabelEncoder().fit(y)
        classes = label_encoder.classes_
        # classes and y are passed by keyword: newer scikit-learn releases
        # make them keyword-only, while older ones accept keywords as well.
        class_weight = compute_class_weight(self.class_weight, classes=classes, y=y)

        # Intentionally modify the balanced class_weight
        # to simulate a bug and raise an exception
        if self.class_weight == "balanced":
            class_weight += 1.

        # Simply assigning coef_ to the class_weight
        self.coef_ = class_weight
        return self
Example 13
Project: driverlessai-recipes   Author: h2oai   File: f4_score.py    License: Apache License 2.0 6 votes vote down vote up
def score(self,
              actual: np.array,
              predicted: np.array,
              sample_weight: typing.Optional[np.array] = None,
              labels: typing.Optional[np.array] = None,
              **kwargs) -> float:
        """
        Compute the F-beta score with beta=4.

        ``labels`` and ``actual`` are integer-encoded with a LabelEncoder
        fitted on ``labels``. With more than two labels the prediction is the
        argmax along axis 1 and the score is micro-averaged; otherwise the
        prediction is ``predicted > self._threshold`` and binary averaging
        is used.
        """
        encoder = LabelEncoder()
        labels = encoder.fit_transform(labels)
        actual = encoder.transform(actual)

        more_than_two_labels = len(labels) > 2
        if more_than_two_labels:
            predicted = np.argmax(predicted, axis=1)
        else:
            predicted = (predicted > self._threshold)
        averaging = "micro" if more_than_two_labels else "binary"

        return fbeta_score(actual, predicted, labels=labels, average=averaging,
                           sample_weight=sample_weight, beta=4)
Example 14
Project: driverlessai-recipes   Author: h2oai   File: cost.py    License: Apache License 2.0 6 votes vote down vote up
def score(self,
              actual: np.array,
              predicted: np.array,
              sample_weight: typing.Optional[np.array] = None,
              labels: typing.Optional[np.array] = None,
              **kwargs) -> float:
        """
        Compute the average misclassification cost.

        False positives and false negatives from the confusion matrix are
        weighted by the class attributes ``_fp_cost`` and ``_fn_cost``, and
        the sum is divided by the total (weighted) count.
        """
        # Encode labels and actuals as integers with the same encoder.
        encoder = LabelEncoder()
        labels = encoder.fit_transform(labels)
        actual = encoder.transform(actual)

        # Turn predictions into booleans using the instance threshold.
        predicted = predicted >= self._threshold

        # Unpack the confusion matrix cells.
        matrix = confusion_matrix(actual, predicted, sample_weight=sample_weight, labels=labels)
        tn, fp, fn, tp = matrix.ravel()

        # cost = fp_cost * FP + fn_cost * FN
        cost_source = self.__class__
        weighted_cost = (fp * cost_source._fp_cost) + (fn * cost_source._fn_cost)
        # Dividing by the total weighted count makes the loss invariant to data size.
        total_count = tn + fp + fn + tp
        return weighted_cost / total_count
Example 15
Project: driverlessai-recipes   Author: h2oai   File: f3_score.py    License: Apache License 2.0 6 votes vote down vote up
def score(self,
              actual: np.array,
              predicted: np.array,
              sample_weight: typing.Optional[np.array] = None,
              labels: typing.Optional[np.array] = None,
              **kwargs) -> float:
        """
        Compute the F-beta score with beta=3.

        ``labels`` and ``actual`` are integer-encoded with a LabelEncoder
        fitted on ``labels``. With more than two labels the prediction is the
        argmax along axis 1 and the score is micro-averaged; otherwise the
        prediction is ``predicted > self._threshold`` and binary averaging
        is used.
        """
        encoder = LabelEncoder()
        labels = encoder.fit_transform(labels)
        actual = encoder.transform(actual)

        more_than_two_labels = len(labels) > 2
        if more_than_two_labels:
            predicted = np.argmax(predicted, axis=1)
        else:
            predicted = (predicted > self._threshold)
        averaging = "micro" if more_than_two_labels else "binary"

        return fbeta_score(actual, predicted, labels=labels, average=averaging,
                           sample_weight=sample_weight, beta=3)
Example 16
Project: driverlessai-recipes   Author: h2oai   File: amazon.py    License: Apache License 2.0 6 votes vote down vote up
def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
        """
        Train a CatBoostClassifier on ``X`` and register it on this model.

        ``y`` is integer-encoded with a LabelEncoder fitted on
        ``self.labels``. ``eval_set``, ``sample_weight_eval_set`` and the
        extra keyword arguments are accepted but not used here.
        """
        encoder = LabelEncoder()
        encoder.fit(self.labels)
        y = encoder.transform(y)

        orig_cols = list(X.names)
        XX = X.to_pandas()

        params = dict(
            train_dir=user_dir(),
            allow_writing_files=False,
            thread_count=10,
            # loss_function='Logloss'
        )

        from catboost import CatBoostClassifier
        model = CatBoostClassifier(**params)

        # Amazon specific: the str and int columns are passed as categorical
        # features; there is also no early stopping.
        categorical_columns = list(X[:, [str, int]].names)
        model.fit(XX, y=y, sample_weight=sample_weight, verbose=False,
                  cat_features=categorical_columns)

        # must always set best_iterations
        self.set_model_properties(
            model=model,
            features=orig_cols,
            importances=model.feature_importances_,
            iterations=0,
        )
Example 17
def test_transactional_to_iid():
    """Check the AUC of every '_past_' column produced by TransactionalToIID.create_data()."""
    datasets = TransactionalToIID.create_data()
    for name, X in datasets.items():
        y = LabelEncoder().fit_transform(X[target]).ravel()
        print(name)
        print(X.head(10))
        print(X.tail(10))
        for col in X.names:
            # Only the '_past_' columns are scored.
            if "_past_" not in col:
                continue
            auc = roc_auc_score(y, X[col].to_numpy().ravel())
            print("%s: auc = %f" % (col, auc))
            if "leaky" in col:
                assert auc > 0.75  # all leaky lags must have a lot of signal
                continue
            assert auc > 0.53  # all lags must have signal
            assert auc < 0.8  # but not too much
Example 18
Project: driverlessai-recipes   Author: h2oai   File: feature_selection.py    License: Apache License 2.0 6 votes vote down vote up
def get_feature_importances(data, shuffle, cats=None, seed=None):
    """Fit a LightGBM model and return its per-feature importances.

    Relies on the module-level names ``target``, ``cols2ignore``,
    ``is_regression``, ``importance`` and ``lgbm_params``.

    :param data: DataFrame containing the feature columns and ``target``.
    :param shuffle: when truthy, the target is shuffled before fitting.
    :param cats: categorical feature names passed to ``model.fit``;
        ``None`` (the default) is treated as an empty list.
    :param seed: random seed for the model; the shuffle uses ``seed + 4``.
        With ``seed=None`` both the model and the shuffle are unseeded.
    :return: DataFrame with columns "feature" and "importance".
    """
    # None replaces the former mutable ``[]`` default without changing what
    # is handed to LightGBM.
    cats = [] if cats is None else cats

    # Gather real features
    excluded = [target] + cols2ignore
    train_features = [f for f in data if f not in excluded]

    # Shuffle target if required
    if shuffle:
        # ``seed + 4`` used to raise TypeError when seed was None; fall back
        # to an unseeded shuffle in that case.
        shuffle_seed = None if seed is None else seed + 4
        y = data[target].copy().sample(frac=1.0, random_state=shuffle_seed)
    else:
        y = data[target].copy()

    # Kept from the original; presumably imported for its side effects on
    # how lightgbm gets loaded -- TODO confirm before removing.
    from h2oaicore.lightgbm_dynamic import got_cpu_lgb, got_gpu_lgb
    import lightgbm as lgbm
    if is_regression:
        model = lgbm.LGBMRegressor(random_state=seed, importance_type=importance, **lgbm_params)
    else:
        model = lgbm.LGBMClassifier(random_state=seed, importance_type=importance, **lgbm_params)
        y = LabelEncoder().fit_transform(y)
    # Fit LightGBM in RF mode, yes it's quicker than sklearn RandomForest
    model.fit(data[train_features], y, categorical_feature=cats)

    # Get feature importances
    imp_df = pd.DataFrame()
    imp_df["feature"] = list(train_features)
    imp_df["importance"] = model.feature_importances_

    return imp_df
Example 19
Project: Bidirectiona-LSTM-for-text-summarization-   Author: DeepsMoseli   File: word2vec.py    License: MIT License 5 votes vote down vote up
def summonehot(corpus):
    """Build a one-hot encoding for the most common words of a corpus.

    Every sentence is tokenised with ``wt`` and lower-cased; the vocabulary
    is cut to ``int(n_unique / 1.1)`` most common words, integer-encoded
    with a LabelEncoder and then one-hot encoded.

    :param corpus: iterable of sentences.
    :return: tuple ``(label_encoder, onehot_encoded, annotated)``: the
        fitted LabelEncoder, the dense one-hot matrix (one row per kept
        word) and a dict mapping each cleaned word to its one-hot row.
    """
    allwords = [word.lower() for sent in corpus for word in wt(sent)]
    unique_count = len(set(allwords))
    print(unique_count, "unique characters in corpus")
    #maxcorp=int(input("Enter desired number of vocabulary: "))
    maxcorp = int(unique_count/1.1)
    wordcount = Counter(allwords).most_common(maxcorp)

    allwords = list(set([entry[0] for entry in wordcount]))

    print(len(allwords), "unique characters in corpus after max corpus cut")

    # integer encode
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(allwords)
    integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)

    # one hot: take the default sparse output and densify it explicitly,
    # because the ``sparse=False`` keyword is not accepted by recent
    # scikit-learn releases.
    onehot_encoder = OneHotEncoder()
    onehot_encoded = onehot_encoder.fit_transform(integer_encoded).toarray()

    # make look up dict: row k is the encoding of allwords[k], so the words
    # can be paired with their rows directly instead of decoding every row
    # through argmax + inverse_transform.
    annotated = {}
    for word, onehot_row in zip(allwords, onehot_encoded):
        annotated[cleantext(word).strip()] = onehot_row
    return label_encoder,onehot_encoded,annotated
Example 20
Project: mercari-price-suggestion   Author: aerdem4   File: preprocess_for_nn.py    License: MIT License 5 votes vote down vote up
def __init__(self):
        """Create the tokenizer, one LabelEncoder per categorical column and empty lookup state."""
        categorical_columns = ["brand_name", "subcat_0", "subcat_1", "subcat_2"]

        self.tok_raw = Tokenizer()
        # One independent encoder per categorical column, keyed by column name.
        self.le = {column: LabelEncoder() for column in categorical_columns}
        self.cat_cols = categorical_columns
        self.cat_vocab = {}
        self.freqs = {}
        self.max_freqs = {}
        self.voc = None
Example 21
Project: interpret-text   Author: interpretml   File: utils_classical.py    License: MIT License 5 votes vote down vote up
def __init__(self):
        """Set up the Encoder with its predefined tokenizer, labelEncoder
            and vectorizer.
        """
        # the tokenizer must have a tokenize() and parse() function.
        language = English()
        self.tokenizer = BOWTokenizer(language)
        self.labelEncoder = LabelEncoder()
        # Unigram counts only, tokenised by the tokenizer created above.
        unigram_range = (1, 1)
        self.vectorizer = CountVectorizer(
            tokenizer=self.tokenizer.tokenize, ngram_range=unigram_range
        )
        self.decode_params = {}

    # The keep_ids flag, is used by explain local in the explainer to decode
    # importances over raw features. 
Example 22
Project: interpret-text   Author: interpretml   File: test_classical_explainer.py    License: MIT License 5 votes vote down vote up
def test_explain_model_local_default(self):
        """
        Check explain_local of the classical explainer when no label is
        passed in.
        :return:
        """
        X_train, X_test, y_train, y_test = setup_mnli_test_train_split()

        # Encode the training labels and train the explainer on the encoded ones.
        encoder = LabelEncoder()
        encoded_train_labels = encoder.fit_transform(y_train)
        explainer = ClassicalTextExplainer()
        classifier, best_params = explainer.fit(X_train, encoded_train_labels)
        # Hand the fitted encoder over to the explainer's preprocessor.
        explainer.preprocessor.labelEncoder = encoder

        explanation = explainer.explain_local(DOCUMENT)
        importances = explanation.local_importance_values
        # One importance value is expected per feature.
        assert len(importances) == len(explanation.features)
Example 23
Project: Python-ELM   Author: masaponto   File: ecob_elm.py    License: MIT License 5 votes vote down vote up
def main():
    """Compare ECOBELM and ELM on the iris dataset with repeated 5-fold cross-validation."""
    from sklearn import preprocessing
    from sklearn.datasets import fetch_openml as fetch_mldata
    from sklearn.model_selection import cross_val_score

    db_name = 'iris'
    hid_num = 1000
    data_set = fetch_mldata(db_name, version=1)
    data_set.data = preprocessing.scale(data_set.data)
    data_set.target = preprocessing.LabelEncoder().fit_transform(data_set.target)

    def report_accuracy(estimator):
        # Average the mean 5-fold accuracy over 10 cross-validation runs.
        total = 0
        for _ in range(10):
            fold_scores = cross_val_score(
                estimator, data_set.data, data_set.target, cv=5, scoring='accuracy')
            total += fold_scores.mean()
        total /= 10
        print("Accuracy: %0.2f " % (total))

    print(db_name)
    print('ECOBELM', hid_num)
    report_accuracy(ECOBELM(hid_num, c=2**5))

    print('ELM', hid_num)
    report_accuracy(ELM(hid_num))
Example 24
Project: sato   Author: megagonlabs   File: datasets.py    License: Apache License 2.0 5 votes vote down vote up
def __init__(self,
                 df_dict: Dict[str, pd.DataFrame]=None,
                 tensor_dict: Dict[str, torch.FloatTensor]=None,
                 labels: List[str] =[],
                 label_enc: LabelEncoder = None):
        """
        Build the dataset from pre-computed tensors or from DataFrames.

        :param df_dict: name -> DataFrame; used only when ``tensor_dict`` is
            None. Must be non-empty and all frames must have the same length.
        :param tensor_dict: name -> FloatTensor; takes precedence over
            ``df_dict`` when given.
        :param labels: non-empty list of labels; encoded into ``label_ids``.
        :param label_enc: fitted label encoder; when None a new LabelEncoder
            is fitted on ``labels``.
        :raises AssertionError: if both dicts are None, ``labels`` is empty,
            ``df_dict`` is empty, or the DataFrames differ in length.
        """
        # The message is a plain string; it used to be the result of print(),
        # i.e. None, so a failing assertion carried no message.
        assert not (df_dict is None and tensor_dict is None),\
            'df_dict and tensor_dict can\'t be both None'

        assert len(labels)>0, 'lables can\'t be empty'

        if label_enc is None:
            label_enc = LabelEncoder()
            label_enc.fit(labels)
        self.label_enc = label_enc
        self.label_ids = self.label_enc.transform(labels)

        if tensor_dict is not None:
            self.name_tensor_dict = tensor_dict
            self.f_g_names = list(tensor_dict.keys())
            self.len = tensor_dict[self.f_g_names[0]].shape[0]
        else:
            # df_dict must have at least one key-value pair; check this
            # before indexing the first frame so the failure is the intended
            # AssertionError rather than an IndexError.
            assert len(df_dict) > 0

            # A list, to match the tensor_dict branch (was a dict keys view).
            self.f_g_names = list(df_dict.keys())
            frames = list(df_dict.values())
            self.len = len(frames[0])

            # Make sure each df has the same size
            for df in frames:
                assert len(df) == self.len

            # Convert dataframe into a dictionary of FloatTensor to avoid on-the-fly conversion
            self.name_tensor_dict = {
                name: torch.FloatTensor(df.values.astype('float'))
                for name, df in df_dict.items()
            }
Example 25
Project: dataiku-contrib   Author: dataiku   File: preprocessing.py    License: Apache License 2.0 5 votes vote down vote up
def encode_categorical_features(cls, df):
        """
        Label-encode every non-numeric column of ``df`` in place.

        :param df: DataFrame whose non-numeric columns are replaced by their
            integer codes.
        :return: OrderedDict mapping the position of each encoded column to
            the list of its original classes.
        """
        cat_feature_map = OrderedDict()
        for position, column in enumerate(df):
            # Numeric columns are left untouched.
            if np.issubdtype(df[column].dtype, np.number):
                continue
            column_encoder = LabelEncoder()
            df[column] = column_encoder.fit_transform(df[column])
            #TODO: must ensure the mapping is consistent
            cat_feature_map[position] = column_encoder.classes_.tolist()

        return cat_feature_map
Example 26
Project: santander-product-recommendation-8th-place   Author: yaxen   File: main.py    License: MIT License 5 votes vote down vote up
def label_encode(df, features, name):
    """Label-encode column ``name`` of ``df`` in place and record it in ``features``.

    The column is first cast to str. The encoder is looked up in the
    module-level ``transformers`` dict: a new one is created and fitted the
    first time a name is seen (train), and reused afterwards (test).
    """
    df[name] = df[name].astype('str')
    if name not in transformers: # train
        transformers[name] = LabelEncoder()
        encoded = transformers[name].fit_transform(df[name])
    else: # test
        encoded = transformers[name].transform(df[name])
    df[name] = encoded
    features.append(name)
Example 27
Project: deepchem   Author: deepchem   File: utils.py    License: MIT License 5 votes vote down vote up
def one_hot_encode(sequences):
  """One-hot encode equal-length sequences over the alphabet 'ACGTN'.

  Each character is mapped to an integer code with a LabelEncoder fitted on
  the characters of 'ACGTN', the codes are one-hot encoded with 5 values per
  position, and the result is reshaped so that the selected one-hot channels
  form axis 2 and the sequence positions form axis 3.

  NOTE(review): ``sequences.view(...)`` implies a NumPy array of
  fixed-width strings whose per-character storage matches ``integer_type``
  -- confirm against callers.
  NOTE(review): ``sparse=`` and ``n_values=`` look like arguments of an
  older scikit-learn OneHotEncoder -- confirm the supported version.

  :param sequences: array of sequences; the length of the first one is used
      for all of them.
  :return: array of shape (len(sequences), 1, 4, sequence_length).
  """
  sequence_length = len(sequences[0])
  # Integer type used to reinterpret the string buffers character by character.
  integer_type = np.int8 if sys.version_info[
      0] == 2 else np.int32  # depends on Python version
  # Fit on the codes of 'A', 'C', 'G', 'T', 'N', then encode every character
  # of every sequence and lay the codes out as (n_sequences, sequence_length).
  integer_array = LabelEncoder().fit(
      np.array(('ACGTN',)).view(integer_type)).transform(
          sequences.view(integer_type)).reshape(
              len(sequences), sequence_length)
  # Dense one-hot encoding with 5 possible values per position.
  one_hot_encoding = OneHotEncoder(
      sparse=False, n_values=5, dtype=integer_type).fit_transform(integer_array)

  # Reshape to (n, 1, length, 5), swap the last two axes, and keep channels
  # 0, 1, 2 and 4 of the 5. Presumably this drops 'N', which would be index 3
  # if the encoder orders its classes by character code -- TODO confirm.
  return one_hot_encoding.reshape(len(sequences), 1, sequence_length,
                                  5).swapaxes(2, 3)[:, :, [0, 1, 2, 4], :] 
Example 28
Project: pygbm   Author: ogrisel   File: gradient_boosting.py    License: MIT License 5 votes vote down vote up
def _encode_y(self, y):
        """
        Encode the classes of ``y`` as 0 ... n_classes - 1 (as float32) and
        set the attributes ``classes_`` and ``n_trees_per_iteration_``.
        """
        check_classification_targets(y)

        encoder = LabelEncoder()
        class_indices = encoder.fit_transform(y)
        self.classes_ = encoder.classes_

        # Binary classification needs a single tree per iteration; multiclass
        # classification builds one tree per class.
        n_classes = self.classes_.shape[0]
        if n_classes <= 2:
            self.n_trees_per_iteration_ = 1
        else:
            self.n_trees_per_iteration_ = n_classes

        return class_indices.astype(np.float32, copy=False)
Example 29
Project: malss   Author: canard0328   File: data.py    License: MIT License 5 votes vote down vote up
def __encode(self, X):
        """
        One-hot encode every object-dtype column of ``X``.

        Encoders are created and fitted the first time a column is seen and
        are cached per column position in ``self._label_encoder`` and
        ``self._onehot_encoder``, so later calls reuse the same encoding.

        :param X: DataFrame to encode; it is copied, not modified.
        :return: tuple ``(Xenc, del_columns)``: the encoded copy, in which
            each object column is replaced by columns named
            '<column>_<class>', and the list of replaced column names.
        """
        Xenc = X.copy(deep=True)

        if self._label_encoder is None or self._onehot_encoder is None:
            self._label_encoder = [None] * len(Xenc.columns)
            self._onehot_encoder = [None] * len(Xenc.columns)

        del_columns = []
        # New one-hot columns are appended at the end, so the first
        # n_original positions keep referring to the original columns.
        n_original = len(Xenc.columns)
        for i in range(n_original):
            # Positional access via .iloc: ``dtypes`` is indexed by column
            # label, and integer keys on a label index are deprecated.
            if Xenc.dtypes.iloc[i] != np.dtype('O'):
                continue

            column = Xenc.iloc[:, i]
            if self._label_encoder[i] is None:
                self._label_encoder[i] = LabelEncoder().fit(column)
            col_enc = self._label_encoder[i].transform(column).reshape(-1, 1)

            if self._onehot_encoder[i] is None:
                self._onehot_encoder[i] = OneHotEncoder(categories='auto').fit(col_enc)
            col_onehot = self._onehot_encoder[i].transform(col_enc).toarray()

            col_names = [str(Xenc.columns[i]) + '_' + c
                         for c in self._label_encoder[i].classes_]
            col_onehot = pd.DataFrame(col_onehot, columns=col_names,
                                      index=Xenc.index)
            Xenc = pd.concat([Xenc, col_onehot], axis=1)
            del_columns.append(Xenc.columns[i])

        # Remove the original object columns only after all were encoded, so
        # the positions used above stay valid.
        for col in del_columns:
            del Xenc[col]

        return Xenc, del_columns
Example 30
Project: kaggle_Otto   Author: puyokw   File: kerasNN2_2nd.py    License: MIT License 5 votes vote down vote up
def preprocess_labels(labels, encoder=None, categorical=True):
    """Encode labels as int32 class indices, optionally one-hot encoded.

    :param labels: sequence of raw labels.
    :param encoder: fitted encoder exposing ``transform``; when None a new
        LabelEncoder is created and fitted on ``labels``.
    :param categorical: when truthy, the integer labels are converted with
        ``np_utils.to_categorical``.
    :return: tuple ``(y, encoder)`` with the encoded labels and the encoder
        that produced them.
    """
    # Test for None explicitly: a supplied encoder that happens to be falsy
    # (e.g. one defining __len__) must not be silently replaced.
    if encoder is None:
        encoder = LabelEncoder()
        encoder.fit(labels)
    y = encoder.transform(labels).astype(np.int32)
    if categorical:
        y = np_utils.to_categorical(y)
    return y, encoder