Python sklearn.preprocessing.LabelEncoder() Examples
The following are 30 code examples of sklearn.preprocessing.LabelEncoder(), drawn from open-source projects.
The original project and source file for each example are noted above it.
You may also want to check out all available functions and classes of the module sklearn.preprocessing.
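As a quick orientation before the project-specific examples, here is a minimal, self-contained sketch of the typical LabelEncoder round trip (the string labels are illustrative only):

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
labels = ["dog", "cat", "fish", "dog"]

# fit_transform learns the lexicographically sorted class vocabulary
# and returns integer codes
encoded = le.fit_transform(labels)
print(le.classes_)                    # ['cat' 'dog' 'fish']
print(encoded)                        # [1 0 2 1]

# inverse_transform maps integer codes back to the original labels
print(le.inverse_transform(encoded))  # ['dog' 'cat' 'fish' 'dog']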

Example #1
Source File: test_classical_explainer.py From interpret-text with MIT License
def test_explain_model_local_with_predicted_label(self):
    """
    Test for explain_local of classical explainer
    :return:
    """
    X_train, X_test, y_train, y_test = setup_mnli_test_train_split()
    label_encoder = LabelEncoder()
    y_train = label_encoder.fit_transform(y_train)
    explainer = ClassicalTextExplainer()
    classifier, best_params = explainer.fit(X_train, y_train)
    explainer.preprocessor.labelEncoder = label_encoder
    y = classifier.predict(DOCUMENT)
    predicted_label = label_encoder.inverse_transform(y)
    local_explanation = explainer.explain_local(DOCUMENT, predicted_label)
    assert len(local_explanation.local_importance_values) == len(local_explanation.features)
Example #2
Source File: scikitlearn.py From razzy-spinner with GNU General Public License v3.0
def __init__(self, estimator, dtype=float, sparse=True):
    """
    :param estimator: scikit-learn classifier object.

    :param dtype: data type used when building feature array.
        scikit-learn estimators work exclusively on numeric data. The
        default value should be fine for almost all situations.

    :param sparse: Whether to use sparse matrices internally.
        The estimator must support these; not all scikit-learn classifiers
        do (see their respective documentation and look for "sparse
        matrix"). The default value is True, since most NLP problems
        involve sparse feature sets. Setting this to False may take a
        great amount of memory.
    :type sparse: boolean.
    """
    self._clf = estimator
    self._encoder = LabelEncoder()
    self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)
Example #3
Source File: datasets.py From sato with Apache License 2.0
def __init__(self, corpus, sherlock_features: List[str] = None,
             topic_feature: str = None, label_enc: LabelEncoder = None,
             id_filter: List[str] = None, max_col_count: int = None,
             shuffle_group: str = None):
    super().__init__(corpus, sherlock_features, topic_feature,
                     label_enc, id_filter, max_col_count)
    l = len(self.df_header)
    self.tempcorpus = corpus
    self.shuffle_group = shuffle_group
    self.prng = np.random.RandomState(SEED)
    self.shuffle_order = self.prng.permutation(l)
Example #4
Source File: datasets.py From sato with Apache License 2.0
def __init__(self, df_dict: Dict[str, pd.DataFrame] = None,
             tensor_dict: Dict[str, torch.FloatTensor] = None,
             labels: List[str] = [], label_enc: LabelEncoder = None,
             shuffle_group: str = None):
    super().__init__(df_dict, tensor_dict, labels, label_enc)
    l = self.__len__()
    self.shuffle_group = shuffle_group
    prng = np.random.RandomState(SEED)
    self.shuffle_order = prng.permutation(l)
Example #5
Source File: feature_expansion.py From KDDCup2019_admin with MIT License
def cat_onehot_encoder(df, y, col, selection=True):
    feat_x = df.values.reshape(-1, 1)

    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    le.fit(feat_x)
    feat_x = le.transform(feat_x)

    mlbs = OneHotEncoder(sparse=True).fit(feat_x.reshape(-1, 1))
    from scipy.sparse import csr_matrix
    features_tmp = mlbs.transform(feat_x.reshape(-1, 1))
    features_tmp = csr_matrix(features_tmp, dtype=float).tocsr()

    models = None
    auc_score = None
    if selection is True:
        auc_score, models = train_lightgbm_for_feature_selection(features_tmp, y)
        print(col, "auc", auc_score)

    # new_feature = pd.DataFrame(features_tmp, columns=["mul_feature_" + col])
    new_feature = features_tmp

    return new_feature, mlbs, models, auc_score, le
Example #6
Source File: util.py From stock-price-prediction with MIT License
def preprocessData(dataset):
    le = preprocessing.LabelEncoder()

    # guard against divide-by-zero
    dataset.Open[dataset.Open == 0] = 1

    # add prediction target: next day Up/Down
    threshold = 0.000
    dataset['UpDown'] = (dataset['Close'] - dataset['Open']) / dataset['Open']
    dataset.UpDown[dataset.UpDown >= threshold] = 'Up'
    dataset.UpDown[dataset.UpDown < threshold] = 'Down'
    dataset.UpDown = le.fit(dataset.UpDown).transform(dataset.UpDown)
    dataset.UpDown = dataset.UpDown.shift(-1)  # shift by 1, so y is the next day's up/down
    dataset = dataset.drop(dataset.index[-1])  # drop the last row because it has no up/down value
    return dataset
Example #7
Source File: prototypicalNet.py From DeepResearch with MIT License
def get_query_y(self, Qy, Qyc, class_label):
    """
    Returns labeled representation of classes of Query set and a list of labels.
    """
    labels = []
    m = len(Qy)
    for i in range(m):
        labels += [Qy[i]] * Qyc[i]
    labels = np.array(labels).reshape(len(labels), 1)
    label_encoder = LabelEncoder()
    Query_y = torch.Tensor(
        label_encoder.fit_transform(labels).astype(int)).long()
    if self.gpu:
        Query_y = Query_y.cuda()
    Query_y_labels = np.unique(labels)
    return Query_y, Query_y_labels
Example #8
Source File: loaders.py From category_encoders with BSD 3-Clause "New" or "Revised" License
def get_cars_data():
    """
    Load the cars dataset, split it into X and y, and then call the label encoder to get an integer y column.

    :return:
    """
    df = pd.read_csv('source_data/cars/car.data.txt')
    X = df.reindex(columns=[x for x in df.columns.values if x != 'class'])
    y = df.reindex(columns=['class'])
    y = preprocessing.LabelEncoder().fit_transform(y.values.reshape(-1, ))

    mapping = [
        {'col': 'buying', 'mapping': [('vhigh', 0), ('high', 1), ('med', 2), ('low', 3)]},
        {'col': 'maint', 'mapping': [('vhigh', 0), ('high', 1), ('med', 2), ('low', 3)]},
        {'col': 'doors', 'mapping': [('2', 0), ('3', 1), ('4', 2), ('5more', 3)]},
        {'col': 'persons', 'mapping': [('2', 0), ('4', 1), ('more', 2)]},
        {'col': 'lug_boot', 'mapping': [('small', 0), ('med', 1), ('big', 2)]},
        {'col': 'safety', 'mapping': [('high', 0), ('med', 1), ('low', 2)]},
    ]

    return X, y, mapping
Example #9
Source File: loaders.py From category_encoders with BSD 3-Clause "New" or "Revised" License
def get_mushroom_data():
    """
    Load the mushroom dataset, split it into X and y, and then call the label encoder to get an integer y column.

    :return:
    """
    df = pd.read_csv('source_data/mushrooms/agaricus-lepiota.csv')
    X = df.reindex(columns=[x for x in df.columns.values if x != 'class'])
    y = df.reindex(columns=['class'])
    y = preprocessing.LabelEncoder().fit_transform(y.values.reshape(-1, ))

    # this data is truly categorical, with no known concept of ordering
    mapping = None

    return X, y, mapping
Example #10
Source File: loaders.py From category_encoders with BSD 3-Clause "New" or "Revised" License
def get_splice_data():
    """
    Load the splice dataset, split it into X and y, and then call the label encoder to get an integer y column.

    :return:
    """
    df = pd.read_csv('source_data/splice/splice.csv')
    X = df.reindex(columns=[x for x in df.columns.values if x != 'class'])
    X['dna'] = X['dna'].map(lambda x: list(str(x).strip()))
    for idx in range(60):
        X['dna_%d' % (idx, )] = X['dna'].map(lambda x: x[idx])
    del X['dna']

    y = df.reindex(columns=['class'])
    y = preprocessing.LabelEncoder().fit_transform(y.values.reshape(-1, ))

    # this data is truly categorical, with no known concept of ordering
    mapping = None

    return X, y, mapping
Example #11
Source File: 05_scaling_non_linear_models.py From dirty_cat with BSD 3-Clause "New" or "Revised" License
def get_X_y(**kwargs):
    """Simple wrapper around pd.read_csv that extracts features and labels.

    Some systematic preprocessing is also carried out to avoid doing this
    transformation repeatedly in the code.
    """
    global label_encoder
    df = pd.read_csv(info['path'], sep='\t', **kwargs)
    return preprocess(df, label_encoder)


###############################################################################
# Classifier objects in |sklearn| often require :code:`y` to be integer labels.
# Additionally, |APS| requires a binary version of the labels. For these two
# purposes, we create:
#
# * a |LabelEncoder|, that we pre-fit on the known :code:`y` classes
# * a |OneHotEncoder|, pre-fitted on the resulting integer labels.
#
# Their |transform| methods can then be called at appropriate times.
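The pre-fitting pattern described in the comment block above can be sketched as follows. This is an illustration, not the dirty_cat example's actual code: the y values and variable names are invented, and OneHotEncoder(sparse=False) assumes an older scikit-learn (newer releases use sparse_output=False instead):

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

y_known = ["functional", "broken", "functional"]  # hypothetical class labels

# pre-fit both encoders once, on the known classes
label_encoder = LabelEncoder().fit(y_known)
integer_labels = label_encoder.transform(y_known).reshape(-1, 1)
one_hot_encoder = OneHotEncoder(sparse=False).fit(integer_labels)

# later, transform() can be called on new label batches as needed
y_int = label_encoder.transform(["broken", "functional"])  # integer labels
y_bin = one_hot_encoder.transform(y_int.reshape(-1, 1))    # binary (one-hot) labels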
Example #12
Source File: test_estimator_checks.py From Mastering-Elasticsearch-7.0 with MIT License
def fit(self, X, y):
    from sklearn.preprocessing import LabelEncoder
    from sklearn.utils import compute_class_weight

    label_encoder = LabelEncoder().fit(y)
    classes = label_encoder.classes_
    class_weight = compute_class_weight(self.class_weight, classes, y)

    # Intentionally modify the balanced class_weight
    # to simulate a bug and raise an exception
    if self.class_weight == "balanced":
        class_weight += 1.

    # Simply assigning coef_ to the class_weight
    self.coef_ = class_weight
    return self
Example #13
Source File: f4_score.py From driverlessai-recipes with Apache License 2.0
def score(self,
          actual: np.array,
          predicted: np.array,
          sample_weight: typing.Optional[np.array] = None,
          labels: typing.Optional[np.array] = None,
          **kwargs) -> float:
    lb = LabelEncoder()
    labels = lb.fit_transform(labels)
    actual = lb.transform(actual)

    method = "binary"
    if len(labels) > 2:
        predicted = np.argmax(predicted, axis=1)
        method = "micro"
    else:
        predicted = (predicted > self._threshold)

    f4_score = fbeta_score(actual, predicted, labels=labels, average=method,
                           sample_weight=sample_weight, beta=4)
    return f4_score
Example #14
Source File: cost.py From driverlessai-recipes with Apache License 2.0
def score(self,
          actual: np.array,
          predicted: np.array,
          sample_weight: typing.Optional[np.array] = None,
          labels: typing.Optional[np.array] = None,
          **kwargs) -> float:
    # label actuals as 1 or 0
    lb = LabelEncoder()
    labels = lb.fit_transform(labels)
    actual = lb.transform(actual)

    # label predictions as 1 or 0
    predicted = predicted >= self._threshold

    # use sklearn to get fp and fn
    cm = confusion_matrix(actual, predicted, sample_weight=sample_weight, labels=labels)
    tn, fp, fn, tp = cm.ravel()

    # calculate `$1*FP + $2*FN`
    return ((fp * self.__class__._fp_cost) + (fn * self.__class__._fn_cost)) / (
        tn + fp + fn + tp)  # divide by total weighted count to make loss invariant to data size
Example #15
Source File: f3_score.py From driverlessai-recipes with Apache License 2.0
def score(self,
          actual: np.array,
          predicted: np.array,
          sample_weight: typing.Optional[np.array] = None,
          labels: typing.Optional[np.array] = None,
          **kwargs) -> float:
    lb = LabelEncoder()
    labels = lb.fit_transform(labels)
    actual = lb.transform(actual)

    method = "binary"
    if len(labels) > 2:
        predicted = np.argmax(predicted, axis=1)
        method = "micro"
    else:
        predicted = (predicted > self._threshold)

    f3_score = fbeta_score(actual, predicted, labels=labels, average=method,
                           sample_weight=sample_weight, beta=3)
    return f3_score
Example #16
Source File: amazon.py From driverlessai-recipes with Apache License 2.0
def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
    lb = LabelEncoder()
    lb.fit(self.labels)
    y = lb.transform(y)

    orig_cols = list(X.names)
    XX = X.to_pandas()
    params = {
        'train_dir': user_dir(),
        'allow_writing_files': False,
        'thread_count': 10,
        # 'loss_function': 'Logloss'
    }
    from catboost import CatBoostClassifier
    model = CatBoostClassifier(**params)
    model.fit(XX, y=y, sample_weight=sample_weight, verbose=False,
              cat_features=list(X[:, [str, int]].names))  # Amazon specific, also no early stopping

    # must always set best_iterations
    self.set_model_properties(model=model,
                              features=orig_cols,
                              importances=model.feature_importances_,
                              iterations=0)
Example #17
Source File: create_transactional_data_or_convert_to_iid.py From driverlessai-recipes with Apache License 2.0
def test_transactional_to_iid():
    ret = TransactionalToIID.create_data()
    for name, X in ret.items():
        le = LabelEncoder()
        y = le.fit_transform(X[target]).ravel()
        print(name)
        print(X.head(10))
        print(X.tail(10))
        for col in X.names:
            if "_past_" in col:
                auc = roc_auc_score(y, X[col].to_numpy().ravel())
                print("%s: auc = %f" % (col, auc))
                if "leaky" not in col:
                    assert auc > 0.53  # all lags must have signal
                    assert auc < 0.8   # but not too much
                else:
                    assert auc > 0.75  # all leaky lags must have a lot of signal
Example #18
Source File: word2vec.py From Bidirectiona-LSTM-for-text-summarization- with MIT License
def summonehot(corpus):
    allwords = []
    annotated = {}
    for sent in corpus:
        for word in wt(sent):
            allwords.append(word.lower())
    print(len(set(allwords)), "unique characters in corpus")

    # maxcorp = int(input("Enter desired number of vocabulary: "))
    maxcorp = int(len(set(allwords)) / 1.1)
    wordcount = Counter(allwords).most_common(maxcorp)
    allwords = []
    for p in wordcount:
        allwords.append(p[0])
    allwords = list(set(allwords))
    print(len(allwords), "unique characters in corpus after max corpus cut")

    # integer encode
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(allwords)

    # one hot
    onehot_encoder = OneHotEncoder(sparse=False)
    integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
    onehot_encoded = onehot_encoder.fit_transform(integer_encoded)

    # make look-up dict
    for k in range(len(onehot_encoded)):
        inverted = cleantext(label_encoder.inverse_transform([argmax(onehot_encoded[k, :])])[0]).strip()
        annotated[inverted] = onehot_encoded[k]
    return label_encoder, onehot_encoded, annotated
Example #19
Source File: preprocess_for_nn.py From mercari-price-suggestion with MIT License
def __init__(self):
    self.tok_raw = Tokenizer()
    self.le = {}
    self.cat_cols = ["brand_name", "subcat_0", "subcat_1", "subcat_2"]
    self.cat_vocab = {}
    for cat in self.cat_cols:
        self.le[cat] = LabelEncoder()
    self.freqs = {}
    self.max_freqs = {}
    self.voc = None
Example #20
Source File: utils_classical.py From interpret-text with MIT License
def __init__(self):
    """Initializes the Encoder object and sets internal tokenizer,
    labelEncoder and vectorizer using predefined objects.
    """
    self.tokenizer = BOWTokenizer(
        English()
    )  # the tokenizer must have a tokenize() and parse() function.
    self.labelEncoder = LabelEncoder()
    self.vectorizer = CountVectorizer(
        tokenizer=self.tokenizer.tokenize, ngram_range=(1, 1)
    )
    self.decode_params = {}
    # The keep_ids flag is used by explain_local in the explainer to decode
    # importances over raw features.
Example #21
Source File: test_classical_explainer.py From interpret-text with MIT License
def test_explain_model_local_default(self):
    """
    Test for explain_local of classical explainer
    :return:
    """
    X_train, X_test, y_train, y_test = setup_mnli_test_train_split()
    label_encoder = LabelEncoder()
    y_train = label_encoder.fit_transform(y_train)
    explainer = ClassicalTextExplainer()
    classifier, best_params = explainer.fit(X_train, y_train)
    explainer.preprocessor.labelEncoder = label_encoder
    local_explanation = explainer.explain_local(DOCUMENT)
    assert len(local_explanation.local_importance_values) == len(local_explanation.features)
Example #22
Source File: ecob_elm.py From Python-ELM with MIT License
def main():
    from sklearn import preprocessing
    from sklearn.datasets import fetch_openml as fetch_mldata
    from sklearn.model_selection import cross_val_score

    db_name = 'iris'
    hid_num = 1000
    data_set = fetch_mldata(db_name, version=1)
    data_set.data = preprocessing.scale(data_set.data)
    data_set.target = preprocessing.LabelEncoder().fit_transform(data_set.target)

    print(db_name)

    print('ECOBELM', hid_num)
    e = ECOBELM(hid_num, c=2**5)
    ave = 0
    for i in range(10):
        scores = cross_val_score(
            e, data_set.data, data_set.target, cv=5, scoring='accuracy')
        ave += scores.mean()
    ave /= 10
    print("Accuracy: %0.2f " % (ave))

    print('ELM', hid_num)
    e = ELM(hid_num)
    ave = 0
    for i in range(10):
        scores = cross_val_score(
            e, data_set.data, data_set.target, cv=5, scoring='accuracy')
        ave += scores.mean()
    ave /= 10
    print("Accuracy: %0.2f " % (ave))
Example #23
Source File: datasets.py From sato with Apache License 2.0
def __init__(self, df_dict: Dict[str, pd.DataFrame] = None,
             tensor_dict: Dict[str, torch.FloatTensor] = None,
             labels: List[str] = [], label_enc: LabelEncoder = None):
    # Note: the original used print() as the assertion message, which
    # evaluates to None; plain strings are used here instead.
    assert not (df_dict is None and tensor_dict is None), \
        "df_dict and tensor_dict can't be both None"
    assert len(labels) > 0, "labels can't be empty"

    if label_enc is None:
        label_enc = LabelEncoder()
        label_enc.fit(labels)
    self.label_enc = label_enc
    self.label_ids = self.label_enc.transform(labels)

    if tensor_dict is not None:
        self.name_tensor_dict = tensor_dict
        self.f_g_names = list(tensor_dict.keys())
        self.len = tensor_dict[self.f_g_names[0]].shape[0]
    else:
        self.f_g_names = df_dict.keys()
        self.len = len(list(df_dict.values())[0])

        # df_dict must have at least one key-value pair
        assert len(df_dict) > 0
        # Make sure each df has the same size
        for name, df in df_dict.items():
            assert len(df) == len(list(df_dict.values())[0])

        # Convert dataframe into a dictionary of FloatTensor to avoid on-the-fly conversion
        self.name_tensor_dict = {}
        for name, df in df_dict.items():
            self.name_tensor_dict[name] = torch.FloatTensor(df.values.astype('float'))
Example #24
Source File: preprocessing.py From dataiku-contrib with Apache License 2.0
def encode_categorical_features(cls, df):
    cat_feature_map = OrderedDict()
    for pos, f in enumerate(df):
        if not np.issubdtype(df[f].dtype, np.number):
            encoder = LabelEncoder()
            df[f] = encoder.fit_transform(df[f])  # TODO: must ensure the mapping is consistent
            cat_feature_map[pos] = encoder.classes_.tolist()
    return cat_feature_map
Example #25
Source File: main.py From santander-product-recommendation-8th-place with MIT License
def label_encode(df, features, name):
    df[name] = df[name].astype('str')
    if name in transformers:  # test
        df[name] = transformers[name].transform(df[name])
    else:  # train
        transformers[name] = LabelEncoder()
        df[name] = transformers[name].fit_transform(df[name])
    features.append(name)
Example #26
Source File: utils.py From deepchem with MIT License
def one_hot_encode(sequences):
    sequence_length = len(sequences[0])
    integer_type = np.int8 if sys.version_info[0] == 2 else np.int32  # depends on Python version
    integer_array = LabelEncoder().fit(
        np.array(('ACGTN',)).view(integer_type)).transform(
            sequences.view(integer_type)).reshape(
                len(sequences), sequence_length)
    one_hot_encoding = OneHotEncoder(
        sparse=False, n_values=5, dtype=integer_type).fit_transform(integer_array)
    return one_hot_encoding.reshape(
        len(sequences), 1, sequence_length, 5).swapaxes(2, 3)[:, :, [0, 1, 2, 4], :]
Example #27
Source File: gradient_boosting.py From pygbm with MIT License
def _encode_y(self, y):
    # encode classes into 0 ... n_classes - 1 and set attributes classes_
    # and n_trees_per_iteration_
    check_classification_targets(y)

    label_encoder = LabelEncoder()
    encoded_y = label_encoder.fit_transform(y)
    self.classes_ = label_encoder.classes_
    n_classes = self.classes_.shape[0]
    # only 1 tree for binary classification. For multiclass classification,
    # we build 1 tree per class.
    self.n_trees_per_iteration_ = 1 if n_classes <= 2 else n_classes
    encoded_y = encoded_y.astype(np.float32, copy=False)
    return encoded_y
Example #28
Source File: data.py From malss with MIT License
def __encode(self, X):
    Xenc = X.copy(deep=True)
    if self._label_encoder is None or self._onehot_encoder is None:
        self._label_encoder = [None] * len(Xenc.columns)
        self._onehot_encoder = [None] * len(Xenc.columns)
    del_columns = []
    for i in range(len(Xenc.columns)):
        if Xenc.dtypes[i] == np.dtype('O'):
            if self._label_encoder[i] is None:
                self._label_encoder[i] = LabelEncoder().fit(Xenc.iloc[:, i])
            col_enc = self._label_encoder[i].transform(Xenc.iloc[:, i])
            if self._onehot_encoder[i] is None:
                self._onehot_encoder[i] = OneHotEncoder(categories='auto').fit(
                    col_enc.reshape(-1, 1))
            col_onehot = np.array(self._onehot_encoder[i].transform(
                col_enc.reshape(-1, 1)).todense())
            col_names = [str(Xenc.columns[i]) + '_' + c
                         for c in self._label_encoder[i].classes_]
            col_onehot = pd.DataFrame(col_onehot, columns=col_names,
                                      index=Xenc.index)
            Xenc = pd.concat([Xenc, col_onehot], axis=1)
            del_columns.append(Xenc.columns[i])
    for col in del_columns:
        del Xenc[col]
    return Xenc, del_columns
Example #29
Source File: kerasNN2_2nd.py From kaggle_Otto with MIT License
def preprocess_labels(labels, encoder=None, categorical=True):
    if not encoder:
        encoder = LabelEncoder()
        encoder.fit(labels)
    y = encoder.transform(labels).astype(np.int32)
    if categorical:
        y = np_utils.to_categorical(y)
    return y, encoder
Example #30
Source File: kerasNN4_tfidf.py From kaggle_Otto with MIT License
def preprocess_labels(labels, encoder=None, categorical=True):
    if not encoder:
        encoder = LabelEncoder()
        encoder.fit(labels)
    y = encoder.transform(labels).astype(np.int32)
    if categorical:
        y = np_utils.to_categorical(y)
    return y, encoder