Python sklearn.preprocessing.LabelBinarizer() Examples

The following are 30 code examples of sklearn.preprocessing.LabelBinarizer(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module sklearn.preprocessing , or try the search function .
Example #1
Source File: mmbot.py    From MaliciousMacroBot with MIT License 8 votes vote down vote up
def mmb_evaluate_model(self):
        """
        Returns scores from cross validation evaluation on the malicious / benign classifier
        """
        predictive_features = self.features['predictive_features']
        self.clf_X = self.modeldata[predictive_features].values
        self.clf_y = np.array(self.modeldata['label'])

        X_train, X_test, y_train, y_test = train_test_split(self.clf_X, self.clf_y, test_size=0.2, random_state=0)
        lb = LabelBinarizer()
        y_train = np.array([number[0] for number in lb.fit_transform(y_train)])
        eval_cls = RandomForestClassifier(n_estimators=100, max_features=.2)
        eval_cls.fit(X_train, y_train)

        recall = cross_val_score(eval_cls, X_train, y_train, cv=5, scoring='recall')
        precision = cross_val_score(eval_cls, X_train, y_train, cv=5, scoring='precision')
        accuracy = cross_val_score(eval_cls, X_train, y_train, cv=5, scoring='accuracy')
        f1_score = cross_val_score(eval_cls, X_train, y_train, cv=5, scoring='f1_macro')

        return {'accuracy': accuracy, 'f1': f1_score, 'precision': precision, 'recall': recall} 
Example #2
Source File: test_k_neighbors_classifier.py    From coremltools with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def test_conversion_with_sparse_y(self):
        """Tests conversion of a model that's fitted with y values in a sparse format."""
        from sklearn.model_selection import train_test_split

        X_train, X_test, y_train, y_test = train_test_split(
            self.iris_X, self.iris_y, test_size=0.2, train_size=0.8
        )

        from sklearn import preprocessing

        lb = preprocessing.LabelBinarizer(sparse_output=True)
        binarized_y = lb.fit_transform(y_train)

        sklearn_model = KNeighborsClassifier(algorithm="brute")
        sklearn_model.fit(X_train, binarized_y)

        self.assertRaises(ValueError, sklearn.convert, sklearn_model) 
Example #3
Source File: test_preprocessing.py    From pandas-ml with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def test_LabelBinarizer(self):
        arr = np.array([1, 2, 3, 2])
        s = pdml.ModelSeries(arr, index=['a', 'b', 'c', 'd'])

        mod1 = s.pp.LabelBinarizer()
        s.fit(mod1)
        result = s.transform(mod1)

        expected = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1], [0, 1, 0]])

        self.assertIsInstance(result, pdml.ModelFrame)
        self.assert_numpy_array_almost_equal(result.values, expected)
        tm.assert_index_equal(result.index, s.index)

        mod1 = s.pp.LabelBinarizer()
        result = s.fit_transform(mod1)

        self.assertIsInstance(result, pdml.ModelFrame)
        self.assert_numpy_array_almost_equal(result.values, expected)

        inversed = result.inverse_transform(mod1)
        self.assertIsInstance(inversed, pdml.ModelFrame)
        self.assert_numpy_array_almost_equal(inversed.values.flatten(), arr)
        tm.assert_index_equal(result.index, s.index) 
Example #4
Source File: test_sklearn_label_binariser_converter.py    From sklearn-onnx with MIT License 6 votes vote down vote up
def test_model_label_binariser_binary_labels(self):
        X = np.array([1, 0, 0, 0, 1])
        model = LabelBinarizer().fit(X)
        model_onnx = convert_sklearn(
            model,
            "scikit-learn label binariser",
            [("input", Int64TensorType([None]))],
        )
        self.assertTrue(model_onnx is not None)
        dump_data_and_model(
            X.astype(np.int64),
            model,
            model_onnx,
            basename="SklearnLabelBinariserBinaryLabels",
            allow_failure="StrictVersion("
            "onnxruntime.__version__)"
            "<= StrictVersion('0.2.1')",
        ) 
Example #5
Source File: test_sklearn_label_binariser_converter.py    From sklearn-onnx with MIT License 6 votes vote down vote up
def test_model_label_binariser_neg_label(self):
        X = np.array([1, 2, 6, 4, 2])
        model = LabelBinarizer(neg_label=-101).fit(X)
        model_onnx = convert_sklearn(
            model,
            "scikit-learn label binariser",
            [("input", Int64TensorType([None]))],
        )
        self.assertTrue(model_onnx is not None)
        dump_data_and_model(
            X.astype(np.int64),
            model,
            model_onnx,
            basename="SklearnLabelBinariserNegLabel",
            allow_failure="StrictVersion("
            "onnxruntime.__version__)"
            "<= StrictVersion('0.2.1')",
        ) 
Example #6
Source File: test_sklearn_label_binariser_converter.py    From sklearn-onnx with MIT License 6 votes vote down vote up
def test_model_label_binariser_neg_pos_label(self):
        X = np.array([1, 2, 6, 4, 2])
        model = LabelBinarizer(neg_label=10, pos_label=20).fit(X)
        model_onnx = convert_sklearn(
            model,
            "scikit-learn label binariser",
            [("input", Int64TensorType([None]))],
        )
        self.assertTrue(model_onnx is not None)
        dump_data_and_model(
            X.astype(np.int64),
            model,
            model_onnx,
            basename="SklearnLabelBinariserNegPosLabel",
            allow_failure="StrictVersion("
            "onnxruntime.__version__)"
            "<= StrictVersion('0.2.1')",
        ) 
Example #7
Source File: test_preprocessing.py    From pandas-ml with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def test_objectmapper(self):
        df = pdml.ModelFrame([])
        self.assertIs(df.preprocessing.Binarizer, pp.Binarizer)
        self.assertIs(df.preprocessing.FunctionTransformer,
                      pp.FunctionTransformer)
        self.assertIs(df.preprocessing.Imputer, pp.Imputer)
        self.assertIs(df.preprocessing.KernelCenterer, pp.KernelCenterer)
        self.assertIs(df.preprocessing.LabelBinarizer, pp.LabelBinarizer)
        self.assertIs(df.preprocessing.LabelEncoder, pp.LabelEncoder)
        self.assertIs(df.preprocessing.MultiLabelBinarizer, pp.MultiLabelBinarizer)
        self.assertIs(df.preprocessing.MaxAbsScaler, pp.MaxAbsScaler)
        self.assertIs(df.preprocessing.MinMaxScaler, pp.MinMaxScaler)
        self.assertIs(df.preprocessing.Normalizer, pp.Normalizer)
        self.assertIs(df.preprocessing.OneHotEncoder, pp.OneHotEncoder)
        self.assertIs(df.preprocessing.PolynomialFeatures, pp.PolynomialFeatures)
        self.assertIs(df.preprocessing.RobustScaler, pp.RobustScaler)
        self.assertIs(df.preprocessing.StandardScaler, pp.StandardScaler) 
Example #8
Source File: lstm.py    From mindmeld with Apache License 2.0 6 votes vote down vote up
def setup_model(self, config):
        self.set_params(**config.params)
        self.label_encoder = LabelBinarizer()
        self.gaz_encoder = LabelBinarizer()

        self.graph = tf.Graph()
        self.saver = None

        self.example_type = config.example_type
        self.features = config.features

        self.query_encoder = WordSequenceEmbedding(
            self.padding_length,
            self.token_embedding_dimension,
            self.token_pretrained_embedding_filepath,
        )

        if self.use_char_embeddings:
            self.char_encoder = CharacterSequenceEmbedding(
                self.padding_length,
                self.character_embedding_dimension,
                self.max_char_per_word,
            ) 
Example #9
Source File: lstm.py    From mindmeld with Apache License 2.0 6 votes vote down vote up
def _gaz_transform(self, list_of_tokens_to_transform):
        """This function is used to handle special logic around SKLearn's LabelBinarizer
        class which behaves in a non-standard way for 2 classes. In a 2 class system,
        it encodes the classes as [0] and [1]. However, in a 3 class system, it encodes
        the classes as [0,0,1], [0,1,0], [1,0,0] and sustains this behavior for num_class > 2.

        We want to encode 2 class systems as [0,1] and [1,0]. This function does that.

        Args:
            list_of_tokens_to_transform (list): A sequence of class labels

        Returns:
            (array): corrected encoding from the binarizer
        """
        output = self.gaz_encoder.transform(list_of_tokens_to_transform)
        if len(self.gaz_encoder.classes_) == 2:
            output = np.hstack((1 - output, output))
        return output 
Example #10
Source File: train_model.py    From production-tools with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def get_mnist_data():
    """Loads the MNIST data set into memory.

    Returns
    -------
    X : array-like, shape=[n_samples, n_features]
        Training data for the MNIST data set.
        
    y : array-like, shape=[n_samples,]
        Labels for the MNIST data set.
    """
    digits = load_digits()
    X, y = digits.data, digits.target
    y = LabelBinarizer().fit_transform(y)

    return X, y 
Example #11
Source File: test_classify.py    From recordlinkage with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def test_sklearn_labelbin(self):

        m = np.array([1.0, .81, .85, .81, .85, .81])
        u = np.array([1.0, .23, .50, .23, .30, 0.13])

        # Create the train dataset.
        X_train, true_links = binary_vectors(
            1000, 500, m=m, u=u, random_state=535, return_links=True)

        binarizer = LabelBinarizer()
        binarizer.fit(X_train.iloc[:, 0])
        assert len(binarizer.classes_) == 1

        binarizer.classes_ = np.array([0, 1])
        assert len(binarizer.classes_) == 2

        binarizer.transform(X_train.iloc[:, 1])
        assert len(binarizer.classes_) == 2 
Example #12
Source File: stacking.py    From stacked_generalization with Apache License 2.0 6 votes vote down vote up
def _get_child_predict(self, clf, X, index=None):
        if self.stack_by_proba and hasattr(clf, 'predict_proba'):
            if self.save_stage0 and index is not None:
                proba = util.saving_predict_proba(clf, X, index)
            else:
                proba = clf.predict_proba(X)
            return proba[:, 1:]
        elif hasattr(clf, 'predict'):
            predict_result = clf.predict(X)
            if isinstance(clf, ClassifierMixin):
                lb = LabelBinarizer()
                lb.fit(predict_result)
                return lb.fit_transform(predict_result)
            else:
                return predict_result.reshape((predict_result.size, 1))
        else:
            return clf.fit_transform(X) 
Example #13
Source File: test_sklearn_label_binariser_converter.py    From sklearn-onnx with MIT License 6 votes vote down vote up
def test_model_label_binariser_pos_label(self):
        X = np.array([1, 2, 6, 4, 2])
        model = LabelBinarizer(pos_label=123).fit(X)
        model_onnx = convert_sklearn(
            model,
            "scikit-learn label binariser",
            [("input", Int64TensorType([None]))],
        )
        self.assertTrue(model_onnx is not None)
        dump_data_and_model(
            X.astype(np.int64),
            model,
            model_onnx,
            basename="SklearnLabelBinariserPosLabel",
            allow_failure="StrictVersion("
            "onnxruntime.__version__)"
            "<= StrictVersion('0.2.1')",
        ) 
Example #14
Source File: test_sklearn_label_binariser_converter.py    From sklearn-onnx with MIT License 6 votes vote down vote up
def test_model_label_binariser_default(self):
        X = np.array([1, 2, 6, 4, 2])
        model = LabelBinarizer().fit(X)
        model_onnx = convert_sklearn(
            model,
            "scikit-learn label binariser",
            [("input", Int64TensorType([None]))],
        )
        self.assertTrue(model_onnx is not None)
        dump_data_and_model(
            X.astype(np.int64),
            model,
            model_onnx,
            basename="SklearnLabelBinariserDefault",
            allow_failure="StrictVersion("
            "onnxruntime.__version__)"
            "<= StrictVersion('0.2.1')",
        ) 
Example #15
Source File: base.py    From polylearn with BSD 2-Clause "Simplified" License 6 votes vote down vote up
def _check_X_y(self, X, y):

        # helpful error message for sklearn < 1.17
        is_2d = hasattr(y, 'shape') and len(y.shape) > 1 and y.shape[1] >= 2

        if is_2d or type_of_target(y) != 'binary':
            raise TypeError("Only binary targets supported. For training "
                            "multiclass or multilabel models, you may use the "
                            "OneVsRest or OneVsAll metaestimators in "
                            "scikit-learn.")

        X, Y = check_X_y(X, y, dtype=np.double, accept_sparse='csc',
                         multi_output=False)

        self.label_binarizer_ = LabelBinarizer(pos_label=1, neg_label=-1)
        y = self.label_binarizer_.fit_transform(Y).ravel().astype(np.double)
        return X, y 
Example #16
Source File: SpectraLearnPredict.py    From SpectralMachine with GNU General Public License v3.0 6 votes vote down vote up
def formatClass(rootFile, Cl):
    import sklearn.preprocessing as pp
    print('==========================================================================\n')
    print(' Running basic TensorFlow. Creating class data in binary form...')
    Cl2 = pp.LabelBinarizer().fit_transform(Cl)
    
    import matplotlib.pyplot as plt
    plt.hist([float(x) for x in Cl], bins=np.unique([float(x) for x in Cl]), edgecolor="black")
    plt.xlabel('Class')
    plt.ylabel('Occurrances')
    plt.title('Class distibution')
    plt.savefig(rootFile + '_ClassDistrib.png', dpi = 160, format = 'png')  # Save plot
    if tfDef.plotClassDistribTF == True:
        print(' Plotting Class distibution \n')
        plt.show()
    
    return Cl2

#******************************************************************************** 
Example #17
Source File: test_mlp_classifier.py    From muffnn with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def test_cross_val_predict():
    # Make sure it works in cross_val_predict for multiclass.

    X, y = load_iris(return_X_y=True)
    y = LabelBinarizer().fit_transform(y)
    X = StandardScaler().fit_transform(X)

    mlp = MLPClassifier(n_epochs=10,
                        solver_kwargs={'learning_rate': 0.05},
                        random_state=4567).fit(X, y)

    cv = KFold(n_splits=4, random_state=457, shuffle=True)
    y_oos = cross_val_predict(mlp, X, y, cv=cv, method='predict_proba')
    auc = roc_auc_score(y, y_oos, average=None)

    assert np.all(auc >= 0.96) 
Example #18
Source File: crf_unit.py    From medical-entity-recognition with Apache License 2.0 6 votes vote down vote up
def bio_classification_report(y_true, y_pred):
    """
    Classification report for a l ist of BIOSE-encoded sequences.
    It computes token-level metrics and discards 'O' labels.
    :param y_true:
    :param y_pred:
    :return:
    """
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(y_true)
    y_pred_combined = lb.transform(y_pred)

    tagset = set(lb.classes_) - {'O'}
    tagset = set(lb.classes_)
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {
        cls: idx for idx, cls in enumerate(lb.classes_)
    }

    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels=[class_indices[cls] for cls in tagset],
        target_names=tagset
    ) 
Example #19
Source File: crf_sent_tagger.py    From Jiayan with MIT License 6 votes vote down vote up
def eval(self, test_x, test_y, crf_model):
        tagger = pycrfsuite.Tagger()
        tagger.open(crf_model)

        y_pred = []
        for feat_list in test_x:
            preds = tagger.tag(feat_list)
            y_pred.append(preds)

        lb = LabelBinarizer()
        y_true_all = lb.fit_transform(list(chain.from_iterable(test_y)))
        y_pred_all = lb.transform(list(chain.from_iterable(y_pred)))

        tagset = sorted(set(lb.classes_))
        class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}

        print(classification_report(
            y_true_all,
            y_pred_all,
            labels=[class_indices[cls] for cls in tagset],
            target_names=tagset,
            digits=5
        )) 
Example #20
Source File: xgboost.py    From sklearn2pmml with GNU Affero General Public License v3.0 6 votes vote down vote up
def make_xgboost_dataframe_mapper(dtypes, missing_value_aware = True):
	"""Construct a DataFrameMapper for feeding complex data into an XGBModel.

	Parameters
	----------

	dtypes: iterable of tuples (column, dtype)

	missing_value_aware: boolean
		If true, use missing value aware transformers.

	Returns
	-------
	DataFrameMapper

	"""
	features = list()
	for column, dtype in dtypes.items():
		if _is_categorical(dtype):
			features.append(([column], PMMLLabelBinarizer(sparse_output = True) if missing_value_aware else LabelBinarizer(sparse_output = True)))
		else:
			features.append(([column], None))
	return DataFrameMapper(features) 
Example #21
Source File: crf_pos_tagger.py    From Jiayan with MIT License 6 votes vote down vote up
def eval(self, test_x, test_y, crf_model):
        tagger = pycrfsuite.Tagger()
        tagger.open(crf_model)

        y_pred = []
        for feat_list in test_x:
            preds = tagger.tag(feat_list)
            y_pred.append(preds)

        lb = LabelBinarizer()
        y_true_all = lb.fit_transform(list(chain.from_iterable(test_y)))
        y_pred_all = lb.transform(list(chain.from_iterable(y_pred)))

        tagset = sorted(set(lb.classes_))
        class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}

        print(classification_report(
            y_true_all,
            y_pred_all,
            labels=[class_indices[cls] for cls in tagset],
            target_names=tagset,
            digits=5
        )) 
Example #22
Source File: char2ir_gpu.py    From plastering with MIT License 6 votes vote down vote up
def encode_labels(self, label_dict, srcids):
        flat_labels = ['O']
        if self.use_brick_flag:
            with open('brick/tags.json', 'r') as fp:
                brick_tags = json.load(fp)
            flat_labels += ['B_' + tag for tag in brick_tags] + \
                           ['I_' + tag for tag in brick_tags]
        flat_labels += reduce(adder, [reduce(adder, label_dict[srcid].values()) for srcid in srcids])
        self.le = LabelBinarizer().fit(flat_labels)
        stack = []
        for srcid in srcids:
            labels = label_dict[srcid]
            sentences = self.sentence_dict[srcid]
            for metadata_type in self.sentence_dict[srcid].keys():
                labels = label_dict[srcid][metadata_type]
                if len(labels) == 0:
                    encoded = np.zeros((self.max_len, encoded.shape[1]))
                else:
                    encoded = self.le.transform(labels)
                    encoded = np.vstack([encoded, np.zeros(
                                         (self.max_len - encoded.shape[0],
                                          encoded.shape[1]))])
                stack.append(encoded)
        return np.stack(stack) 
Example #23
Source File: one_against_rest.py    From qiskit-aqua with Apache License 2.0 6 votes vote down vote up
def train(self, x, y):
        """
        Training multiple estimators each for distinguishing a pair of classes.

        Args:
            x (numpy.ndarray): input points
            y (numpy.ndarray): input labels
        Raises:
            Exception: given all data points are assigned to the same class,
                        the prediction would be boring
        """
        self.label_binarizer_ = LabelBinarizer(neg_label=0)
        Y = self.label_binarizer_.fit_transform(y)
        self.classes = self.label_binarizer_.classes_
        columns = (np.ravel(col) for col in Y.T)
        self.estimators = []
        for _, column in enumerate(columns):
            unique_y = np.unique(column)
            if len(unique_y) == 1:
                raise Exception("given all data points are assigned to the same class, "
                                "the prediction would be boring.")
            estimator = self.estimator_cls(*self.params)
            estimator.fit(x, column)
            self.estimators.append(estimator) 
Example #24
Source File: training.py    From OpenCV-3-x-with-Python-By-Example with MIT License 6 votes vote down vote up
def __init__(self, feature_vector_size, label_words):
        self.ann = cv2.ml.ANN_MLP_create()
        # Number of centroids used to build the feature vectors
        input_size = feature_vector_size
        # Number of models to recongnize
        output_size = len(label_words)
        # Applying Heaton rules
        hidden_size = (input_size * (2 / 3)) + output_size
        nn_config = np.array([input_size, hidden_size, output_size], dtype=np.uint8)
        self.label_words = label_words
        self.ann.setLayerSizes(np.array(nn_config))
        # Symmetrical Sigmoid as activation function
        self.ann.setActivationFunction(cv2.ml.ANN_MLP_SIGMOID_SYM)
        # Map models as tuples of probabilities
        self.le = preprocessing.LabelBinarizer()
        self.le.fit(label_words)  # Label words are ['dress', 'footwear', 'backpack'] 
Example #25
Source File: elm.py    From SVM-CNN with Apache License 2.0 6 votes vote down vote up
def __init__(self, n_hidden=20, alpha=0.5, rbf_width=1.0,
                 activation_func='tanh', activation_args=None,
                 user_components=None, regressor=None,
                 binarizer=LabelBinarizer(-1, 1),
                 random_state=None):

        super(ELMClassifier, self).__init__(n_hidden=n_hidden,
                                            alpha=alpha,
                                            random_state=random_state,
                                            activation_func=activation_func,
                                            activation_args=activation_args,
                                            user_components=user_components,
                                            rbf_width=rbf_width,
                                            regressor=regressor)

        self.classes_ = None
        self.binarizer = binarizer 
Example #26
Source File: naive_bayes.py    From plume with MIT License 6 votes vote down vote up
def fit(self, X, y):
        """
        :param X_: shape = [n_samples, n_features] 
        :param y: shape = [n_samples] 
        :return: self
        """
        labelbin = LabelBinarizer()
        Y = labelbin.fit_transform(y)
        self.classes = labelbin.classes_
        self.class_count = np.zeros(Y.shape[1], dtype=np.float64)
        self.feature_count = np.zeros((Y.shape[1], X.shape[1]),
                                      dtype=np.float64)

        self.feature_count += Y.T @ X
        self.class_count += Y.sum(axis=0)
        smoothed_fc = self.feature_count + self.alpha
        smoothed_cc = smoothed_fc.sum(axis=1)

        self.feature_log_prob = (np.log(smoothed_fc) -
                                 np.log(smoothed_cc.reshape(-1, 1))) 
Example #27
Source File: transformers.py    From scikit-optimize with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def __init__(self):
        """Convert labeled categories into one-hot encoded features."""
        self._lb = LabelBinarizer() 
Example #28
Source File: calc_auc_roc.py    From zincbase with MIT License 5 votes vote down vote up
def calc_auc_roc(truth, pred, average="macro"):

    lb = LabelBinarizer()
    lb.fit(truth)
    truth = lb.transform(truth)
    pred = lb.transform(pred)
    return roc_auc_score(truth, pred, average=average) 
Example #29
Source File: community_weighting.py    From reveal-graph-embedding with Apache License 2.0 5 votes vote down vote up
def chi2_contingency_matrix(X_train, y_train):
    X = X_train.copy()
    X.data = np.ones_like(X.data)

    X = check_array(X, accept_sparse='csr')
    if np.any((X.data if issparse(X) else X) < 0):
        raise ValueError("Input X must be non-negative.")

    Y = LabelBinarizer().fit_transform(y_train)
    if Y.shape[1] == 1:
        Y = np.append(1 - Y, Y, axis=1)

    observed = safe_sparse_dot(Y.T, X)  # n_classes * n_features

    # feature_count = check_array(X.sum(axis=0))
    # class_prob = check_array(Y.mean(axis=0))
    feature_count = X.sum(axis=0).reshape(1, -1)
    class_prob = Y.mean(axis=0).reshape(1, -1)
    expected = np.dot(class_prob.T, feature_count)

    observed = np.asarray(observed, dtype=np.float64)

    k = len(observed)
    # Reuse observed for chi-squared statistics
    contingency_matrix = observed
    contingency_matrix -= expected
    contingency_matrix **= 2

    expected[expected == 0.0] = 1.0

    contingency_matrix /= expected

    # weights = contingency_matrix.max(axis=0)

    return contingency_matrix 
Example #30
Source File: loadData.py    From birdsong-keras with GNU General Public License v3.0 5 votes vote down vote up
def getOneHotOFGS(df):
    from sklearn.preprocessing import LabelBinarizer
    lb = LabelBinarizer()
    lb.fit(df["OFGS"])
    return ( lb, lb.transform(df["OFGS"]) )

# create onehot encoding for classid