Python sklearn.preprocessing.MultiLabelBinarizer() Examples

The following are 30 code examples of sklearn.preprocessing.MultiLabelBinarizer(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module sklearn.preprocessing , or try the search function

Example #1

Source File: evaluation.py From EUSIPCO2017 with GNU Affero General Public License v3.0

7 votes

def __init__(self, model_module, weights_path, evaluation_strategy="s2"):
        """
        Test metadata format
        ---------------------
        filename : string
        class_ids: string of ints with space as a delimiter
        """
        test_dataset = pd.read_csv(IRMAS_TESTING_META_PATH, names=["filename", "class_ids"])
        self.X = list(test_dataset.filename)
        targets = [[int(category) for category in target.split()] for target in test_dataset.class_ids]
        self.ml_binarizer = MultiLabelBinarizer().fit(targets)
        self.y_true = self.ml_binarizer.transform(targets)

        self.y_pred = np.zeros(shape=self.y_true.shape)
        self.y_pred_raw = np.zeros(shape=self.y_true.shape)
        self.y_pred_raw_average = np.zeros(shape=self.y_true.shape)
        self.model_module = model_module
        self.weights_path = weights_path
        self.feature_filenames = os.listdir(os.path.join(IRMAS_TEST_FEATURE_BASEPATH, model_module.BASE_NAME))
        self.dataset_mean = np.load(os.path.join(MODEL_MEANS_BASEPATH, "{}_mean.npy".format(model_module.BASE_NAME)))
        self.evaluation_strategy = evaluation_strategy
        self.thresholds_s1 = [0.10, 0.12, 0.14, 0.16, 0.18, 0.20, 0.22, 0.24]
        self.thresholds_s2 = [0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60]

Example #2

Source File: data.py From keras-text with MIT License

6 votes

def __init__(self, inputs, labels, test_indices=None, **kwargs):
        """Encapsulates all pieces of data to run an experiment. This is basically a bag of items that makes it
        easy to serialize and deserialize everything as a unit.

        Args:
            inputs: The raw model inputs. This can be set to None if you dont want
                to serialize this value when you save the dataset.
            labels: The raw output labels.
            test_indices: The optional test indices to use. Ideally, this should be generated one time and reused
                across experiments to make results comparable. `generate_test_indices` can be used generate first
                time indices.
            **kwargs: Additional key value items to store.
        """
        self.X = np.array(inputs)
        self.y = np.array(labels)
        for key, value in kwargs.items():
            setattr(self, key, value)

        self._test_indices = None
        self._train_indices = None
        self.test_indices = test_indices

        self.is_multi_label = isinstance(labels[0], (set, list, tuple))
        self.label_encoder = MultiLabelBinarizer() if self.is_multi_label else LabelBinarizer()
        self.y = self.label_encoder.fit_transform(self.y).flatten()

Example #3

Source File: _machine_learning.py From qlik-py-tools with MIT License

6 votes

def text_similarity(df, col):
        """
        Convert strings to their unicode representation and then apply one hot encoding, creating one feature for each unique character in the column. 
        This can be useful when similarity between strings is significant.
        """
        
        unique = pd.DataFrame(df[col].unique(), columns=[col])
        
        encoded = pd.DataFrame(unique.loc[:,col].apply(lambda s: [ord(a) for a in s]), index=unique.index)
        
        mlb = preprocessing.MultiLabelBinarizer()
        encoded = pd.DataFrame(mlb.fit_transform(encoded[col]),columns=mlb.classes_, index=encoded.index).add_prefix(col+"_")
        
        unique = unique.join(encoded)
        
        return unique.set_index(col)

Example #4

Source File: default_processor.py From text2vec with Apache License 2.0

6 votes

def _build_label_dict(self,
                          labels: List[str]):
        from sklearn.preprocessing import MultiLabelBinarizer
        if self.multi_label:
            label_set = set()
            for i in labels:
                label_set = label_set.union(list(i))
        else:
            label_set = set(labels)
        self.label2idx = {}
        for idx, label in enumerate(sorted(label_set)):
            self.label2idx[label] = len(self.label2idx)

        self.idx2label = dict([(value, key) for key, value in self.label2idx.items()])
        self.dataset_info['label_count'] = len(self.label2idx)
        self.multi_label_binarizer = MultiLabelBinarizer(classes=list(self.label2idx.keys()))

Example #5

Source File: train.py From stacks-usecase with Apache License 2.0

6 votes

def feature_vectorizer(X_train, X_test, y_train, y_test):
    """prepare X data with tfidf and y with multi label binarizer"""
    vectorizer = TfidfVectorizer(
        analyzer="word", min_df=0.0,
        max_df=1.0, strip_accents=None,
        encoding="utf-8", preprocessor=None,
        token_pattern=r"(?u)\S\S+", max_features=1000,
    )
    # fit only training data
    vectorizer.fit(X_train)
    _save_data(vectorizer, "/workdir/models/X_vectorizer.pk")
    X_train_features = vectorizer.transform(X_train)
    X_test_features = vectorizer.transform(X_test)
    # use multiLabelBinarizer to create one-hot encoding of labels for y data
    mlb = MultiLabelBinarizer()
    # fit only training data
    mlb.fit(y_train)
    _save_data(mlb, "/workdir/models/label_binarizer.pk")
    y_train_features = mlb.transform(y_train)
    y_test_features = mlb.transform(y_test)
    return X_train_features, X_test_features, y_train_features, y_test_features

Example #6

Source File: pipeline.py From image-captioning-for-mortals with BSD 3-Clause "New" or "Revised" License

6 votes

def prepVect(min_df=2, max_features=50000, n_captions=5, n_sbu=None,
             multilabel=False):
    print "prepping the Word Tokenizer..."
    _0, _1, trY, _3 = coco(mode='full', n_captions=n_captions)
    if n_sbu:
        _4, sbuY, _5 = sbuXYFilenames(n_sbu)
        trY.extend(sbuY)
    vect = Tokenizer(min_df=min_df, max_features=max_features)
    captions = sampleCaptions(trY, n_captions)
    vect.fit(captions)
    if multilabel:
        mlb = MultiLabelBinarizer()
        mlb.fit(vect.transform(captions))
        return vect, mlb
    # if not multilabel:
    return vect

Example #7

Source File: char2ir_gpu.py From plastering with MIT License

6 votes

def evaluate(self, preds):
        acc = eval_func.sequential_accuracy(
            [self.label_dict[srcid] for srcid in preds.keys()],
            [preds[srcid] for srcid in preds.keys()])
        pred = [preds[srcid] for srcid in preds.keys()]
        true = [self.label_dict[srcid] for srcid in preds.keys()]
        mlb = MultiLabelBinarizer()
        mlb.fit(pred + true)
        encoded_true = mlb.transform(true)
        encoded_pred = mlb.transform(pred)
        macro_f1 = f1_score(encoded_true, encoded_pred, average='macro')
        f1 = f1_score(encoded_true, encoded_pred, average='weighted')
        res = {
            'accuracy': acc,
            'f1': f1,
            'macro_f1': macro_f1
        }
        return res

Example #8

Source File: test_preprocessing.py From pandas-ml with BSD 3-Clause "New" or "Revised" License

6 votes

def test_objectmapper(self):
        df = pdml.ModelFrame([])
        self.assertIs(df.preprocessing.Binarizer, pp.Binarizer)
        self.assertIs(df.preprocessing.FunctionTransformer,
                      pp.FunctionTransformer)
        self.assertIs(df.preprocessing.Imputer, pp.Imputer)
        self.assertIs(df.preprocessing.KernelCenterer, pp.KernelCenterer)
        self.assertIs(df.preprocessing.LabelBinarizer, pp.LabelBinarizer)
        self.assertIs(df.preprocessing.LabelEncoder, pp.LabelEncoder)
        self.assertIs(df.preprocessing.MultiLabelBinarizer, pp.MultiLabelBinarizer)
        self.assertIs(df.preprocessing.MaxAbsScaler, pp.MaxAbsScaler)
        self.assertIs(df.preprocessing.MinMaxScaler, pp.MinMaxScaler)
        self.assertIs(df.preprocessing.Normalizer, pp.Normalizer)
        self.assertIs(df.preprocessing.OneHotEncoder, pp.OneHotEncoder)
        self.assertIs(df.preprocessing.PolynomialFeatures, pp.PolynomialFeatures)
        self.assertIs(df.preprocessing.RobustScaler, pp.RobustScaler)
        self.assertIs(df.preprocessing.StandardScaler, pp.StandardScaler)

Example #9

Source File: evaluation.py From BioNEV with MIT License

6 votes

def NodeClassification(embedding_look_up, node_list, labels, testing_ratio, seed):

    X_train, y_train, X_test, y_test = split_train_test_classify(embedding_look_up, node_list, labels,
                                                                 testing_ratio=testing_ratio,seed=seed)
    binarizer = MultiLabelBinarizer(sparse_output=True)
    y_all = np.append(y_train, y_test)
    binarizer.fit(y_all)
    y_train = binarizer.transform(y_train).todense()
    y_test = binarizer.transform(y_test).todense()
    model = OneVsRestClassifier(LogisticRegression(random_state=seed, solver='lbfgs'))
    model.fit(X_train, y_train)
    y_pred_prob = model.predict_proba(X_test)

    ## small trick : we assume that we know how many label to predict
    y_pred = get_y_pred(y_test, y_pred_prob)

    accuracy = accuracy_score(y_test, y_pred)
    micro_f1 = f1_score(y_test, y_pred, average="micro")
    macro_f1 = f1_score(y_test, y_pred, average="macro")

    print('#' * 9 + ' Node Classification Performance ' + '#' * 9)
    print(f'Accuracy: {accuracy:.3f}, Micro-F1: {micro_f1:.3f}, Macro-F1: {macro_f1:.3f}')
    print('#' * 50)
    return accuracy, micro_f1, macro_f1

Example #10

Source File: test_discovery.py From fairtest with Apache License 2.0

6 votes

def setUp(self):
        FILENAME = "../data/images/overfeat_raw.txt"
        data = prepare.data_from_csv(FILENAME, sep='\\t')

        TARGET = 'Labels'
        self.SENS = ['Race']
        self.EXPL = []

        labeled_data = [ast.literal_eval(s) for s in data[TARGET]]
        for l in labeled_data:
            assert len(l) == 5
        label_encoder = preprocessing.MultiLabelBinarizer()
        labeled_data = label_encoder.fit_transform(labeled_data)
        labels = label_encoder.classes_
        df_labels = pd.DataFrame(labeled_data, columns=labels)
        self.data = DataSource(pd.concat([data.drop(TARGET, axis=1), df_labels],
                                         axis=1))
        self.TARGET = labels.tolist()

Example #11

Source File: feature_expansion.py From KDDCup2019_admin with MIT License

6 votes

def cat_onehot_encoder_m(df,y,col,selection=True):
    ## ZJN: test raise memory error
    # raise MemoryError


    mlbs = MultiLabelBinarizer(sparse_output=True).fit(df.values)
    from scipy.sparse import csr_matrix
    features_tmp = mlbs.transform(df.values)
    features_tmp = csr_matrix(features_tmp,dtype=float).tocsr()
    models = None
    auc_score = None
    if selection is True:
        auc_score, models = train_lightgbm_for_feature_selection(features_tmp, y)
        print(col, "auc", auc_score)
    #new_feature = pd.DataFrame(features_tmp,columns=["mul_feature_"+col])
    new_feature = features_tmp
    from scipy.sparse import hstack



    return new_feature,mlbs,models,auc_score

Example #12

Source File: data_helper.py From HFT-CNN with MIT License

6 votes

def build_input_label_data(labels, class_order):
    from sklearn.preprocessing import MultiLabelBinarizer
    from itertools import chain

    bml = MultiLabelBinarizer(classes=class_order, sparse_output=True)
    indexes = sp.find(bml.fit_transform(labels)) 
    y = []

    for i in range(len(labels)):
        y.append([])
    for i,j in zip(indexes[0], indexes[1]):
        y[i].append(j)
    return y

# padding operation
# =========================================================

Example #13

Source File: multi_class_svm.py From JusticeAI with MIT License

5 votes

def __classify_precedent(self, precedent):
        """
        1) The data looks as such: [1, 1, 1, 0, 1, 0, 0, 1...]

        2) We must create a new list with only the index of the columns where
             there are values of '1'. This is necessary because the sklearn
             algorithm expects this kind of input.

        3) Reshape the y data
           The MultiLabelBinarizer expects a series of labels for binarization.
           From all the collected labels, it finds all the unique ones in order
           to figure out how many columns are needed in the vector. From this,
           it will place 1's and 0's accordingly in the columns. For this purpose,
           we cannot create a binarized vector here but instead we return the labels
           which are true for an outcome.

            Example:        (transformation)
            [1, 1, 0, 0, 1] ------------------> [0, 1, 4]

        4) Create a 2D numpy array from the new list:[
            [precedent #1 outcomes],
            [precedent #2 outcomes],
            ...
            ]
        :param precedent:
            dict{
                'facts_vector': [],
                'outcomes_vector': [],
                'demands_vector': []
            }
        :return: np.array([0, 1, 4, ...])
        """
        classified_precedent = []
        outcome_vector = precedent['outcomes_vector']
        for i in range(len(outcome_vector)):
            if outcome_vector[i] >= 1:
                classified_precedent.append(i)
        return classified_precedent

Example #14

Source File: multi_class_svm.py From JusticeAI with MIT License

5 votes

def train(self):
        """
        Train a classifier using Linear SVC
        1) reshape date in a format that sklearn understands
        2) Binarize data for multi output
        3) split training data
        4) train (fit)
        5) test model
        :return: None
        """
        x_total, y_total = self.reshape_dataset()  # 1
        self.mlb = MultiLabelBinarizer()  # 2
        y_total = self.mlb.fit_transform(y_total)

        x_train, x_test, y_train, y_test = train_test_split(
            x_total, y_total, test_size=0.20, random_state=42)  # 3

        Log.write("Sample size: {}".format(len(x_total)))
        Log.write("Train size: {}".format(len(x_train)))
        Log.write("Test size: {}".format(len(x_test)))
        Log.write("Training Classifier Using Multi Class SVM")

        clf = OneVsRestClassifier(SVC(kernel='linear', random_state=42, probability=True))  # 4
        clf.fit(x_train, y_train)
        self.model = clf
        self.__test(x_test, y_test)  # 5

Example #15

Source File: preprocessing.py From arxiv-twitterbot with MIT License

5 votes

def get_features_matrix(df, min_author_freq=3, min_term_freq=30, ngram_range=(1, 3)):
    """Return numpy array of data for sklearn models"""
    text = [title + ' ' + summary for title, summary in zip(df.title.values, df.summary.values)]
    vectorizer = TfidfVectorizer(min_df=min_term_freq, stop_words='english', ngram_range=ngram_range)
    text_features = vectorizer.fit_transform(text).toarray()

    author_counts = pd.Series([a for author_set in df.authors.values for a in author_set]).value_counts()
    allowed_authors = author_counts[author_counts >= min_author_freq].index
    filtered_authors = df.authors.apply(lambda authors: [a for a in authors if a in allowed_authors])

    author_binarizer = MultiLabelBinarizer()
    author_features = author_binarizer.fit_transform(filtered_authors.values)

    category_dummies = pd.get_dummies(df.category)
    category_features = category_dummies.values

    all_features = [text_features, author_features, category_features]

    x = np.concatenate(all_features, axis=1)

    if 'tweeted' in df:
        y = df.tweeted.astype(int).values
    else:
        y = None

    feature_names = np.concatenate((vectorizer.get_feature_names(),
                                    category_dummies.columns.values,
                                    author_binarizer.classes_))
    return x, y, feature_names

Example #16

Source File: optimization.py From sports-betting with MIT License

5 votes

def calculate_yields(score1, score2, bets, odds, targets):
    """Calculate the yields."""

    # Check odds
    odds = check_array(odds)
    targets = check_array(targets, dtype=object, ensure_2d=False)

    # Generate yields
    bets = MultiLabelBinarizer(classes=['-'] + targets.tolist()).fit_transform([[bet] for bet in bets])[:, 1:]
    yields = ((extract_multi_labels(score1, score2, targets) * odds - 1.0) * bets).sum(axis=1)

    return yields

Example #17

Source File: classify.py From BioNEV with MIT License

5 votes

def __init__(self, vectors, clf):
        self.embeddings = vectors
        self.clf = TopKRanker(clf)
        self.binarizer = MultiLabelBinarizer(sparse_output=True)

Example #18

Source File: test_BRKNN.py From Quadflor with BSD 3-Clause "New" or "Revised" License

5 votes

def test_BRKnnb_auto_optimize_k(self):
        data = csr.csr_matrix([[0, 1], [1, 1], [0, 1.1], [1.1, 1]])
        train_ids = [['lid0', 'lid1'], ['lid0', 'lid1'], ['lid2', 'lid3'], ['lid0', 'lid1']]
        mlb = MultiLabelBinarizer()
        y = mlb.fit_transform(train_ids)

        knn = BRKNeighborsClassifier(mode='b', n_neighbor_candidates=[1, 3], auto_optimize_k=True)

        # noinspection PyUnusedLocal
        def fun(s, X, y_):
            return data[[1, 2, 3]], data[[0]], y[[1, 2, 3]], y[[0]]

        BRKNeighborsClassifier._get_split = fun
        knn.fit(data, y)
        self.assertEquals(3, knn.n_neighbors)
        pred = knn.predict(csr.csr_matrix([[0.1, 1], [2, 2]])).todense()
        np.testing.assert_array_equal([[1, 1, 0, 0], [1, 1, 0, 0]], pred)

        # def test_time_brknnb(self):
        #     times = []
        #     X = sp.rand(10000, 5000, density=0.005, format='csr')
        #     y = sp.rand(10000, 3000, density=0.005, format='csr')
        #     knn = BRKNeighborsClassifier(n_neighbors=100)
        #     knn.fit(X,y)
        #     X_test = sp.rand(1000, 5000, density=0.005, format ='csr')
        #     for _ in range(5):
        #         start = default_timer()
        #         knn.predict(X_test)
        #         times.append(default_timer() - start)
        #     print(np.mean(times))

Example #19

Source File: multiclass.py From sk-dist with Apache License 2.0

5 votes

def fit(self, X, y, **fit_params):
        """
        Fit underlying estimators. Parallelize fit operation using spark.

        Args:
            X (array-like, shape = [n_samples, n_features]): input data
            y (array-like, shape = [n_samples, ], [n_samples, n_classes]): multi-class targets
            **fit_params (dict of string -> object): parameters passed 
                to the ``fit`` method of the estimator
        """
        _check_estimator(self, verbose=self.verbose)

        if (not self.mlb_override and not hasattr(y[0], '__array__') 
                and isinstance(y[0], Sequence)
                and not isinstance(y[0], str)):
            self.mlb = MultiLabelBinarizer()
            y = self.mlb.fit_transform(y)

        if isinstance(X, pd.DataFrame):
            X.index = list(range(len(X)))

        self.label_binarizer_ = LabelBinarizer(sparse_output=True)
        self.label_binarizer_.fit(y)
        self.classes_ = self.label_binarizer_.classes_
        self._fit(X, y, **fit_params)
        del self.sc
        if hasattr(self.estimator, "sc"):
            del self.estimator.sc
        return self

Example #20

Source File: test_BRKNN.py From Quadflor with BSD 3-Clause "New" or "Revised" License

5 votes

def test_BRKnnb_predict_dense(self):
        data = csr.csr_matrix([[0, 1], [1, 1], [1.5, 1], [0.5, 1]])
        train_ids = [['lid0', 'lid1'], ['lid0', 'lid1'], ['lid4', 'lid3'], ['lid4', 'lid5']]
        mlb = MultiLabelBinarizer(sparse_output=False)
        y = mlb.fit_transform(train_ids)

        knn = BRKNeighborsClassifier(mode='b', n_neighbors=3)
        knn.fit(data, y)

        pred = knn.predict(csr.csr_matrix([[0, 1]])).todense()
        np.testing.assert_array_equal([[1, 1, 0, 0, 0]], pred)

Example #21

Source File: test_BRKNN.py From Quadflor with BSD 3-Clause "New" or "Revised" License

5 votes

def test_BRKnnb_predict(self):
        data = csr.csr_matrix([[0, 1], [1, 1], [1.5, 1], [0.5, 1]])
        train_ids = [['lid0', 'lid1'], ['lid0', 'lid1'], ['lid4', 'lid3'], ['lid4', 'lid5']]
        mlb = MultiLabelBinarizer(sparse_output=True)
        y = mlb.fit_transform(train_ids)

        knn = BRKNeighborsClassifier(mode='b', n_neighbors=3)
        knn.fit(data, y)

        pred = knn.predict(csr.csr_matrix([[0, 1]])).todense()
        np.testing.assert_array_equal([[1, 1, 0, 0, 0]], pred)

Example #22

Source File: test_BRKNN.py From Quadflor with BSD 3-Clause "New" or "Revised" License

5 votes

def test_BRKnna_predict_dense(self):
        data = csr.csr_matrix([[0, 1], [1, 1], [1, 1.1], [0.5, 1]])
        train_ids = [['lid0', 'lid1'], ['lid2', 'lid3'], ['lid4', 'lid3'], ['lid4', 'lid5']]
        mlb = MultiLabelBinarizer()
        y = mlb.fit_transform(train_ids)

        knn = BRKNeighborsClassifier(threshold=0.5, n_neighbors=3, mode='a')
        knn.fit(data, y)

        pred = knn.predict(csr.csr_matrix([[1.1, 1.1]])).todense()
        np.testing.assert_array_equal([[0, 0, 0, 1, 1, 0]], pred)

Example #23

Source File: modularity_explicitness.py From disentanglement_lib with Apache License 2.0

5 votes

def explicitness_per_factor(mus_train, y_train, mus_test, y_test):
  """Compute explicitness score for a factor as ROC-AUC of a classifier.

  Args:
    mus_train: Representation for training, (num_codes, num_points)-np array.
    y_train: Ground truth factors for training, (num_factors, num_points)-np
      array.
    mus_test: Representation for testing, (num_codes, num_points)-np array.
    y_test: Ground truth factors for testing, (num_factors, num_points)-np
      array.

  Returns:
    roc_train: ROC-AUC score of the classifier on training data.
    roc_test: ROC-AUC score of the classifier on testing data.
  """
  x_train = np.transpose(mus_train)
  x_test = np.transpose(mus_test)
  clf = LogisticRegression().fit(x_train, y_train)
  y_pred_train = clf.predict_proba(x_train)
  y_pred_test = clf.predict_proba(x_test)
  mlb = MultiLabelBinarizer()
  roc_train = roc_auc_score(mlb.fit_transform(np.expand_dims(y_train, 1)),
                            y_pred_train)
  roc_test = roc_auc_score(mlb.fit_transform(np.expand_dims(y_test, 1)),
                           y_pred_test)
  return roc_train, roc_test

Example #24

Source File: test_BRKNN.py From Quadflor with BSD 3-Clause "New" or "Revised" License

5 votes

def test_BRKnna_predict(self):
        data = csr.csr_matrix([[0, 1], [1, 1], [1, 1.1], [0.5, 1]])
        train_ids = [['lid0', 'lid1'], ['lid2', 'lid3'], ['lid4', 'lid3'], ['lid4', 'lid5']]
        mlb = MultiLabelBinarizer(sparse_output=True)
        y = mlb.fit_transform(train_ids)

        knn = BRKNeighborsClassifier(threshold=0.5, n_neighbors=3, mode='a')
        knn.fit(data, y)

        pred = knn.predict(csr.csr_matrix([[1.1, 1.1]])).todense()
        np.testing.assert_array_equal([[0, 0, 0, 1, 1, 0]], pred)

Example #25

Source File: eval_func.py From plastering with MIT License

5 votes

def binarize_labels(true_labels, pred_labels, excluding_labels=[]):
    excluding_labels = ['building-ebu3b']
    srcids = list(pred_labels.keys())
    tot_labels = [[label for label in labels if label not in excluding_labels]
                   for labels in list(pred_labels.values()) + list(true_labels.values())
                  ]
    mlb = MultiLabelBinarizer().fit(tot_labels)
    pred_mat = mlb.transform(pred_labels.values())
    true_mat = mlb.transform(true_labels.values())
    return true_mat, pred_mat

Example #26

Source File: classify.py From OpenNE with MIT License

5 votes

def __init__(self, vectors, clf):
        self.embeddings = vectors
        self.clf = TopKRanker(clf)
        self.binarizer = MultiLabelBinarizer(sparse_output=True)

Example #27

Source File: test_multilabel_realdata.py From libact with BSD 2-Clause "Simplified" License

5 votes

def setUp(self):
        dataset_filepath = os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            'datasets/yeast_train.svm')
        X, y = load_svmlight_file(dataset_filepath, multilabel=True)
        self.X = X.todense().tolist()
        self.y = MultiLabelBinarizer().fit_transform(y).tolist()
        self.quota = 10

Example #28

Source File: discovery.py From fairtest with Apache License 2.0

5 votes

def main(argv=sys.argv):
    if len(argv) != 1:
        usage(argv)

    FILENAME = "../../../data/recommender/recommendations.txt"
    OUTPUT_DIR = "."
    data = prepare.data_from_csv(FILENAME, sep='\\t',
                                 to_drop=['RMSE', 'Avg Movie Age',
                                          'Avg Recommended Rating',
                                          'Avg Seen Rating', 'Occupation'])
    TARGET = 'Types'
    SENS = ['Gender']

    EXPL = []
    labeled_data = [ast.literal_eval(s) for s in data[TARGET]]
    for labels in labeled_data:
        assert len(labels) == 5
    label_encoder = preprocessing.MultiLabelBinarizer()
    labeled_data = label_encoder.fit_transform(labeled_data)
    labels = label_encoder.classes_
    df_labels = pd.DataFrame(labeled_data, columns=labels)
    data = pd.concat([data.drop(TARGET, axis=1), df_labels], axis=1)
    TARGET = labels.tolist()

    data_source = DataSource(data)

    # Instantiate the experiment
    inv = Discovery(data_source, SENS, TARGET, EXPL, topk=10, random_state=0)

    # Train the classifier
    train([inv])

    # Evaluate on the testing set
    test([inv])

    # Create the report
    report([inv], "discovery", OUTPUT_DIR)

Example #29

Source File: classify.py From GraphEmbedding with MIT License

5 votes

def __init__(self, embeddings, clf):
        self.embeddings = embeddings
        self.clf = TopKRanker(clf)
        self.binarizer = MultiLabelBinarizer(sparse_output=True)

Example #30

Source File: preprocess.py From gnn-benchmark with MIT License

5 votes

def binarize_labels(labels, sparse_output=False, return_classes=False):
    """Convert labels vector to a binary label matrix.

    In the default single-label case, labels look like
    labels = [y1, y2, y3, ...].
    Also supports the multi-label format.
    In this case, labels should look something like
    labels = [[y11, y12], [y21, y22, y23], [y31], ...].

    Parameters
    ----------
    labels : array-like, shape [num_samples]
        Array of node labels in categorical single- or multi-label format.
    sparse_output : bool, default False
        Whether return the label_matrix in CSR format.
    return_classes : bool, default False
        Whether return the classes corresponding to the columns of the label matrix.

    Returns
    -------
    label_matrix : np.ndarray or sp.csr_matrix, shape [num_samples, num_classes]
        Binary matrix of class labels.
        num_classes = number of unique values in "labels" array.
        label_matrix[i, k] = 1 <=> node i belongs to class k.
    classes : np.array, shape [num_classes], optional
        Classes that correspond to each column of the label_matrix.

    """
    if hasattr(labels[0], '__iter__'):  # labels[0] is iterable <=> multilabel format
        binarizer = MultiLabelBinarizer(sparse_output=sparse_output)
    else:
        binarizer = LabelBinarizer(sparse_output=sparse_output)
    label_matrix = binarizer.fit_transform(labels).astype(np.float32)
    return (label_matrix, binarizer.classes_) if return_classes else label_matrix