Python sklearn.model_selection.StratifiedShuffleSplit() Examples

The following are code examples showing how to use sklearn.model_selection.StratifiedShuffleSplit(). They are extracted from open source Python projects.
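
Before the project examples, here is a minimal, self-contained sketch of the basic pattern most of them follow. It is not taken from any of the projects below; the toy X and y arrays and the parameter values are purely illustrative.

import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

X = np.arange(20).reshape(10, 2)   # 10 samples, 2 features
y = np.array([0] * 6 + [1] * 4)    # imbalanced binary labels

# Each split is an independent random shuffle that preserves the 6:4 class
# ratio of y in both the train and test index arrays.
sss = StratifiedShuffleSplit(n_splits=3, test_size=0.3, random_state=0)
for train_index, test_index in sss.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]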

Example 1
Project: Neural_Temporality_Adaptation   Author: xiaoleihuang   File: data_helper.py    Apache License 2.0
def stratified_split(dataset, test=0.2, label_idx=-1, cv_num=1):
    """
    random stratified split by label, will update for year later
    :param dataset: list of dataset
    :param test:
    :param label_idx:
    :param cv_num: the number of cross_validation
    :return:
    """
    sss = StratifiedShuffleSplit(n_splits=cv_num, test_size=test, random_state=0)
    y = [item[label_idx] for item in dataset]

    train_idx, test_idx = [], []

    for train_index, test_index in sss.split(dataset, y):  # avoid shadowing the `test` parameter
        train_idx.append(train_index)
        test_idx.append(test_index)
    return train_idx, test_idx 
Example 2
Project: cwcf   Author: jaromiru   File: hpc_svm.py    MIT License
def get_full_rbf_svm_clf(train_x, train_y, c_range=None, gamma_range=None):
		param_grid = dict(gamma=gamma_range, C=c_range)
		cv = StratifiedShuffleSplit(n_splits=2, test_size=0.2, random_state=42)
		grid = GridSearchCV(SVC(cache_size=1024), param_grid=param_grid, cv=cv, n_jobs=14, verbose=10)
		grid.fit(train_x, train_y)
		
		print("The best parameters are %s with a score of %0.2f" % (grid.best_params_, grid.best_score_))
		
		scores = grid.cv_results_['mean_test_score'].reshape(len(c_range), len(gamma_range))
		print("Scores:")
		print(scores)
		
		print("c_range:", c_range)
		print("gamma_range:", gamma_range)

		c_best = grid.best_params_['C']
		gamma_best = grid.best_params_['gamma']

		clf = SVC(C=c_best, gamma=gamma_best, verbose=True)
		return clf

#---------------- 
Example 3
Project: brain-score   Author: brain-score   File: transformations.py    MIT License
def __init__(self,
                 splits=Defaults.splits, train_size=None, test_size=None,
                 split_coord=Defaults.split_coord, stratification_coord=Defaults.stratification_coord,
                 unique_split_values=Defaults.unique_split_values, random_state=Defaults.random_state):
        super().__init__()
        if train_size is None and test_size is None:
            train_size = self.Defaults.train_size
        self._stratified_split = StratifiedShuffleSplit(
            n_splits=splits, train_size=train_size, test_size=test_size, random_state=random_state)
        self._shuffle_split = ShuffleSplit(
            n_splits=splits, train_size=train_size, test_size=test_size, random_state=random_state)
        self._split_coord = split_coord
        self._stratification_coord = stratification_coord
        self._unique_split_values = unique_split_values

        self._logger = logging.getLogger(fullname(self)) 
Example 4
Project: BioCompoundML   Author: sandialabs   File: cross_validate.py    BSD 3-Clause "New" or "Revised" License
def cross_validate(self, n_iter=10):
        '''Actually run the cross validation'''
        cv = StratifiedShuffleSplit(n_splits=n_iter, test_size=0.5)
        (acc, prec, recall, roc) = fit_and_score_CV(self.clf, self.train,
                                                    self.y, cv, self.weights)
        print_line = "For " + str(n_iter) + " resamples at 50%"
        verbose_print(self.verbose, print_line)
        print_line = ("\tAccuracy: %0.4f +/- %0.4f" % (np.mean(acc),
                                                       np.std(acc) * 2))
        verbose_print(self.verbose, print_line)
        print_line = ("\tPrecision: %0.4f +/- %0.4f" % (np.mean(prec),
                                                        np.std(prec) * 2))
        verbose_print(self.verbose, print_line)
        print_line = ("\tRecall: %0.4f +/- %0.4f" % (np.mean(recall),
                                                     np.std(recall) * 2))
        verbose_print(self.verbose, print_line)
        print_line = ("\tReceiver Operator, AUC: %0.4f +/- %0.4f" %
                      (np.mean(roc),
                       np.std(roc) * 2))
        verbose_print(self.verbose, print_line)
        self.acc = np.mean(acc)
        self.prec = np.mean(prec)
        self.recall = np.mean(recall)
        self.roc = np.mean(roc) 
Example 5
Project: hominid   Author: blekhmanlab   File: stability_selection.py    MIT License
def select_features(aligned_snp_df, aligned_taxa_df, lo_alpha_coef):
    X = aligned_taxa_df.values
    y = aligned_snp_df.values.flatten()
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', UserWarning)
        warnings.simplefilter('ignore', ConvergenceWarning)
        #lars_cv = LassoLarsCV(cv=StratifiedShuffleSplit(y, n_iter=100, test_size=0.2)).fit(X, y)
        lars_cv = LassoLarsCV(cv=StratifiedShuffleSplit(n_splits=100, test_size=0.2)).fit(X, y)

    print('lars_cv.alphas_: {}'.format(lars_cv.alphas_))
    alphas = np.linspace(lars_cv.alphas_[0], lo_alpha_coef * lars_cv.alphas_[0], 10)
    print('alphas: {}'.format(alphas))
    clf = RandomizedLasso(
        alpha=alphas,
        sample_fraction=0.8,
        n_resampling=1000
        #random_state=13
    ).fit(X, y)

    feature_scores_df = pd.DataFrame(clf.scores_, index=aligned_taxa_df.columns)

    return feature_scores_df, lars_cv.alphas_ 
Example 6
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_split.py    Apache License 2.0
def test_stratified_shuffle_split_overlap_train_test_bug():
    # See https://github.com/scikit-learn/scikit-learn/issues/6121 for
    # the original bug report
    y = [0, 1, 2, 3] * 3 + [4, 5] * 5
    X = np.ones_like(y)

    sss = StratifiedShuffleSplit(n_splits=1,
                                 test_size=0.5, random_state=0)

    train, test = next(sss.split(X=X, y=y))

    # no overlap
    assert_array_equal(np.intersect1d(train, test), [])

    # complete partition
    assert_array_equal(np.union1d(train, test), np.arange(len(y))) 
Example 7
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_search.py    Apache License 2.0
def test_grid_search_groups():
    # Check if ValueError (when groups is None) propagates to GridSearchCV
    # And also check if groups is correctly passed to the cv object
    rng = np.random.RandomState(0)

    X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
    groups = rng.randint(0, 3, 15)

    clf = LinearSVC(random_state=0)
    grid = {'C': [1]}

    group_cvs = [LeaveOneGroupOut(), LeavePGroupsOut(2), GroupKFold(),
                 GroupShuffleSplit()]
    for cv in group_cvs:
        gs = GridSearchCV(clf, grid, cv=cv)
        assert_raise_message(ValueError,
                             "The 'groups' parameter should not be None.",
                             gs.fit, X, y)
        gs.fit(X, y, groups=groups)

    non_group_cvs = [StratifiedKFold(), StratifiedShuffleSplit()]
    for cv in non_group_cvs:
        gs = GridSearchCV(clf, grid, cv=cv)
        # Should not raise an error
        gs.fit(X, y) 
Example 8
Project: pydl   Author: rafaeltg   File: methods.py    MIT License
def get_cv_method(method, **kwargs):

    if method == 'kfold':
        return KFold(**kwargs)
    elif method == 'skfold':
        return StratifiedKFold(**kwargs)
    elif method == 'loo':
        return LeaveOneOut()
    elif method == 'shuffle_split':
        return ShuffleSplit(**kwargs)
    elif method == 'split':
        return TrainTestSplit(**kwargs)
    elif method == 's_shuffle_split':
        return StratifiedShuffleSplit(**kwargs)
    elif method == 'time_series':
        return TimeSeriesSplit(**kwargs)
    else:
        raise AttributeError('Invalid CV method - %s!' % method) 
Example 9
Project: spae   Author: arranger1044   File: enc_dec_classify_repr.py    GNU General Public License v3.0
def unlabel_instances(X, y, n_labelled=100, unlabel=-1, rand_gen=None, max_labelled_prop=0.9):

    if rand_gen is None:
        rand_gen = numpy.random.RandomState(1337)

    n_instances = X.shape[0]
    assert y.shape[0] == n_instances, "Non-matching number of instances {} {}".format(y.shape[0], n_instances)

    processed_y = numpy.zeros(y.shape, dtype=numpy.int32)
    processed_y[:] = unlabel

    labelled_prop = min(max_labelled_prop, n_labelled / n_instances)
    if labelled_prop > 0.0:
        sss = StratifiedShuffleSplit(n_splits=1, test_size=labelled_prop, random_state=rand_gen)
        unlabelled_ids, labelled_ids = list(sss.split(X, y))[0]

        processed_y[labelled_ids] = y[labelled_ids]

    logging.info('Number of labelled instances {}'.format((processed_y != unlabel).sum()))

    return processed_y 
Example 10
Project: dac   Author: KBNLresearch   File: models.py    GNU General Public License v3.0
def validate(self):
        '''
        Ten-fold cross-validation with stratified sampling.
        '''
        print('Validating new model: {}()'.format(self.__class__.__name__))

        accuracy_scores = []
        precision_scores = []
        recall_scores = []
        f1_scores = []

        sss = StratifiedShuffleSplit(n_splits=10)
        for train_index, test_index in sss.split(self.data, self.labels):
            x_train, x_test = self.data[train_index], self.data[test_index]
            y_train, y_test = self.labels[train_index], self.labels[test_index]
            self.model.fit(x_train, y_train)

            y_pred = self.model.predict(x_test)
            accuracy_scores.append(accuracy_score(y_test, y_pred))
            precision_scores.append(precision_score(y_test, y_pred))
            recall_scores.append(recall_score(y_test, y_pred))
            f1_scores.append(f1_score(y_test, y_pred))

        print('Accuracy: {}'.format(np.mean(accuracy_scores)))
        print('Precision: {}'.format(np.mean(precision_scores)))
        print('Recall: {}'.format(np.mean(recall_scores)))
        print('F1-measure: {}'.format(np.mean(f1_scores))) 
Example 11
Project: dac   Author: KBNLresearch   File: models.py    GNU General Public License v3.0
def validate(self):
        '''
        Ten-fold cross-validation with stratified sampling.
        '''
        print('Validating new model: {}()'.format(self.__class__.__name__))

        accuracy_scores = []
        precision_scores = []
        recall_scores = []
        f1_scores = []

        sss = StratifiedShuffleSplit(n_splits=10)
        for train_index, test_index in sss.split(self.data, self.labels):
            x_train, x_test = self.data[train_index], self.data[test_index]
            y_train, y_test = self.labels[train_index], self.labels[test_index]

            model = self.create_model()
            model.fit(x_train, y_train, epochs=100, batch_size=128,
                      class_weight=self.class_weight)
            y_pred = model.predict_classes(x_test, batch_size=128)

            accuracy_scores.append(accuracy_score(y_test, y_pred))
            precision_scores.append(precision_score(y_test, y_pred))
            recall_scores.append(recall_score(y_test, y_pred))
            f1_scores.append(f1_score(y_test, y_pred))

        print('')
        print('Accuracy: {}'.format(np.mean(accuracy_scores)))
        print('Precision: {}'.format(np.mean(precision_scores)))
        print('Recall: {}'.format(np.mean(recall_scores)))
        print('F1-measure: {}'.format(np.mean(f1_scores))) 
Example 12
Project: dac   Author: KBNLresearch   File: models.py    GNU General Public License v3.0
def validate(self):
        '''
        Ten-fold cross-validation with stratified sampling.
        '''
        print('Validating new model: {}()'.format(self.__class__.__name__))

        accuracy_scores = []
        precision_scores = []
        recall_scores = []
        f1_scores = []

        sss = StratifiedShuffleSplit(n_splits=10)

        for train_index, test_index in sss.split(self.data[0], self.labels):
            x_train_0, x_test_0 = (self.data[0][train_index],
                                   self.data[0][test_index])
            x_train_1, x_test_1 = (self.data[1][train_index],
                                   self.data[1][test_index])
            x_train_2, x_test_2 = (self.data[2][train_index],
                                   self.data[2][test_index])

            y_train, y_test = self.labels[train_index], self.labels[test_index]

            model = self.create_model()
            model.fit([x_train_0, x_train_1, x_train_2], y_train, epochs=10,
                      batch_size=128, class_weight=self.class_weight)

            y_pred = model.predict([x_test_0, x_test_1, x_test_2],
                                   batch_size=128)
            y_pred = [1 if y[0] > 0.5 else 0 for y in y_pred]

            accuracy_scores.append(accuracy_score(y_test, y_pred))
            precision_scores.append(precision_score(y_test, y_pred))
            recall_scores.append(recall_score(y_test, y_pred))
            f1_scores.append(f1_score(y_test, y_pred))

        print('')
        print('Accuracy: {}'.format(np.mean(accuracy_scores)))
        print('Precision: {}'.format(np.mean(precision_scores)))
        print('Recall: {}'.format(np.mean(recall_scores)))
        print('F1-measure: {}'.format(np.mean(f1_scores))) 
Example 13
Project: tensorflow-nlp   Author: alsora   File: train_text_classification.py    MIT License
def preprocess():
    # Data Preparation
    # ==================================================

    # Load data
    print("Loading data...")
    files_list = FLAGS.data.split(",")
    x_text, y_text = load_utils.load_data_and_labels(files_list)

    word_dict, reversed_dict = vocab_utils.build_dict_words(x_text, "text_classification", FLAGS.model_dir)
    labels_dict, _ = vocab_utils.build_dict_labels(y_text, FLAGS.model_dir)

    x = vocab_utils.transform_text(x_text, word_dict)
    y = vocab_utils.transform_labels(y_text, labels_dict)

    x = np.array(x)
    y = np.array(y)

    # Randomly shuffle data
    sss = StratifiedShuffleSplit(n_splits=1, test_size=FLAGS.dev_sample_percentage, random_state=None)
    for train_index, valid_index in sss.split(x, y):
        x_train, x_valid =  x[train_index], x[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]

    del x, y

    print("Vocabulary Size: {:d}".format(len(word_dict)))
    print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_valid)))
    
    return x_train, y_train, word_dict, reversed_dict, x_valid, y_valid 
Example 14
Project: aws-ml-experimenter   Author: rikturr   File: helpers.py    MIT License
def monte_carlo(pipeline, x, y, n_runs, random_state, df=True):
    sss = StratifiedShuffleSplit(n_splits=n_runs, test_size=0.3, random_state=random_state)
    out = []
    for train_index, test_index in sss.split(x, y):
        if df:
            x_train, y_train = x.iloc[train_index], y.iloc[train_index]
            x_test, y_test = x.iloc[test_index], y.iloc[test_index]
        else:
            x_train, y_train = x[train_index, :], y[train_index]
            x_test, y_test = x[test_index, :], y[test_index]
        pipeline.fit(x_train, y_train)
        predicted = pipeline.predict_proba(x_test)
        predicted = predicted[:, 1] if len(predicted.shape) > 1 else predicted
        out.append(pd.DataFrame({'predicted': predicted, 'actual': y_test, 'run': [len(out)] * x_test.shape[0]}))
    return pd.concat(out) 
Example 15
Project: iglovikov_helper_functions   Author: ternaus   File: generate_splits.py    MIT License
def stratified_sampling(Y, split, random_state=2016):
    train_inx = []
    valid_inx = []

    n_classes = Y.shape[1]
    inx = np.arange(Y.shape[0])
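    # Y is a binary indicator matrix with one column per class; stratify on each column in turn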

    for i in range(0, n_classes):
        sss = StratifiedShuffleSplit(n_splits=1, test_size=split, random_state=random_state + i)
        b_train_inx, b_valid_inx = next(sss.split(inx, Y[:, i]))
        # to ensure there is no repetition within each split and between the splits
        train_inx = train_inx + list(set(list(b_train_inx)) - set(train_inx) - set(valid_inx))
        valid_inx = valid_inx + list(set(list(b_valid_inx)) - set(train_inx) - set(valid_inx))

    return np.array(train_inx), np.array(valid_inx) 
Example 16
Project: skorch   Author: skorch-dev   File: dataset.py    BSD 3-Clause "New" or "Revised" License
def _is_stratified(self, cv):
        return isinstance(cv, (StratifiedKFold, StratifiedShuffleSplit)) 
Example 17
Project: skorch   Author: skorch-dev   File: dataset.py    BSD 3-Clause "New" or "Revised" License
def _check_cv_float(self):
        cv_cls = StratifiedShuffleSplit if self.stratified else ShuffleSplit
        return cv_cls(test_size=self.cv, random_state=self.random_state) 
Example 18
Project: AdaptApprox   Author: fnan   File: adaptive_sparse_helpers.py    MIT License
def get_full_rbf_svm_clf(dataset, X_train, y_train, retrain=False, C_range=None, gamma_range=None):
    if retrain:
#        if C_range is None:
#            C_range = np.logspace(-2, 10, 13)
#        if gamma_range is None:
#            gamma_range = np.logspace(-9, 3, 13)
        if C_range is None:
            c_best = 1
            gamma_best = 1
        else:
            param_grid = dict(gamma=gamma_range, C=C_range)
            cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
            grid = GridSearchCV(SVC(verbose=True), param_grid=param_grid, cv=cv, n_jobs=40,verbose=10)
            grid.fit(X_train, y_train)
            
            print("The best parameters are %s with a score of %0.2f"
                  % (grid.best_params_, grid.best_score_))
            c_best = grid.best_params_['C']
            gamma_best = grid.best_params_['gamma']

        clf = SVC(C=c_best, gamma=gamma_best, probability=True)
        clf.fit(X_train, y_train)    
        joblib.dump(clf, '%s_rbf_svm.pkl'%dataset)
        
        return clf
    else:
        clf = joblib.load('%s_rbf_svm.pkl'%dataset)
        return clf 
Example 19
Project: Baidu_XJTU_2018_company_logo_recognition   Author: JingangLang   File: densenet.py    MIT License
def train_model_on_k_fold(get_model, train_model, X, y, n_folds=5):
    # sss = StratifiedShuffleSplit(n_splits=n_folds,test_size=0.16,random_state=42)
    # sss = StratifiedKFold(n_splits=n_folds,random_state=42,shuffle=True)
    n_models = 0
    models = []
    for k in range(n_folds):
        print("Building model...")
        models.append(get_model())
        print("Trainning model...")
        train_model(models[len(models) - 1], X, y, model_sn=n_models)

        n_models += 1
    # for k,(train_index, test_index) in enumerate(sss.split(X, y)):
    #     print(k)
    #     print("Using {} for training and {} for validation".format(len(train_index), len(test_index)))
    #     x_train, x_valid = X[train_index], X[test_index]
    #     y_train, y_valid = y[train_index], y[test_index]
    #
    #     print("Building model...")
    #     models.append(get_model())
    #     print("Trainning model...")
    #     train_model(models[len(models) - 1], x_train, y_train, x_valid, y_valid, model_sn=n_models)
    #
    #     n_models += 1

    return models 
Example 20
Project: autoreject   Author: autoreject   File: autoreject.py    BSD 3-Clause "New" or "Revised" License
def _compute_thresholds(epochs, method='bayesian_optimization',
                        random_state=None, picks=None, augment=True,
                        dots=None, verbose='progressbar', n_jobs=1):
    if method not in ['bayesian_optimization', 'random_search']:
        raise ValueError('`method` param not recognized')
    picks = _handle_picks(info=epochs.info, picks=picks)
    _check_data(epochs, picks, verbose=verbose,
                ch_constraint='data_channels')
    picks_by_type = _get_picks_by_type(picks=picks, info=epochs.info)
    picks_by_type = None if len(picks_by_type) == 1 else picks_by_type  # XXX
    if picks_by_type is not None:
        threshes = dict()
        for ch_type, this_picks in picks_by_type:
            threshes.update(_compute_thresholds(
                epochs=epochs, method=method, random_state=random_state,
                picks=this_picks, augment=augment, dots=dots,
                verbose=verbose, n_jobs=n_jobs))
    else:
        n_epochs = len(epochs)
        data, y = epochs.get_data(), np.ones((n_epochs, ))
        if augment:
            epochs_interp = _clean_by_interp(epochs, picks=picks,
                                             dots=dots, verbose=verbose)
            # non-data channels will be duplicate
            data = np.concatenate((epochs.get_data(),
                                   epochs_interp.get_data()), axis=0)
            y = np.r_[np.zeros((n_epochs, )), np.ones((n_epochs, ))]
        cv = StratifiedShuffleSplit(n_splits=10, test_size=0.2,
                                    random_state=random_state)

        ch_names = epochs.ch_names

        my_thresh = delayed(_compute_thresh)
        parallel = Parallel(n_jobs=n_jobs, verbose=0)
        desc = 'Computing thresholds ...'
        threshes = parallel(
            my_thresh(data[:, pick], cv=cv, method=method, y=y,
                      random_state=random_state)
            for pick in _pbar(picks, desc=desc, verbose=verbose))
        threshes = {ch_names[p]: thresh for p, thresh in zip(picks, threshes)}
    return threshes 
Example 21
Project: torchetl   Author: amajidsinar   File: etl.py    MIT License
def _stratify_sampling(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
		"""Sklearn stratified sampling uses a whole array so we must build it first

		Parameters
		----------
		None

		Returns
		-------
		Tuple of train(X, y), validation(X, y), and test(X, y)	
		"""
		x, y = self._create_dataset_array()
		sss = StratifiedShuffleSplit(n_splits=1, train_size=self.training_size, test_size=self.test_size, random_state=self.random_state)

		for train_index, validation_test_index in sss.split(x, y):
			x_train, x_validation_test = x[train_index], x[validation_test_index]
			y_train, y_validation_test = y[train_index], y[validation_test_index]

		sss = StratifiedShuffleSplit(n_splits=1, train_size=0.5, test_size=0.5,random_state=self.random_state)
		for validation_index, test_index in sss.split(x_validation_test, y_validation_test):
			x_validation, x_test = x_validation_test[validation_index], x_validation_test[test_index]
			y_validation, y_test = y_validation_test[validation_index], y_validation_test[test_index]

		train = np.c_[x_train, y_train]
		validation = np.c_[x_validation, y_validation]
		test = np.c_[x_test, y_test]

		if self.verbose:
			print("Finished splitting dataset into train, validation, and test")
		return train, validation, test 
Example 22
Project: torchetl   Author: amajidsinar   File: etl.py    MIT License
def stratify(x, y, train_size, random_state=69):
        sss = StratifiedShuffleSplit(n_splits=1, train_size=train_size, test_size=1-train_size, random_state=random_state)
        Partition = namedtuple('Partition', ['x_train', 'y_train', 'x_test', 'y_test'])
        for train_index, test_index in sss.split(x, y):
            partition = Partition
            x_train, x_test = x[train_index], x[test_index]
            y_train, y_test = y[train_index], y[test_index]

        partition = Partition(x_train, y_train, x_test, y_test)
        return partition 
Example 23
Project: torchetl   Author: amajidsinar   File: etl.py    MIT License
def stratify(x, y, train_size, random_state=69):
        sss = StratifiedShuffleSplit(n_splits=1, train_size=train_size, test_size=1-train_size, random_state=random_state)
        Partition = namedtuple('Partition', ['x_train', 'y_train', 'x_test', 'y_test'])
        for train_index, test_index in sss.split(x, y):
            partition = Partition
            x_train, x_test = x[train_index], x[test_index]
            y_train, y_test = y[train_index], y[test_index]

        partition = Partition(x_train, y_train, x_test, y_test)
        return partition 
Example 24
Project: torchetl   Author: amajidsinar   File: etl-checkpoint.py    MIT License
def _stratify_sampling(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
		"""Sklearn stratified sampling uses a whole array so we must build it first

		Parameters
		----------
		None

		Returns
		-------
		Tuple of train(X, y), validation(X, y), and test(X, y)	
		"""
		x, y = self._create_dataset_array()
		sss = StratifiedShuffleSplit(n_splits=1, train_size=self.training_size, test_size=self.test_size, random_state=self.random_state)

		for train_index, validation_test_index in sss.split(x, y):
			x_train, x_validation_test = x[train_index], x[validation_test_index]
			y_train, y_validation_test = y[train_index], y[validation_test_index]

		sss = StratifiedShuffleSplit(n_splits=1, train_size=0.5, test_size=0.5,random_state=self.random_state)
		for validation_index, test_index in sss.split(x_validation_test, y_validation_test):
			x_validation, x_test = x_validation_test[validation_index], x_validation_test[test_index]
			y_validation, y_test = y_validation_test[validation_index], y_validation_test[test_index]

		train = np.c_[x_train, y_train]
		validation = np.c_[x_validation, y_validation]
		test = np.c_[x_test, y_test]

		if self.verbose:
			print("Finished splitting dataset into train, validation, and test")
		return train, validation, test 
Example 25
Project: torchetl   Author: amajidsinar   File: mark_and_recall-checkpoint.py    MIT License
def _stratify_sampling(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
		"""Sklearn stratified sampling uses a whole array so we must build it first

		Parameters
		----------
		None

		Returns
		-------
		Tuple of train(X, y), validation(X, y), and test(X, y)	
		"""
		x, y = self._create_dataset_array()
		sss = StratifiedShuffleSplit(n_splits=1, train_size=self.training_size, test_size=self.test_size, random_state=self.random_state)

		for train_index, validation_test_index in sss.split(x, y):
			x_train, x_validation_test = x[train_index], x[validation_test_index]
			y_train, y_validation_test = y[train_index], y[validation_test_index]

		sss = StratifiedShuffleSplit(n_splits=1, train_size=0.5, test_size=0.5,random_state=self.random_state)
		for validation_index, test_index in sss.split(x_validation_test, y_validation_test):
			x_validation, x_test = x_validation_test[validation_index], x_validation_test[test_index]
			y_validation, y_test = y_validation_test[validation_index], y_validation_test[test_index]

		train = np.c_[x_train, y_train]
		validation = np.c_[x_validation, y_validation]
		test = np.c_[x_test, y_test]

		if self.verbose:
			print("Finished splitting dataset into train, validation, and test")
		return train, validation, test 
Example 26
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_split.py    Apache License 2.0
def test_stratified_shuffle_split_init():
    X = np.arange(7)
    y = np.asarray([0, 1, 1, 1, 2, 2, 2])
    # Check that error is raised if there is a class with only one sample
    assert_raises(ValueError, next,
                  StratifiedShuffleSplit(3, 0.2).split(X, y))

    # Check that error is raised if the test set size is smaller than n_classes
    assert_raises(ValueError, next, StratifiedShuffleSplit(3, 2).split(X, y))
    # Check that error is raised if the train set size is smaller than
    # n_classes
    assert_raises(ValueError, next,
                  StratifiedShuffleSplit(3, 3, 2).split(X, y))

    X = np.arange(9)
    y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2])
    # Check that errors are raised if there is not enough samples
    assert_raises(ValueError, StratifiedShuffleSplit, 3, 0.5, 0.6)
    assert_raises(ValueError, next,
                  StratifiedShuffleSplit(3, 8, 0.6).split(X, y))
    assert_raises(ValueError, next,
                  StratifiedShuffleSplit(3, 0.6, 8).split(X, y))

    # Train size or test size too small
    assert_raises(ValueError, next,
                  StratifiedShuffleSplit(train_size=2).split(X, y))
    assert_raises(ValueError, next,
                  StratifiedShuffleSplit(test_size=2).split(X, y)) 
Example 27
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_split.py    Apache License 2.0
def test_stratified_shuffle_split_respects_test_size():
    y = np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2])
    test_size = 5
    train_size = 10
    sss = StratifiedShuffleSplit(6, test_size=test_size, train_size=train_size,
                                 random_state=0).split(np.ones(len(y)), y)
    for train, test in sss:
        assert_equal(len(train), train_size)
        assert_equal(len(test), test_size) 
Example 28
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_split.py    Apache License 2.0
def test_stratified_shuffle_split_iter():
    ys = [np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]),
          np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]),
          np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2] * 2),
          np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]),
          np.array([-1] * 800 + [1] * 50),
          np.concatenate([[i] * (100 + i) for i in range(11)]),
          [1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3],
          ['1', '1', '1', '1', '2', '2', '2', '3', '3', '3', '3', '3'],
          ]

    for y in ys:
        sss = StratifiedShuffleSplit(6, test_size=0.33,
                                     random_state=0).split(np.ones(len(y)), y)
        y = np.asanyarray(y)  # To make it indexable for y[train]
        # this is how test-size is computed internally
        # in _validate_shuffle_split
        test_size = np.ceil(0.33 * len(y))
        train_size = len(y) - test_size
        for train, test in sss:
            assert_array_equal(np.unique(y[train]), np.unique(y[test]))
            # Checks if folds keep classes proportions
            p_train = (np.bincount(np.unique(y[train],
                                   return_inverse=True)[1]) /
                       float(len(y[train])))
            p_test = (np.bincount(np.unique(y[test],
                                  return_inverse=True)[1]) /
                      float(len(y[test])))
            assert_array_almost_equal(p_train, p_test, 1)
            assert_equal(len(train) + len(test), y.size)
            assert_equal(len(train), train_size)
            assert_equal(len(test), test_size)
            assert_array_equal(np.lib.arraysetops.intersect1d(train, test), []) 
Example 29
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_split.py    Apache License 2.0
def test_stratifiedshufflesplit_list_input():
    # Check that when y is a list / list of string labels, it works.
    sss = StratifiedShuffleSplit(test_size=2, random_state=42)
    X = np.ones(7)
    y1 = ['1'] * 4 + ['0'] * 3
    y2 = np.hstack((np.ones(4), np.zeros(3)))
    y3 = y2.tolist()

    np.testing.assert_equal(list(sss.split(X, y1)),
                            list(sss.split(X, y2)))
    np.testing.assert_equal(list(sss.split(X, y3)),
                            list(sss.split(X, y2))) 
Example 30
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_split.py    Apache License 2.0
def test_nested_cv():
    # Test if nested cross validation works with different combinations of cv
    rng = np.random.RandomState(0)

    X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
    groups = rng.randint(0, 5, 15)

    cvs = [LeaveOneGroupOut(), LeaveOneOut(), GroupKFold(), StratifiedKFold(),
           StratifiedShuffleSplit(n_splits=3, random_state=0)]

    for inner_cv, outer_cv in combinations_with_replacement(cvs, 2):
        gs = GridSearchCV(Ridge(), param_grid={'alpha': [1, .1]},
                          cv=inner_cv)
        cross_val_score(gs, X=X, y=y, groups=groups, cv=outer_cv,
                        fit_params={'groups': groups}) 
Example 31
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_split.py    Apache License 2.0
def test_train_test_default_warning():
    assert_warns(FutureWarning, ShuffleSplit, train_size=0.75)
    assert_warns(FutureWarning, GroupShuffleSplit, train_size=0.75)
    assert_warns(FutureWarning, StratifiedShuffleSplit, train_size=0.75)
    assert_warns(FutureWarning, train_test_split, range(3),
                 train_size=0.75) 
Example 32
Project: AutoTableGBDT   Author: SpongebBob   File: model.py    MIT License
def train_test_split(self, X, y, test_size, random_state=2018):
        sss = list(StratifiedShuffleSplit(
            n_splits=1, test_size=test_size, random_state=random_state).split(X, y))
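        # with n_splits=1, sss[0] holds the single (train indices, test indices) pair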
        print(sss[0])
        X_train = np.take(X, sss[0][0], axis=0)
        X_test = np.take(X, sss[0][1], axis=0)
        y_train = np.take(y, sss[0][0], axis=0)
        y_test = np.take(y, sss[0][1], axis=0)
        return [X_train, X_test, y_train, y_test] 
Example 33
Project: ai-starthon-2019   Author: ildoonet   File: data_local_loader.py    MIT License
def data_loader_with_split(root, cv_ratio=0.2, cv=0, batch_size=256, val_label_file='./val_label'):
    dataset_tr = CustomDataset(root, transform=get_transform(random_crop=True))
    dataset_vl = CustomDataset(root, transform=get_transform(random_crop=False))

    sss = StratifiedShuffleSplit(n_splits=5, test_size=cv_ratio, random_state=0)
    sss = sss.split(list(range(len(dataset_tr))), dataset_tr.targets)
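    # sss is now a generator of (train_idx, valid_idx) pairs; the loop below advances it to the cv-th split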
    for _ in range(cv + 1):
        train_idx, valid_idx = next(sss)
    print('datasize=', len(train_idx), len(valid_idx))

    train_sampler = StratifiedSampler([x for i, x in enumerate(dataset_tr.targets) if i in train_idx])
    train_set = Subset(dataset_tr, train_idx)
    valid_set = Subset(dataset_vl, valid_idx)

    dataloader = FixedSizeDataLoader(train_set, steps=None, batch_size=batch_size, num_workers=6, drop_last=True,
                                     sampler=train_sampler)
    dataloader = PrefetchDataLoader(dataloader, device=torch.device('cuda', 0))

    # tr_loader = data.DataLoader(dataset=train_set, batch_size=batch_size, num_workers=4, pin_memory=True,
    #                             sampler=train_sampler, drop_last=True)
    tr_loader = dataloader
    val_loader = data.DataLoader(dataset=valid_set, batch_size=batch_size, num_workers=6, pin_memory=True,
                                 shuffle=False, drop_last=False)

    print('generate val labels+')
    gt_labels = {}
    for idx in valid_idx:
        path, target = dataset_tr.samples[idx]
        image_id = path.split('/')[-1]
        gt_labels[image_id] = target
    gt_labels_string = [' '.join([str(s) for s in l]) for l in tqdm(list(gt_labels.items()))]
    with open(val_label_file, 'w') as file_writer:
        file_writer.write("\n".join(gt_labels_string))
    print('generate val labels-')

    return tr_loader, val_loader, val_label_file


# ----------------------------------------------------- for ret -------------------------------------------------------- 
Example 34
Project: torchskeleton   Author: wbaek   File: utils.py    Apache License 2.0
def split(dataset, labels, cv_ratio, cv_index=0, seed=0xC0FFEE):
    sss = StratifiedShuffleSplit(n_splits=cv_index + 1, test_size=cv_ratio, random_state=seed)
    sss = sss.split(list(range(len(labels))), labels)
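    # the loop below advances the split generator to the cv_index-th (train_idx, valid_idx) pair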

    for _ in range(cv_index + 1):
        train_idx, valid_idx = next(sss)

    train_dataset = torch.utils.data.Subset(dataset, train_idx)
    valid_dataset = torch.utils.data.Subset(dataset, valid_idx)
    return train_dataset, valid_dataset 
Example 35
Project: genre-classifier   Author: KBNLresearch   File: train.py    GNU General Public License v3.0
def validate():
    # Load an existing training set
    X_train, y_train = dataset.load_training('data/training.txt')

    # Ten-fold cross-validation with stratified sampling
    cv = StratifiedShuffleSplit(n_splits=10)
    scores = cross_val_score(clf, X_train, y_train, cv=cv)
    print("Accuracy: %0.4f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) 
Example 36
Project: gumpy   Author: gumpy-bci   File: split.py    MIT License
def stratified_shuffle_Split(features, labels, n_splits, test_size, random_state):

    """Stratified ShuffleSplit cross-validator
    """
    cv = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=random_state)
    for train_index, test_index in cv.split(features, labels):
        X_train = features[train_index]
        X_test = features[test_index]
        Y_train = labels[train_index]
        Y_test = labels[test_index]
    return X_train, X_test, Y_train, Y_test


#Random permutation cross-validator 
Example 37
Project: Forecast-Loan-Default   Author: Perry961002   File: train_use_dtc.py    MIT License
def main():
    # 1. Load the data (training and test) and preprocess it
    # Replace the 96 and 98 flags in NumberTime30-59, 60-89, 90 with NaN
    # Replace 0 in Age with NaN
    colnames = ['ID', 'label', 'RUUnsecuredL', 'age', 'NOTime30-59', 
                'DebtRatio', 'Income', 'NOCredit', 'NOTimes90', 
                'NORealEstate', 'NOTime60-89', 'NODependents']
    col_nas = ['', 'NA', 'NA', 0, [98, 96], 'NA', 'NA', 'NA', [98, 96], 'NA', [98, 96], 'NA']
    col_na_values = creatDictKV(colnames, col_nas)
    dftrain = pd.read_csv("./data/cs-training.csv", names=colnames, na_values=col_na_values, skiprows=[0])
    dftrain.pop("NOCredit")
    train_id = [int(x) for x in dftrain.pop("ID")]
    y_train = np.asarray([int(x)for x in dftrain.pop("label")])
    x_train = dftrain.as_matrix()

    dftest = pd.read_csv("./data/cs-test.csv", names=colnames, na_values=col_na_values, skiprows=[0])
    dftest.pop("NOCredit")
    test_id = [int(x) for x in dftest.pop("ID")]
    y_test = np.asarray(dftest.pop("label"))
    x_test = dftest.as_matrix()
    # Use StratifiedShuffleSplit to split the training data into training_new and test_new (used to validate the model)
    sss = StratifiedShuffleSplit(n_splits=1,test_size=0.33333,random_state=0)
    for train_index, test_index in sss.split(x_train, y_train):
        x_train_new, x_test_new = x_train[train_index], x_train[test_index]
        y_train_new, y_test_new = y_train[train_index], y_train[test_index]

    y_train = y_train_new
    x_train = x_train_new
    # Use Imputer to replace NaN with the mean
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp.fit(x_train)
    x_train = imp.transform(x_train)
    x_test_new = imp.transform(x_test_new)
    x_test = imp.transform(x_test)
    if not os.path.isfile("dtc_model.m"):
        clf = DecisionTreeClassifier(max_depth=12, min_samples_leaf=200 ,class_weight="balanced")
        clf.fit(x_train, y_train)
        joblib.dump(clf,"dtc_model.m")
        predicted_probs_train =clf.predict_proba(x_train)
        predicted_probs_train = [x[1] for  x in predicted_probs_train]
        computeAUC(y_train, predicted_probs_train)
        # Export a visualization of the decision tree; requires the Graphviz software
        dot_data = export_graphviz(clf, filled = True,
                                rounded = True,
                                class_names = ['0', '1'],
                                feature_names = ['RUUnsecuredL', 'age', 'NOTime30-59', 'DebtRatio', 'Income', 'NOTimes90', 
                                                    'NORealEstate', 'NOTime60-89', 'NODependents'],
                                out_file = None)
        graph = graph_from_dot_data(dot_data)
        graph.write_png('tree.png')
    else:
        clf = joblib.load("dtc_model.m")
        predicted_probs_test_new = clf.predict_proba(x_test_new)
        predicted_probs_test_new = [x[1] for x in predicted_probs_test_new]
        computeAUC(y_test_new, predicted_probs_test_new) 
Example 38
Project: nips2017   Author: c-hofer   File: experiments.py    MIT License
def train_test_from_dataset(dataset,
                            test_size=0.2,
                            batch_size=64,
                            wanted_views=None):

    sample_labels = list(dataset.sample_labels)
    label_encoder = LabelEncoder().fit(sample_labels)
    sample_labels = label_encoder.transform(sample_labels)

    label_map = lambda l: int(label_encoder.transform([l])[0])
    collate_fn = PersistenceDiagramProviderCollate(dataset, label_map=label_map, wanted_views=wanted_views)

    sp = StratifiedShuffleSplit(n_splits=1, test_size=test_size)
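    # the first argument only needs the right length; the stratification is driven by sample_labels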
    train_i, test_i = list(sp.split([0]*len(sample_labels), sample_labels))[0]

    data_train = DataLoader(dataset,
                            batch_size=batch_size,
                            collate_fn=collate_fn,
                            shuffle=False,
                            sampler=SubsetRandomSampler(train_i.tolist()))

    data_test = DataLoader(dataset,
                           batch_size=batch_size,
                           collate_fn=collate_fn,
                           shuffle=False,
                           sampler=SubsetRandomSampler(test_i.tolist()))

    return data_train, data_test 
Example 39
Project: Bella   Author: apmoore1   File: base.py    MIT License
def train_val_split(train: 'TargetCollection',
                        split_size: float = 0.2, seed: Union[None, int] = 42
                        ) -> Tuple[Tuple[np.ndarray, np.ndarray],
                                   Tuple[np.ndarray, np.ndarray]]:
        '''
        Splits the training dataset into a train and validation set in a
        stratified split.

        :param train: The training dataset that needs to be split into
        :param split_size: Fraction of the dataset to assign to the
                           validation set.
        :param seed: Seed value to give to the stratified splitter. If
                     None then it uses the random state of numpy.
        :return: Two tuples of length two where each tuple is the train
                 and validation splits respectively, and each tuple contains
                 the data (X) and class labels (y) respectively. Returns
                 ((X_train, y_train), (X_val, y_val))
        '''
        splitter = StratifiedShuffleSplit(n_splits=1, test_size=split_size,
                                          random_state=seed)
        data = np.asarray(train.data_dict())
        sentiment = np.asarray(train.sentiment_data())
        for train_indexs, test_indexs in splitter.split(data, sentiment):
            train_data = data[train_indexs]
            test_data = data[test_indexs]

        train = TargetCollection(ModelMixin._convert_to_targets(train_data))
        val = TargetCollection(ModelMixin._convert_to_targets(test_data))

        X_train = np.array(train.data_dict())
        y_train = np.array(train.sentiment_data())
        X_val = np.array(val.data_dict())
        y_val = np.array(val.sentiment_data())
        return (X_train, y_train), (X_val, y_val) 
Example 40
Project: graph2gauss   Author: abojchevski   File: utils.py    MIT License
def score_node_classification(features, z, p_labeled=0.1, n_repeat=10, norm=False):
    """
    Train a classifier using the node embeddings as features and reports the performance.

    Parameters
    ----------
    features : array-like, shape [N, L]
        The features used to train the classifier, i.e. the node embeddings
    z : array-like, shape [N]
        The ground truth labels
    p_labeled : float
        Percentage of nodes to use for training the classifier
    n_repeat : int
        Number of times to repeat the experiment
    norm

    Returns
    -------
    f1_micro : float
        F_1 score (micro) averaged over n_repeat trials.
    f1_macro : float
        F_1 score (macro) averaged over n_repeat trials.
    """
    lrcv = LogisticRegressionCV()

    if norm:
        features = normalize(features)

    trace = []
    for seed in range(n_repeat):
        sss = StratifiedShuffleSplit(n_splits=1, test_size=1 - p_labeled, random_state=seed)
        split_train, split_test = next(sss.split(features, z))

        lrcv.fit(features[split_train], z[split_train])
        predicted = lrcv.predict(features[split_test])

        f1_micro = f1_score(z[split_test], predicted, average='micro')
        f1_macro = f1_score(z[split_test], predicted, average='macro')

        trace.append((f1_micro, f1_macro))

    return np.array(trace).mean(0) 
Example 41
Project: RPGOne   Author: RTHMaK   File: helpers.py    Apache License 2.0
def load_titanic(test_size=.25, feature_skip_tuple=(), random_state=1999):
    f = open(os.path.join('datasets', 'titanic', 'titanic3.csv'))
    # Remove . from home.dest, split on quotes because some fields have commas
    keys = f.readline().strip().replace('.', '').split('","')
    lines = f.readlines()
    f.close()
    string_keys = ['name', 'sex', 'ticket', 'cabin', 'embarked', 'boat',
                   'homedest']
    string_keys = [s for s in string_keys if s not in feature_skip_tuple]
    numeric_keys = ['pclass', 'age', 'sibsp', 'parch', 'fare']
    numeric_keys = [n for n in numeric_keys if n not in feature_skip_tuple]
    train_vectorizer_list = []
    test_vectorizer_list = []

    n_samples = len(lines)
    numeric_data = np.zeros((n_samples, len(numeric_keys)))
    numeric_labels = np.zeros((n_samples,), dtype=int)

    # Doing this twice is horribly inefficient but the file is small...
    for n, l in enumerate(lines):
        line_dict = process_titanic_line(l)
        strings = {k: line_dict[k] for k in string_keys}
        numeric_labels[n] = line_dict["survived"]

    sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=12)
    # This is a weird way to get the indices but it works
    train_idx = None
    test_idx = None
    for train_idx, test_idx in sss.split(numeric_data, numeric_labels):
        pass

    for n, l in enumerate(lines):
        line_dict = process_titanic_line(l)
        strings = {k: line_dict[k] for k in string_keys}
        if n in train_idx:
            train_vectorizer_list.append(strings)
        else:
            test_vectorizer_list.append(strings)
        numeric_data[n] = np.asarray([line_dict[k]
                                      for k in numeric_keys])

    train_numeric = numeric_data[train_idx]
    test_numeric = numeric_data[test_idx]
    train_labels = numeric_labels[train_idx]
    test_labels = numeric_labels[test_idx]

    vec = DictVectorizer()
    # .toarray() due to returning a scipy sparse array
    train_categorical = vec.fit_transform(train_vectorizer_list).toarray()
    test_categorical = vec.transform(test_vectorizer_list).toarray()
    train_data = np.concatenate([train_numeric, train_categorical], axis=1)
    test_data = np.concatenate([test_numeric, test_categorical], axis=1)
    keys = numeric_keys + string_keys
    return keys, train_data, test_data, train_labels, test_labels 
Example 42
Project: prosodeep   Author: gerazov   File: prosodeep_corpus.py    GNU General Public License v3.0
def split_corpus(corpus, test_size, stratify=True, random_state=None):
    """
    Split corpus into train and test. Can also be used to split train into
    train and validation for analysis by synthesis.
    Two ways to do it:
        - a split aware of contour content, stratifying the data so that both
        sets keep the original percentage of each contour type.
        - using files as groups in GroupShuffleSplit
    """
    files = corpus.file.tolist()
    filenames = corpus.file.unique()
    file_ind = np.arange(len(filenames))

    if stratify:
        # loop through files and find their contour content
        all_contours = corpus.contourtype.unique().tolist()
        #    file_contours = {}
        file_contours = []
        for file in filenames:
            mask = corpus.file == file
            contours = sorted(corpus[mask].contourtype.unique().tolist())
            contour_code = 0
            for contour in contours:
                ind = all_contours.index(contour)
                # encode in an integer:
                contour_code += 2**ind
            file_contours.append(contour_code)
        file_contours = np.array(file_contours)

        sss = StratifiedShuffleSplit(test_size=test_size,
                                     random_state=random_state)
        train_ind, test_ind = sss.split(file_ind, file_contours).__next__()
        train_filenames = filenames[train_ind]
        test_filenames = filenames[test_ind]

        mask_train = corpus.file.isin(train_filenames)
        mask_test = corpus.file.isin(test_filenames)
        #%
        corpus_train = corpus[mask_train]
        corpus_test = corpus[mask_test]
        assert len(corpus) == len(corpus_train) + len(corpus_test)
        # TODO: make our own algorithm

    else:  # use files as groups
        groups = [np.where(filenames == file)[0] for file in files]
        gss = GroupShuffleSplit(test_size=test_size,
                                random_state=random_state)
        train_ind, test_ind = gss.split(groups, groups, groups).__next__()

    return train_ind, test_ind 
Example 43
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_split.py    Apache License 2.0
def test_stratified_shuffle_split_even():
    # Test the StratifiedShuffleSplit, indices are drawn with an
    # equal chance
    n_folds = 5
    n_splits = 1000

    def assert_counts_are_ok(idx_counts, p):
        # Here we test that the distribution of the counts
        # per index is close enough to a binomial
        threshold = 0.05 / n_splits
        bf = stats.binom(n_splits, p)
        for count in idx_counts:
            prob = bf.pmf(count)
            assert_true(prob > threshold,
                        "An index is not drawn with chance corresponding "
                        "to even draws")

    for n_samples in (6, 22):
        groups = np.array((n_samples // 2) * [0, 1])
        splits = StratifiedShuffleSplit(n_splits=n_splits,
                                        test_size=1. / n_folds,
                                        random_state=0)

        train_counts = [0] * n_samples
        test_counts = [0] * n_samples
        n_splits_actual = 0
        for train, test in splits.split(X=np.ones(n_samples), y=groups):
            n_splits_actual += 1
            for counter, ids in [(train_counts, train), (test_counts, test)]:
                for id in ids:
                    counter[id] += 1
        assert_equal(n_splits_actual, n_splits)

        n_train, n_test = _validate_shuffle_split(
            n_samples, test_size=1. / n_folds, train_size=1. - (1. / n_folds))

        assert_equal(len(train), n_train)
        assert_equal(len(test), n_test)
        assert_equal(len(set(train).intersection(test)), 0)

        group_counts = np.unique(groups)
        assert_equal(splits.test_size, 1.0 / n_folds)
        assert_equal(n_train + n_test, len(groups))
        assert_equal(len(group_counts), 2)
        ex_test_p = float(n_test) / n_samples
        ex_train_p = float(n_train) / n_samples

        assert_counts_are_ok(train_counts, ex_train_p)
        assert_counts_are_ok(test_counts, ex_test_p) 
Example 44
Project: pypastry   Author: datapastry   File: evaluation_test.py    MIT License
def test_simple_evaluation():
    # Given
    dataset = DataFrame({
        'a': [1, 1, 0, 0],
        'b': [1, 1, 0, 0],
    })

    cross_validation = StratifiedShuffleSplit(n_splits=1, test_size=0.5)
    predictor = DecisionTreeClassifier()
    scorer = make_scorer(accuracy_score)
    experiment = Experiment(dataset, 'b', predictor, cross_validation, scorer)

    git_mock = Mock()
    results_repo_mock = Mock()
    new_results_files = ['results/abc.json']
    results_repo_mock.save_results.return_value = new_results_files
    results_display_mock = Mock()
    runner = ExperimentRunner(git_mock, results_repo_mock, results_display_mock)
    commit_message = "Test commit message"

    # When
    runner.run_experiment(experiment, False, commit_message)

    # Then
    call_args_list = results_repo_mock.save_results.call_args_list
    assert 1 == len(call_args_list)
    run_infos, dataset_info = call_args_list[0][0]

    assert len(run_infos) == 1

    results = run_infos[0]['results']
    assert 1.0 == results['test_score']
    assert ['a', 'b'] == dataset_info['columns']

    # TODO: check the hash. Need to find a way to make this consistent between python versions etc.
    # assert '28ea628a50a47c726a9b0ec437c88fc4742d81fd' == dataset_info['hash']

    git_mock.git.add.assert_called_once_with(update=True)
    git_mock.index.add.assert_called_once_with(new_results_files)
    git_mock.index.commit.assert_called_once_with(commit_message)

    assert 1 == len(results_display_mock.cache_display.call_args_list)
    print(results_display_mock.cache_display.call_args[0])
    assert len(results_display_mock.cache_display.call_args[0]) > 0
    assert 1 == len(results_display_mock.print_cache_file.call_args_list) 
Example 45
Project: node_embedding_attack   Author: abojchevski   File: utils.py    MIT License
def evaluate_embedding_node_classification(embedding_matrix, labels, train_ratio=0.1, norm=True, seed=0, n_repeats=10):
    """Evaluate the node embeddings on the node classification task..

    :param embedding_matrix: np.ndarray, shape [n_nodes, embedding_dim]
        Embedding matrix
    :param labels: np.ndarray, shape [n_nodes]
        The ground truth labels
    :param train_ratio: float
        The fraction of labels to use for training
    :param norm: bool
        Whether to normalize the embeddings
    :param seed: int
        Random seed
    :param n_repeats: int
        Number of times to repeat the experiment
    :return: [float, float], [float, float]
        The mean and standard deviation of the f1_scores
    """
    if norm:
        embedding_matrix = normalize(embedding_matrix)

    results = []
    for it_seed in range(n_repeats):
        sss = StratifiedShuffleSplit(n_splits=1, test_size=1 - train_ratio, random_state=seed + it_seed)
        split_train, split_test = next(sss.split(embedding_matrix, labels))

        features_train = embedding_matrix[split_train]
        features_test = embedding_matrix[split_test]
        labels_train = labels[split_train]
        labels_test = labels[split_test]

        lr = LogisticRegression(solver='lbfgs', max_iter=1000, multi_class='auto')
        lr.fit(features_train, labels_train)

        lr_z_predict = lr.predict(features_test)
        f1_micro = f1_score(labels_test, lr_z_predict, average='micro')
        f1_macro = f1_score(labels_test, lr_z_predict, average='macro')

        results.append([f1_micro, f1_macro])

    results = np.array(results)

    return results.mean(0), results.std(0) 
Example 46
Project: palladio   Author: slipguru   File: model_assessment.py    GNU General Public License v3.0
def _check_cv(cv=3, y=None, classifier=False, **kwargs):
    """Input checker utility for building a cross-validator.

    Parameters
    ----------
    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross-validation,
          - integer, to specify the number of folds.
          - An object to be used as a cross-validation generator.
          - An iterable yielding train/test splits.

        For integer/None inputs, if classifier is True and ``y`` is either
        binary or multiclass, :class:`StratifiedKFold` is used. In all other
        cases, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.

    y : array-like, optional
        The target variable for supervised learning problems.

    classifier : boolean, optional, default False
        Whether the task is a classification task, in which case
        stratified KFold will be used.

    kwargs : dict
        Other parameters for StratifiedShuffleSplit or ShuffleSplit.

    Returns
    -------
    checked_cv : a cross-validator instance.
        The return value is a cross-validator which generates the train/test
        splits via the ``split`` method.
    """
    if cv is None:
        cv = kwargs.pop('n_splits', 0) or 10

    if isinstance(cv, numbers.Integral):
        if (classifier and (y is not None) and
                (type_of_target(y) in ('binary', 'multiclass'))):
            return StratifiedShuffleSplit(cv, **kwargs)
        else:
            return ShuffleSplit(cv, **kwargs)

    if not hasattr(cv, 'split') or isinstance(cv, str):
        if not isinstance(cv, Iterable) or isinstance(cv, str):
            raise ValueError("Expected cv as an integer, cross-validation "
                             "object (from sklearn.model_selection) "
                             "or an iterable. Got %s." % cv)
        return _CVIterableWrapper(cv)

    return cv  # New style cv objects are passed without any modification 
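A short sketch of how this checker dispatches, assuming the imports the snippet relies on (numbers, type_of_target and the shuffle splitters from sklearn.model_selection):

import numbers                                           # used inside _check_cv
import numpy as np
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit
from sklearn.utils.multiclass import type_of_target      # used inside _check_cv

y = np.array([0, 1, 0, 1, 0, 1, 0, 1])

# Integer cv, classifier=True and a binary/multiclass y -> StratifiedShuffleSplit
cv = _check_cv(cv=5, y=y, classifier=True, test_size=0.25, random_state=0)
print(type(cv).__name__)    # StratifiedShuffleSplit

# Any other case (here classifier=False) -> plain ShuffleSplit
cv = _check_cv(cv=5, y=y, classifier=False)
print(type(cv).__name__)    # ShuffleSplit

# cv=None falls back to 10 splits unless n_splits is passed via kwargs
cv = _check_cv(cv=None, y=y, classifier=True)
print(cv.n_splits)          # 10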
Example 47
Project: Forecast-Loan-Default   Author: Perry961002   File: train_use_boosting.py    MIT License 4 votes vote down vote up
def main():
    # 1. Load the training and test data and preprocess them
    # Replace the 96 and 98 markers in NumberTime30-59, 60-89 and 90 with NaN
    # Replace 0 in Age with NaN
    colnames = ['ID', 'label', 'RUUnsecuredL', 'age', 'NOTime30-59', 
                'DebtRatio', 'Income', 'NOCredit', 'NOTimes90', 
                'NORealEstate', 'NOTime60-89', 'NODependents']
    col_nas = ['', 'NA', 'NA', 0, [98, 96], 'NA', 'NA', 'NA', [98, 96], 'NA', [98, 96], 'NA']
    col_na_values = creatDictKV(colnames, col_nas)
    dftrain = pd.read_csv("./data/cs-training.csv", names=colnames, na_values=col_na_values, skiprows=[0])
    #print(dftrain)
    train_id = [int(x) for x in dftrain.pop("ID")]
    y_train = np.asarray([int(x) for x in dftrain.pop("label")])
    x_train = dftrain.as_matrix()

    dftest = pd.read_csv("./data/cs-test.csv", names=colnames, na_values=col_na_values, skiprows=[0])
    test_id = [int(x) for x in dftest.pop("ID")]
    y_test = np.asarray(dftest.pop("label"))
    x_test = dftest.as_matrix()
    # 2. Use StratifiedShuffleSplit to split the training data into training_new and test_new (used to validate the model)
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.33333, random_state=0)
    for train_index, test_index in sss.split(x_train, y_train):
        x_train_new, x_test_new = x_train[train_index], x_train[test_index]
        y_train_new, y_test_new = y_train[train_index], y_train[test_index]

    y_train = y_train_new
    x_train = x_train_new
    # 3. Use Imputer to replace NaN with the column mean
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp.fit(x_train)
    x_train = imp.transform(x_train)
    x_test_new = imp.transform(x_test_new)
    x_test = imp.transform(x_test)
    x_train = np.delete(x_train, 5, axis=1)
    x_test_new = np.delete(x_test_new, 5, axis=1)
    if not os.path.isfile("boost_model.m"):
        clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=5, min_samples_leaf=50, class_weight="balanced"),
                                 n_estimators=10,
                                 algorithm='SAMME.R',
                                 learning_rate=0.4)
        clf.fit(x_train, y_train)
        joblib.dump(clf, "boost_model.m")
        predicted_probs_train = clf.predict_proba(x_train)
        predicted_probs_train = [x[1] for x in predicted_probs_train]
        computeAUC(y_train, predicted_probs_train)
    else:
        clf = joblib.load("boost_model.m")
        predicted_probs_test_new = clf.predict_proba(x_test_new)
        predicted_probs_test_new = [x[1] for x in predicted_probs_test_new]
        computeAUC(y_test_new, predicted_probs_test_new) 
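Examples 47 to 49 rely on two helpers, creatDictKV and computeAUC, that are defined elsewhere in the project. A minimal sketch of what they presumably do (the originals may differ):

from sklearn.metrics import roc_auc_score

def creatDictKV(keys, values):
    # Presumably builds the {column name: NA marker(s)} mapping that
    # pandas.read_csv consumes via its na_values argument.
    return dict(zip(keys, values))

def computeAUC(y_true, y_scores):
    # Presumably reports the ROC AUC of the predicted probabilities.
    auc = roc_auc_score(y_true, y_scores)
    print("AUC: %.4f" % auc)
    return auc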
Example 48
Project: Forecast-Loan-Default   Author: Perry961002   File: train_use_lr.py    MIT License 4 votes vote down vote up
def main():
    # 1. Load the training and test data and preprocess them
    # Replace the 96 and 98 markers in NumberTime30-59, 60-89 and 90 with NaN
    # Replace 0 in Age with NaN
    colnames = ['ID', 'label', 'RUUnsecuredL', 'age', 'NOTime30-59', 
                'DebtRatio', 'Income', 'NOCredit', 'NOTimes90', 
                'NORealEstate', 'NOTime60-89', 'NODependents']
    col_nas = ['', 'NA', 'NA', 0, [98, 96], 'NA', 'NA', 'NA', [98, 96], 'NA', [98, 96], 'NA']
    col_na_values = creatDictKV(colnames, col_nas)
    dftrain = pd.read_csv("./data/cs-training.csv", names=colnames, na_values=col_na_values, skiprows=[0])
    train_id = [int(x) for x in dftrain.pop("ID")]
    y_train = np.asarray([int(x) for x in dftrain.pop("label")])
    x_train = dftrain.as_matrix()

    dftest = pd.read_csv("./data/cs-test.csv", names=colnames, na_values=col_na_values, skiprows=[0])
    test_id = [int(x) for x in dftest.pop("ID")]
    y_test = np.asarray(dftest.pop("label"))
    x_test = dftest.as_matrix()
    # 2. Use StratifiedShuffleSplit to split the training data into training_new and test_new (used to validate the model)
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.33333, random_state=0)
    for train_index, test_index in sss.split(x_train, y_train):
        print("TRAIN:", train_index, "TEST:", test_index)
        x_train_new, x_test_new = x_train[train_index], x_train[test_index]
        y_train_new, y_test_new = y_train[train_index], y_train[test_index]

    y_train = y_train_new
    x_train = x_train_new
    # 3. Use Imputer to replace NaN with the column mean
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp.fit(x_train)
    x_train = imp.transform(x_train)
    x_test_new = imp.transform(x_test_new)
    x_test = imp.transform(x_test)
    x_train = np.delete(x_train, 5, axis=1)
    x_test_new = np.delete(x_test_new, 5, axis=1)
    if not os.path.isfile("lr_model.m"):
        clf = LogisticRegression(class_weight="balanced")
        clf.fit(x_train, y_train)
        joblib.dump(clf,"lr_model.m")
        predicted_probs_train =clf.predict_proba(x_train)
        predicted_probs_train = [x[1] for  x in predicted_probs_train]
        computeAUC(y_train, predicted_probs_train)
    else:
        clf = joblib.load("lr_model.m")
        predicted_probs_test_new = clf.predict_proba(x_test_new)
        predicted_probs_test_new = [x[1] for x in predicted_probs_test_new]
        computeAUC(y_test_new, predicted_probs_test_new) 
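With n_splits=1 the split loop in Examples 47 and 48 runs exactly once, so the same stratified holdout can be taken more directly with next(); a small sketch (the arrays below are synthetic stand-ins for x_train and y_train):

import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

# Synthetic stand-ins for the x_train / y_train built above.
x_train = np.random.RandomState(0).randn(30, 10)
y_train = np.array([0, 1] * 15)

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.33333, random_state=0)
train_index, test_index = next(sss.split(x_train, y_train))
x_train_new, x_test_new = x_train[train_index], x_train[test_index]
y_train_new, y_test_new = y_train[train_index], y_train[test_index]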
Example 49
Project: Forecast-Loan-Default   Author: Perry961002   File: dtc_learn_curve.py    MIT License 4 votes vote down vote up
def main():
    # 1. Load the training and test data and preprocess them
    # Replace the 96 and 98 markers in NumberTime30-59, 60-89 and 90 with NaN
    # Replace 0 in Age with NaN
    colnames = ['ID', 'label', 'RUUnsecuredL', 'age', 'NOTime30-59', 
                'DebtRatio', 'Income', 'NOCredit', 'NOTimes90', 
                'NORealEstate', 'NOTime60-89', 'NODependents']
    col_nas = ['', 'NA', 'NA', 0, [98, 96], 'NA', 'NA', 'NA', [98, 96], 'NA', [98, 96], 'NA']
    col_na_values = creatDictKV(colnames, col_nas)
    dftrain = pd.read_csv("./data/cs-training.csv", names=colnames, na_values=col_na_values, skiprows=[0])
    dftrain.pop("NOCredit")
    train_id = [int(x) for x in dftrain.pop("ID")]
    y_train = np.asarray([int(x) for x in dftrain.pop("label")])
    x_train = dftrain.as_matrix()
    # Use StratifiedShuffleSplit to split the training data into training_new and test_new (used to validate the model)
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.33333, random_state=0)
    for train_index, test_index in sss.split(x_train, y_train):
        x_train_new, x_test_new = x_train[train_index], x_train[test_index]
        y_train_new, y_test_new = y_train[train_index], y_train[test_index]

    y_train = y_train_new
    x_train = x_train_new
    # Use Imputer to replace NaN with the column mean
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp.fit(x_train)
    x_train = imp.transform(x_train)
    x_test_new = imp.transform(x_test_new)
    
    train_errors, test_errors = [], []
    for i in range(1, 50):
        print('max_depth: ', i)
        clf = DecisionTreeClassifier(max_depth=i, class_weight="balanced")
        clf.fit(x_train, y_train)
        y_train_predicted = clf.predict(x_train)
        y_test_predicted = clf.predict(x_test_new)
        train_errors.append(mean_squared_error(y_train_predicted, y_train))
        test_errors.append(mean_squared_error(y_test_predicted, y_test_new))
    plt.plot(range(1, 50), np.sqrt(train_errors), "r-x", label="Train Set", linewidth=2)
    plt.plot(range(1, 50), np.sqrt(test_errors), "b-o", label="Test Set", linewidth=2)
    plt.legend(loc='upper right')
    plt.xlabel("Max Depth")
    plt.ylabel("RMSE")
    plt.show()
    
    train_errors, test_errors = [], []
    for i in range(1, 1000, 20):
        print('min_samples_leaf: ', i)
        clf = DecisionTreeClassifier(min_samples_leaf=i, class_weight="balanced")
        clf.fit(x_train, y_train)
        y_train_predicted = clf.predict(x_train)
        y_test_predicted = clf.predict(x_test_new)
        train_errors.append(mean_squared_error(y_train_predicted, y_train))
        test_errors.append(mean_squared_error(y_test_predicted, y_test_new))
    plt.plot(range(1, 1000, 20), np.sqrt(train_errors), "r-x", label="Train Set", linewidth=2)
    plt.plot(range(1, 1000, 20), np.sqrt(test_errors), "b-o", label="Test Set", linewidth=2)
    plt.legend(loc='upper right')
    plt.xlabel("Min Samples Leaf")
    plt.ylabel("RMSE")
    plt.show()
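The Imputer class used in Examples 47 to 49 was removed in scikit-learn 0.22; a minimal sketch of the same mean imputation with sklearn.impute.SimpleImputer:

import numpy as np
from sklearn.impute import SimpleImputer

# Equivalent of Imputer(missing_values='NaN', strategy='mean', axis=0)
# on a tiny stand-in array: each NaN becomes its column mean.
x_demo = np.array([[1.0, np.nan], [3.0, 4.0], [np.nan, 6.0]])
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(x_demo)
print(imp.transform(x_demo))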