Python sklearn.model_selection.train_test_split() Examples

The following are 30 code examples showing how to use sklearn.model_selection.train_test_split(). These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example.

You may check out the related API usage on the sidebar.

You may also want to check out all available functions/classes of the module sklearn.model_selection , or try the search function .

Example 1
Project: Kaggler   Author: jeongyoonlee   File: automl.py    License: MIT License 6 votes vote down vote up
def optimize_hyperparam(self, X, y, test_size=.2, n_eval=100):
        """Tune XGBoost hyperparameters with a hyperopt TPE search.

        Args:
            X, y: training features and labels.
            test_size: fraction of the data held out for validation scoring.
            n_eval: number of hyperopt evaluations.

        Returns:
            (hyperparams, trials): best parameter dict and the hyperopt Trials log.
        """
        x_tr, x_va, y_tr, y_va = train_test_split(X, y, test_size=test_size,
                                                  shuffle=self.shuffle)

        def _objective(hp):
            # Fit one candidate with early stopping on the validation split.
            booster = XGBModel(n_estimators=self.n_est, **self.params, **hp)
            booster.fit(X=x_tr, y=y_tr,
                        eval_set=[(x_va, y_va)],
                        eval_metric=self.metric,
                        early_stopping_rounds=self.n_stop,
                        verbose=False)
            history = booster.evals_result()['validation_0'][self.metric]
            # loss_sign flips metrics that should be maximized into losses.
            return {'loss': history[booster.best_iteration] * self.loss_sign,
                    'status': STATUS_OK,
                    'model': booster}

        trials = Trials()
        best = hyperopt.fmin(fn=_objective, space=self.space, trials=trials,
                             algo=tpe.suggest, max_evals=n_eval, verbose=1,
                             rstate=self.random_state)

        return space_eval(self.space, best), trials
Example 2
Project: Kaggler   Author: jeongyoonlee   File: automl.py    License: MIT License 6 votes vote down vote up
def optimize_hyperparam(self, X, y, test_size=.2, n_eval=100):
        """Tune LightGBM hyperparameters with a hyperopt TPE search.

        Args:
            X, y: training features and labels.
            test_size: fraction of the data held out for validation scoring.
            n_eval: number of hyperopt evaluations.

        Returns:
            (hyperparams, trials): best parameter dict and the hyperopt Trials log.
        """
        x_tr, x_va, y_tr, y_va = train_test_split(X, y, test_size=test_size,
                                                  shuffle=self.shuffle)

        dtrain = lgb.Dataset(x_tr, label=y_tr)
        dvalid = lgb.Dataset(x_va, label=y_va)

        def _objective(hp):
            # Train one candidate with early stopping on the validation set.
            booster = lgb.train({**self.params, **hp}, dtrain, self.n_est,
                                dvalid, early_stopping_rounds=self.n_stop,
                                verbose_eval=0)
            # loss_sign flips metrics that should be maximized into losses.
            return {'loss': booster.best_score["valid_0"][self.metric] * self.loss_sign,
                    'status': STATUS_OK,
                    'model': booster}

        trials = Trials()
        best = hyperopt.fmin(fn=_objective, space=self.space, trials=trials,
                             algo=tpe.suggest, max_evals=n_eval, verbose=1,
                             rstate=self.random_state)

        return space_eval(self.space, best), trials
Example 3
Project: differential-privacy-library   Author: IBM   File: test_LogisticRegression.py    License: MIT License 6 votes vote down vote up
def test_different_results(self):
        """Two randomized DP fits should differ from each other and from sklearn's."""
        from sklearn import datasets, linear_model
        from sklearn.model_selection import train_test_split

        iris = datasets.load_iris()
        X_tr, X_te, y_tr, _ = train_test_split(iris.data, iris.target, test_size=0.2)

        predictions = []
        for _ in range(2):
            dp_clf = LogisticRegression(data_norm=12)
            dp_clf.fit(X_tr, y_tr)
            predictions.append(dp_clf.predict(X_te))

        baseline = linear_model.LogisticRegression(solver="lbfgs", multi_class="ovr")
        baseline.fit(X_tr, y_tr)
        predictions.append(baseline.predict(X_te))

        predict1, predict2, predict3 = predictions
        self.assertFalse(np.all(predict1 == predict2))
        self.assertFalse(np.all(predict3 == predict1) and np.all(predict3 == predict2))
Example 4
Project: differential-privacy-library   Author: IBM   File: test_LogisticRegression.py    License: MIT License 6 votes vote down vote up
def test_same_results(self):
        """With epsilon=inf the DP classifier should agree with plain sklearn."""
        from sklearn import datasets, linear_model
        from sklearn.model_selection import train_test_split

        iris = datasets.load_iris()
        X_tr, X_te, y_tr, _ = train_test_split(iris.data, iris.target, test_size=0.2)

        dp_clf = LogisticRegression(data_norm=12, epsilon=float("inf"))
        dp_clf.fit(X_tr, y_tr)
        dp_pred = dp_clf.predict(X_te)

        sk_clf = linear_model.LogisticRegression(solver="lbfgs", multi_class="ovr")
        sk_clf.fit(X_tr, y_tr)
        sk_pred = sk_clf.predict(X_te)

        self.assertTrue(np.all(dp_pred == sk_pred))
Example 5
Project: differential-privacy-library   Author: IBM   File: test_LinearRegression.py    License: MIT License 6 votes vote down vote up
def test_different_results(self):
        """Two randomized DP regressions should differ from each other and from OLS."""
        from sklearn import datasets, linear_model
        from sklearn.model_selection import train_test_split

        iris = datasets.load_iris()
        X_tr, X_te, y_tr, _ = train_test_split(iris.data, iris.target, test_size=0.2)

        bounds_X = ([4.3, 2.0, 1.1, 0.1], [7.9, 4.4, 6.9, 2.5])
        predictions = []
        for _ in range(2):
            dp_model = LinearRegression(data_norm=12, bounds_X=bounds_X, bounds_y=(0, 2))
            dp_model.fit(X_tr, y_tr)
            predictions.append(dp_model.predict(X_te))

        ols = linear_model.LinearRegression()
        ols.fit(X_tr, y_tr)
        predictions.append(ols.predict(X_te))

        predict1, predict2, predict3 = predictions
        self.assertFalse(np.all(predict1 == predict2))
        self.assertFalse(np.all(predict3 == predict1) and np.all(predict3 == predict2))
Example 6
Project: differential-privacy-library   Author: IBM   File: test_LinearRegression.py    License: MIT License 6 votes vote down vote up
def test_same_results(self):
        """With epsilon=inf the DP regression should match plain sklearn OLS."""
        from sklearn import datasets
        from sklearn.model_selection import train_test_split
        from sklearn import linear_model

        dataset = datasets.load_iris()
        X_train, X_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.2)

        clf = LinearRegression(data_norm=12, epsilon=float("inf"),
                               bounds_X=([4.3, 2.0, 1.0, 0.1], [7.9, 4.4, 6.9, 2.5]), bounds_y=(0, 2))
        clf.fit(X_train, y_train)

        predict1 = clf.predict(X_test)

        # BUGFIX: dropped `normalize=False` — it was the default value, the
        # keyword was deprecated in scikit-learn 1.0 and removed in 1.2, where
        # passing it raises TypeError.
        clf = linear_model.LinearRegression()
        clf.fit(X_train, y_train)

        predict2 = clf.predict(X_test)

        self.assertTrue(np.allclose(predict1, predict2))
Example 7
Project: differential-privacy-library   Author: IBM   File: test_GaussianNB.py    License: MIT License 6 votes vote down vote up
def test_with_iris(self):
        """DP GaussianNB should beat chance on iris; partial_fit doubles the counts."""
        global_seed(12345)
        from sklearn import datasets
        iris = datasets.load_iris()

        x_tr, x_te, y_tr, y_te = train_test_split(iris.data, iris.target, test_size=.2)

        # Per-feature (lower, upper) bounds required by the DP mechanism.
        feature_bounds = ([4.3, 2.0, 1.0, 0.1], [7.9, 4.4, 6.9, 2.5])

        clf = GaussianNB(epsilon=5.0, bounds=feature_bounds)
        clf.fit(x_tr, y_tr)

        counts = clf.class_count_.copy()
        self.assertGreater(clf.score(x_te, y_te), 0.5)

        # A second pass over the same data should exactly double the counts.
        clf.partial_fit(x_tr, y_tr)
        self.assertEqual(np.sum(clf.class_count_), np.sum(counts) * 2)
Example 8
Project: interpret-text   Author: interpretml   File: common_utils.py    License: MIT License 6 votes vote down vote up
def create_cancer_data():
    """Load the breast-cancer dataset and return an 80/20 train/test split.

    Returns:
        (x_train, x_test, y_train, y_test, feature_names, target_names)
    """
    # '?' marks missing values; interpolate fills them before the int cast.
    cancer = (
        retrieve_dataset("breast-cancer.train.csv", na_values="?")
        .interpolate()
        .astype("int64")
    )
    labels = cancer.iloc[:, 0]
    features = cancer.iloc[:, 1:]
    feature_names = features.columns.values
    target_names = ["no_cancer", "cancer"]
    # Split data into train and test
    x_train, x_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=0
    )
    return x_train, x_test, y_train, y_test, feature_names, target_names
Example 9
Project: interpret-text   Author: interpretml   File: common_utils.py    License: MIT License 6 votes vote down vote up
def create_simple_titanic_data():
    """Load the titanic3 dataset and return an 80/20 train/test split.

    Returns:
        (X_train, X_test, y_train, y_test, numeric_features, categorical_features)
    """
    titanic_url = (
        "https://raw.githubusercontent.com/amueller/"
        "scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv"
    )
    data = read_csv(titanic_url)
    # Fill missing values. BUGFIX: `fillna(method=...)` is deprecated since
    # pandas 2.1 (slated for removal); the dedicated ffill()/bfill() methods
    # are the supported, behaviorally identical replacements.
    data = data.ffill()
    data = data.bfill()
    numeric_features = ["age", "fare"]
    categorical_features = ["embarked", "sex", "pclass"]

    y = data["survived"].values
    X = data[categorical_features + numeric_features]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    return X_train, X_test, y_train, y_test, numeric_features, categorical_features
Example 10
Project: MaliciousMacroBot   Author: egaus   File: mmbot.py    License: MIT License 6 votes vote down vote up
def mmb_evaluate_model(self):
        """
        Returns scores from cross validation evaluation on the malicious / benign classifier
        """
        predictive_features = self.features['predictive_features']
        self.clf_X = self.modeldata[predictive_features].values
        self.clf_y = np.array(self.modeldata['label'])

        X_train, X_test, y_train, y_test = train_test_split(self.clf_X, self.clf_y,
                                                            test_size=0.2, random_state=0)
        # Binarize labels, keeping only the first output column.
        binarizer = LabelBinarizer()
        y_train = np.array([row[0] for row in binarizer.fit_transform(y_train)])
        eval_cls = RandomForestClassifier(n_estimators=100, max_features=.2)
        eval_cls.fit(X_train, y_train)

        # Same scorers, evaluated in the original order.
        scores = {}
        for key, scoring in (('recall', 'recall'), ('precision', 'precision'),
                             ('accuracy', 'accuracy'), ('f1', 'f1_macro')):
            scores[key] = cross_val_score(eval_cls, X_train, y_train, cv=5, scoring=scoring)

        return {'accuracy': scores['accuracy'], 'f1': scores['f1'],
                'precision': scores['precision'], 'recall': scores['recall']}
Example 11
Project: kaggle-carvana-2017   Author: killthekitten   File: datasets.py    License: MIT License 6 votes vote down vote up
def bootstrapped_split(car_ids, seed=args.seed):
    """
    # Arguments
        metadata: metadata.csv provided by Carvana (should include
        `train` column).

    # Returns
        A tuple (train_ids, test_ids)
    """
    # BUGFIX: `all_ids` was built but never used — split the Series so the
    # bootstrap below can use positional indexing and `.values`.
    all_ids = pd.Series(car_ids)
    train_ids, valid_ids = train_test_split(all_ids, test_size=args.test_size_float,
                                            random_state=seed)

    np.random.seed(seed)
    # BUGFIX: np.random.random_integers is deprecated, returned a single
    # scalar (not a bootstrap sample) and had an inclusive upper bound
    # (off-by-one). Draw len(train_ids) indices with replacement instead.
    bootstrapped_idx = np.random.randint(0, len(train_ids), len(train_ids))
    bootstrapped_train_ids = train_ids.iloc[bootstrapped_idx]

    return generate_filenames(bootstrapped_train_ids.values), generate_filenames(valid_ids)
Example 12
Project: Python-ELM   Author: masaponto   File: ml_elm.py    License: MIT License 6 votes vote down vote up
def main():
    """Compare ML-ELM against a plain ELM on the OpenML diabetes dataset."""
    from sklearn import preprocessing
    from sklearn.datasets import fetch_openml as fetch_mldata
    from sklearn.model_selection import train_test_split

    data_set = fetch_mldata('diabetes')
    data_set.data = preprocessing.normalize(data_set.data)

    # Map the string labels onto {+1, -1}.
    data_set.target = [1 if label == "tested_positive" else -1
                       for label in data_set.target]

    X_train, X_test, y_train, y_test = train_test_split(
        data_set.data, data_set.target, test_size=0.4)

    mlelm = MLELM(hidden_units=(10, 30, 200)).fit(X_train, y_train)
    elm = ELM(200).fit(X_train, y_train)

    print("MLELM Accuracy %0.3f " % mlelm.score(X_test, y_test))
    print("ELM Accuracy %0.3f " % elm.score(X_test, y_test))
Example 13
Project: Tensorflow-Audio-Classification   Author: luuil   File: audio_util.py    License: Apache License 2.0 6 votes vote down vote up
def train_test_val_split(X, Y, split=(0.2, 0.1), shuffle=True):
    """Split dataset into train/val/test subsets by 70:20:10(default).
    
    Args:
      X: List of data.
      Y: List of labels corresponding to data.
      split: Tuple of split ratio in `test:val` order, as fractions of the
        whole dataset.
      shuffle: Bool of shuffle or not.
      
    Returns:
      Three dataset in `train:test:val` order.
    """
    from sklearn.model_selection import train_test_split
    assert len(X) == len(Y), 'The length of X and Y must be consistent.'
    X_train, X_test_val, Y_train, Y_test_val = train_test_split(X, Y, 
        test_size=(split[0]+split[1]), shuffle=shuffle)
    # BUGFIX: the second split operates on the held-out pool only, so the val
    # fraction must be rescaled relative to that pool. Passing split[1]
    # directly yielded split[1]*(split[0]+split[1]) of the total data
    # (e.g. 3% instead of the documented 10%).
    val_fraction = split[1] / (split[0] + split[1])
    X_test, X_val, Y_test, Y_val = train_test_split(X_test_val, Y_test_val, 
        test_size=val_fraction, shuffle=False)
    return (X_train, Y_train), (X_test, Y_test), (X_val, Y_val)
Example 14
Project: fake-news-detection   Author: aldengolab   File: run.py    License: MIT License 6 votes vote down vote up
def pipeline(args):
    '''
    Runs the model loop.

    Reads the CSV named by args.filename, optionally dedupes and restricts
    sources, splits into train/test and runs ModelLoop over args.models.
    '''
    df = pd.read_csv(args.filename)
    df.loc[:, args.x_label] = df[args.x_label].fillna("None")
    if args.dedupe:
        df = df.drop_duplicates(subset='content')
    if args.reduce:
        df = restrict_sources(df)
    X = df[args.x_label]
    y = df[args.y_label]
    # BUGFIX: removed `parser = spacy.load('en')` — the loaded model was
    # never used anywhere in this function, and loading it is expensive.
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    loop = ModelLoop(X_train, X_test, y_train, y_test, args.models,
                     args.iterations, args.output_dir,
                     thresholds = args.thresholds, ks = args.ks,
                     setting=args.features[0])
    loop.run()
Example 15
Project: dota2-predictor   Author: andreiapostoae   File: learning_curve.py    License: MIT License 6 votes vote down vote up
def plot_learning_curve(x_train, y_train, subsets=20, mmr=None, cv=5, tool='matplotlib'):
    """Plot CV error and 1-AUC against log-spaced training-subset sizes."""
    x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.2)

    # Log-spaced subset sizes from e^3 up to the full training set.
    subset_sizes = np.exp(np.linspace(3, np.log(len(y_train)), subsets)).astype(int)

    cv_errors = []
    auc_errors = []
    for size in subset_sizes:
        logger.info('Performing cross validation on subset_size %d', size)
        _, _, cv_score, roc_auc, _ = evaluate([x_train[:size], y_train[:size]],
                                              [x_test, y_test], cv=cv)
        cv_errors.append(1 - cv_score)
        auc_errors.append(1 - roc_auc)

    results_list = [cv_errors, auc_errors]
    if tool == 'matplotlib':
        _plot_matplotlib(subset_sizes, results_list, mmr)
    else:
        _plot_plotly(subset_sizes, results_list, mmr)
Example 16
Project: xcessiv   Author: reiinakano   File: models.py    License: Apache License 2.0 6 votes vote down vote up
def return_train_dataset(self):
        """Returns train data set

        Returns:
            X (numpy.ndarray): Features

            y (numpy.ndarray): Labels
        """
        X, y = self.return_main_dataset()

        if self.test_dataset['method'] != 'split_from_main':
            return X, y

        # Carve the test portion off the main dataset; the stratified split
        # keeps class proportions and the fixed seed keeps it reproducible.
        X, _, y, _ = train_test_split(
            X,
            y,
            test_size=self.test_dataset['split_ratio'],
            random_state=self.test_dataset['split_seed'],
            stratify=y
        )
        return X, y
Example 17
Project: edge2vec   Author: RoyZhengGao   File: multi_class_classification.py    License: BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def multi_class_classification(data_X, data_Y):
    '''
    calculate multi-class classification and return related evaluation metrics
    '''
    # BUGFIX: converted Python 2 `print` statements to print() calls — the
    # old syntax is a SyntaxError on Python 3; the printed output is the same.
    svc = svm.SVC(C=1, kernel='linear')
    # X_train, X_test, y_train, y_test = train_test_split(data_X, data_Y, test_size=0.4, random_state=0)
    clf = svc.fit(data_X, data_Y)  # svm
    # Out-of-fold predictions via 2-fold cross validation.
    predicted = cross_val_predict(clf, data_X, data_Y, cv=2)
    print("accuracy", metrics.accuracy_score(data_Y, predicted))
    print("f1 score macro", metrics.f1_score(data_Y, predicted, average='macro'))
    print("f1 score micro", metrics.f1_score(data_Y, predicted, average='micro'))
    print("precision score", metrics.precision_score(data_Y, predicted, average='macro'))
    print("recall score", metrics.recall_score(data_Y, predicted, average='macro'))
    print("hamming_loss", metrics.hamming_loss(data_Y, predicted))
    print("classification_report", metrics.classification_report(data_Y, predicted))
    # NOTE(review): jaccard_similarity_score was removed in scikit-learn 0.23;
    # on newer versions use metrics.jaccard_score(..., average=...) instead.
    print("jaccard_similarity_score", metrics.jaccard_similarity_score(data_Y, predicted))
    # print("log_loss", metrics.log_loss(data_Y, predicted))
    print("zero_one_loss", metrics.zero_one_loss(data_Y, predicted))
    # print("AUC&ROC", metrics.roc_auc_score(data_Y, predicted))
    # print("matthews_corrcoef", metrics.matthews_corrcoef(data_Y, predicted))
Example 18
Project: sgd-influence   Author: sato9hara   File: DataModule.py    License: MIT License 6 votes vote down vote up
def fetch(self, n_tr, n_val, n_test, seed=0):
        """Load the data and return ((x_tr, y_tr), (x_val, y_val), (x_test, y_test)).

        Two chained splits carve out exactly n_tr / n_val / n_test samples;
        optional standardization is fit on the train split only.
        """
        x, y = self.load()

        # split data
        x_tr, x_rest, y_tr, y_rest = train_test_split(
            x, y, train_size=n_tr, test_size=n_val+n_test, random_state=seed)
        x_val, x_test, y_val, y_test = train_test_split(
            x_rest, y_rest, train_size=n_val, test_size=n_test, random_state=seed+1)

        # process x
        if self.normalize:
            scaler = StandardScaler()
            scaler.fit(x_tr)
            x_tr = scaler.transform(x_tr)
            x_val = scaler.transform(x_val)
            x_test = scaler.transform(x_test)
        if self.append_one:
            # Append a constant-1 column (bias feature) to each split.
            x_tr = np.c_[x_tr, np.ones(n_tr)]
            x_val = np.c_[x_val, np.ones(n_val)]
            x_test = np.c_[x_test, np.ones(n_test)]

        return (x_tr, y_tr), (x_val, y_val), (x_test, y_test)
Example 19
Project: qb   Author: Pinafore   File: pipeline.py    License: MIT License 6 votes vote down vote up
def run(self):
        """Split the guesser train fold 90/10 and write torch-format JSON files."""
        with open(QANTA_TRAIN_DATASET_PATH) as f:
            all_guess_train = [q for q in json.load(f)['questions']
                               if q['fold'] == GUESSER_TRAIN_FOLD]

        guess_train, guess_val = train_test_split(all_guess_train,
                                                  random_state=42, train_size=.9)

        with open(QANTA_DEV_DATASET_PATH) as f:
            guess_dev = [q for q in json.load(f)['questions']
                         if q['fold'] == GUESSER_DEV_FOLD]

        # Write each split in qanta JSON format.
        outputs = ((QANTA_TORCH_TRAIN_LOCAL_PATH, guess_train),
                   (QANTA_TORCH_VAL_LOCAL_PATH, guess_val),
                   (QANTA_TORCH_DEV_LOCAL_PATH, guess_dev))
        for path, questions in outputs:
            with open(path, 'w') as f:
                json.dump(format_qanta_json(questions, DS_VERSION), f)
Example 20
Project: SGCN   Author: benedekrozemberczki   File: sgcn.py    License: GNU General Public License v3.0 6 votes vote down vote up
def setup_dataset(self):
        """
        Creating train and test split.

        Splits positive and negative edge lists into train/test parts, builds
        the node feature matrix, and converts training edges, features and
        targets to tensors on the configured device.
        """
        self.positive_edges, self.test_positive_edges = train_test_split(self.edges["positive_edges"],
                                                                         test_size=self.args.test_size)

        self.negative_edges, self.test_negative_edges = train_test_split(self.edges["negative_edges"],
                                                                         test_size=self.args.test_size)
        # Total number of training edges; must be computed while both edge
        # collections are still Python lists (they become tensors below).
        self.ecount = len(self.positive_edges + self.negative_edges)

        self.X = setup_features(self.args,
                                self.positive_edges,
                                self.negative_edges,
                                self.edges["ncount"])

        # Transpose to shape (2, n_edges) — the usual edge-index layout for
        # torch indexing.
        self.positive_edges = torch.from_numpy(np.array(self.positive_edges,
                                                        dtype=np.int64).T).type(torch.long).to(self.device)

        self.negative_edges = torch.from_numpy(np.array(self.negative_edges,
                                                        dtype=np.int64).T).type(torch.long).to(self.device)

        # Targets: 0 for the first half of training edges, 1 for the second,
        # plus 2*ecount entries of class 2. NOTE(review): this assumes the
        # first half are the positive edges and that class 2 marks "no edge";
        # confirm against setup_features and the training loop.
        self.y = np.array([0 if i < int(self.ecount/2) else 1 for i in range(self.ecount)]+[2]*(self.ecount*2))
        self.y = torch.from_numpy(self.y).type(torch.LongTensor).to(self.device)
        self.X = torch.from_numpy(self.X).float().to(self.device)
Example 21
Project: snape   Author: mbernico   File: make_dataset.py    License: Apache License 2.0 6 votes vote down vote up
def write_dataset(df, file_name, out_path="." + os.path.sep):
    """
    Writes generated dataset to file

    Splits `df` 80/20 into train and test-key CSVs; the test file has the
    target column `y` removed, while the test-key file keeps it.

    :param df: dataframe to write
    :param file_name: beginning of filename
    :param out_path: the path to write the dataset
    :return: None
    """
    # todo: Mike, do we want to take a param for overwriting existing files?
    df_train, df_testkey = train_test_split(df, test_size=.2)

    # BUGFIX: join path components with os.path.join instead of raw string
    # concatenation, so out_path works with or without a trailing separator.
    base = os.path.join(out_path, file_name)
    df_train.to_csv(base + "_train.csv", index=False)
    df_test = df_testkey.drop(['y'], axis=1)
    df_test.to_csv(base + "_test.csv", index=False)
    df_testkey.to_csv(base + "_testkey.csv", index=False)
Example 22
Project: libact   Author: ntucllab   File: label_digits.py    License: BSD 2-Clause "Simplified" License 6 votes vote down vote up
def split_train_test(n_classes):
    """Build libact train/test Datasets from sklearn digits, with only the
    first n_labeled training labels kept and the rest set to None."""
    from sklearn.datasets import load_digits

    n_labeled = 5
    digits = load_digits(n_class=n_classes)  # consider binary case
    X = digits.data
    y = digits.target
    print(np.shape(X))

    # Re-split until the first n_labeled training points cover every class.
    while True:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
        if len(np.unique(y_train[:n_labeled])) >= n_classes:
            break

    unlabeled_tail = [None] * (len(y_train) - n_labeled)
    trn_ds = Dataset(X_train, np.concatenate([y_train[:n_labeled], unlabeled_tail]))
    tst_ds = Dataset(X_test, y_test)

    return trn_ds, tst_ds, digits
Example 23
Project: TripletLossFace   Author: aangfanboy   File: main_data_engine.py    License: MIT License 6 votes vote down vote up
def create_tensorflow_dataset_object(self, paths, labels, test_rate: float = 0.1, test_data: tuple = (None, None), supportive: bool = False):
		"""Split paths/labels and wrap them into shuffled tf.data.Dataset objects.

		Unless `supportive` is True, the resulting datasets are also stored on
		self.dataset_train / self.dataset_test. Returns (train_ds, test_ds).
		"""
		print("Creating TensorFlow dataset object...")
		if type(test_data) != tuple:
			# BUGFIX: `printl` was an undefined name (NameError at runtime);
			# the sibling implementation of this method uses print().
			print("\"test_data\" must be tuple for 'create_tensorflow_dataset_object', test data will be taken from real data " + 
				f"with rate of {test_rate}. You have 5 seconds to stop the process and cancel running. Use Ctrl+C to do that.")

			time.sleep(5)


		paths_train, paths_test, labels_train, labels_test = train_test_split(paths, labels, test_size=test_rate, random_state=42)
		print("Dataset splitted by system, please make sure this is what you want.")

		dataset_train = tf.data.Dataset.from_tensor_slices((paths_train, labels_train)).shuffle(len(labels_train))
		dataset_test = tf.data.Dataset.from_tensor_slices((paths_test, labels_test)).shuffle(len(labels_test))
		print("TensorFlow dataset object created!")

		if not supportive:
			self.dataset_train = dataset_train
			self.dataset_test = dataset_test

		return dataset_train, dataset_test
Example 24
Project: TripletLossFace   Author: aangfanboy   File: main_data_engine.py    License: MIT License 6 votes vote down vote up
def create_tensorflow_dataset_object(self, paths, labels, test_rate: float = 0.1, test_data: tuple = (None, None), supportive: bool = False):
		"""Split paths/labels and wrap them into shuffled tf.data.Dataset objects.

		Unless `supportive` is True, the resulting datasets are also stored on
		self.dataset_train / self.dataset_test. Returns (train_ds, test_ds).
		"""
		print("Creating TensorFlow dataset object...")
		if type(test_data) is not tuple:
			warning = ("\"test_data\" must be tuple for 'create_tensorflow_dataset_object', test data will be taken from real data " + 
				f"with rate of {test_rate}. You have 5 seconds to stop the process and cancel running. Use Ctrl+C to do that.")
			print(warning)
			# Give the user a chance to abort before the fallback split.
			time.sleep(5)

		train_paths, test_paths, train_labels, test_labels = train_test_split(
			paths, labels, test_size=test_rate, random_state=42)
		print("Dataset splitted by system, please make sure this is what you want.")

		train_ds = tf.data.Dataset.from_tensor_slices((train_paths, train_labels)).shuffle(len(train_labels))
		test_ds = tf.data.Dataset.from_tensor_slices((test_paths, test_labels)).shuffle(len(test_labels))
		print("TensorFlow dataset object created!")

		if not supportive:
			self.dataset_train = train_ds
			self.dataset_test = test_ds

		return train_ds, test_ds
Example 25
Project: AIX360   Author: IBM   File: test_shap.py    License: Apache License 2.0 6 votes vote down vote up
def test_ShapLinearExplainer(self):
        """Smoke-test LinearExplainer on TF-IDF features of the IMDB corpus."""
        corpus, y = shap.datasets.imdb()
        corpus_train, corpus_test, y_train, y_test = train_test_split(
            corpus, y, test_size=0.2, random_state=7)

        # Fit TF-IDF on train only; transform test with the same vocabulary.
        vectorizer = TfidfVectorizer(min_df=10)
        X_train = vectorizer.fit_transform(corpus_train)
        X_test = vectorizer.transform(corpus_test)

        classifier = sklearn.linear_model.LogisticRegression(penalty="l1", C=0.1,
                                                             solver='liblinear')
        classifier.fit(X_train, y_train)

        explainer = LinearExplainer(classifier, X_train, feature_dependence="independent")
        explainer.explain_instance(X_test)
        print("Invoked Shap LinearExplainer")

    # comment this test as travis runs out of resources 
Example 26
Project: cloudml-samples   Author: GoogleCloudPlatform   File: utils.py    License: Apache License 2.0 6 votes vote down vote up
def data_train_test_split(data_df):
  """Split the DataFrame two subsets for training and validation.

  Args:
    data_df: (pandas.DataFrame) DataFrame the splitting to be performed on

  Returns:
    A Tuple of (pandas.DataFrame, pandas.Series,
                pandas.DataFrame, pandas.Series)
  """

  label_column = metadata.LABEL
  # Restrict to the configured feature columns plus the label.
  subset = data_df[metadata.FEATURE_COLUMNS + [label_column]]

  train, val = model_selection.train_test_split(subset)
  x_train, y_train = _feature_label_split(train, label_column)
  x_val, y_val = _feature_label_split(val, label_column)
  return x_train, y_train, x_val, y_val
Example 27
Project: nodevectors   Author: VHRanger   File: graph_eval.py    License: MIT License 6 votes vote down vote up
def print_labeled_tests(w, y, test_size=0.2, seed=42):
    """
    Clustering and label prediction tests
    """
    X_train, X_test, y_train, y_test = train_test_split(
        w, y, test_size=test_size, random_state=seed)
    # Print Label Prediction Tests
    res = LabelPrediction(w, y, test_size=test_size, seed=seed)
    # Can only cluster on single-label (not multioutput)
    if len(y.shape) >= 2:
        return res

    cluster_labels = cluster.AgglomerativeClustering(
        n_clusters=np.unique(y).size,
        affinity='cosine',
        linkage='average',
    ).fit(w).labels_
    cluster_scores = evalClusteringOnLabels(cluster_labels, y, verbose=True)
    return {**res, **cluster_scores}
Example 28
Project: Kaggler   Author: jeongyoonlee   File: test_automl.py    License: MIT License 5 votes vote down vote up
def test_automl():
    """End-to-end check that tuned AutoLGB/AutoXGB beat a random baseline."""
    X, y = make_regression(n_samples=N_OBS,
                           n_features=N_FEATURE,
                           n_informative=N_IMP_FEATURE,
                           random_state=RANDOM_SEED)
    X = pd.DataFrame(X, columns=['x{}'.format(i) for i in range(X.shape[1])])
    y = pd.Series(y)
    # BUGFIX: logging.info(X.shape, y.shape) treats X.shape as the format
    # string and y.shape as its argument, which fails when the record is
    # formatted; pass an explicit format string instead.
    logging.info('X shape: %s, y shape: %s', X.shape, y.shape)

    X_trn, X_tst, y_trn, y_tst = train_test_split(X, y, test_size=.2, random_state=RANDOM_SEED)

    model = AutoLGB(objective='regression', metric='l1')
    model.tune(X_trn, y_trn)
    model.fit(X_trn, y_trn)
    p = model.predict(X_tst)
    # Random baseline drawn uniformly over the training-target range.
    r = (np.random.rand(X_tst.shape[0]) * (y_trn.max() - y_trn.min()) + y_trn.min())
    logging.info('MAE (LGB): {:.4f}'.format(mae(y_tst, p)))
    assert mae(y_tst, p) < mae(y_tst, r)

    model = AutoXGB(objective='reg:linear', metric='rmse')
    model.tune(X_trn, y_trn)
    model.fit(X_trn, y_trn)
    p = model.predict(X_tst)
    r = (np.random.rand(X_tst.shape[0]) * (y_trn.max() - y_trn.min()) + y_trn.min())
    logging.info('MAE (XGB): {:.4f}'.format(mae(y_tst, p)))
    assert mae(y_tst, p) < mae(y_tst, r)
Example 29
Project: differential-privacy-library   Author: IBM   File: test_PCA.py    License: MIT License 5 votes vote down vote up
def test_different_results(self):
        """Randomized DP PCA should differ across fits and from sklearn PCA."""
        from sklearn import datasets
        from sklearn.model_selection import train_test_split

        dataset = datasets.load_iris()
        X_train, X_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.2)

        # Center both splits with the training mean, matching centered=True.
        mean = np.mean(X_train, axis=0)
        X_train -= mean
        X_test -= mean

        transforms = []
        for _ in range(2):
            dp_pca = PCA(data_norm=12, centered=True)
            dp_pca.fit(X_train, y_train)
            transforms.append(dp_pca.transform(X_test))

        reference = sk_pca.PCA(svd_solver='full')
        reference.fit(X_train, y_train)
        transforms.append(reference.transform(X_test))

        transform1, transform2, transform3 = transforms
        self.assertFalse(np.all(transform1 == transform2))
        self.assertFalse(np.all(transform3 == transform1) and np.all(transform3 == transform2))
Example 30
Project: interpret-text   Author: interpretml   File: utils_test.py    License: MIT License 5 votes vote down vote up
def setup_mnli_test_train_split():
    """Load the MNLI train set and split sentence1/genre 80/20."""
    train_df = get_mnli_test_dataset('train')
    sentences = train_df['sentence1']
    genres = train_df['genre']
    X_train, X_test, y_train, y_test = train_test_split(
        sentences, genres, train_size=0.8, test_size=0.2)
    return X_train, X_test, y_train, y_test