Python sklearn.cross_validation.train_test_split() Examples

The following are code examples showing how to use sklearn.cross_validation.train_test_split(), collected from open source Python projects.
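
Before the project examples, here is a minimal, self-contained sketch of the basic call pattern; the toy data, variable names, and the 0.25 test fraction are illustrative only and are not taken from any of the projects below. Note that sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in 0.20; in current releases the same function is imported from sklearn.model_selection.

import numpy as np
from sklearn.cross_validation import train_test_split  # use sklearn.model_selection in scikit-learn >= 0.18

# Toy data: 100 samples, 4 features, binary labels.
X = np.random.rand(100, 4)
y = np.random.randint(0, 2, size=100)

# Hold out 25% of the rows for testing; random_state makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

print(X_train.shape, X_test.shape)  # (75, 4) (25, 4)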

Example 1
Project: euclid   Author: njpayne   File: data_work.py    GNU General Public License v2.0 7 votes
def divide_for_training(data):
    ##first use the category for training and use the rest as features except for period code
    ##select_columns = ["names", "of", "columns"]
    #select_columns = header 

    ##select the appropriate columns
    #selected_header, selected_data = select_data_columns(header, data, select_columns)

    #have scikit partition the data into training and test sets
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(data[:, 1:], data[:, :1], test_size=0.15, random_state=0)

    #create the scaler the data based on the training data
    #this is used to scale values to 0 mean and unit variance
    data_scaler = StandardScaler().fit(X_train.astype(np.float32))

    #scale training and test set to mean 0 with unit variance
    X_train = data_scaler.transform(X_train.astype(np.float32))
    X_test = data_scaler.transform(X_test.astype(np.float32))

    return X_train, X_test, y_train, y_test 
Example 2
Project: libact   Author: ntucllab   File: label_digits.py    BSD 2-Clause "Simplified" License 7 votes
def split_train_test(n_classes):
    from sklearn.datasets import load_digits

    n_labeled = 5
    digits = load_digits(n_class=n_classes)  # consider binary case
    X = digits.data
    y = digits.target
    print(np.shape(X))

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
    while len(np.unique(y_train[:n_labeled])) < n_classes:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.33)

    trn_ds = Dataset(X_train, np.concatenate(
        [y_train[:n_labeled], [None] * (len(y_train) - n_labeled)]))
    tst_ds = Dataset(X_test, y_test)

    return trn_ds, tst_ds, digits 
Example 3
Project: corpus-to-graph-ml   Author: CatalystCode   File: data_preparation_tools.py    MIT License 6 votes
def split_to_test_and_train(data, labels, entities, test_size=DEFAULT_TEST_SIZE):
    d_train, d_test, l_train, l_test, c_train, c_test = train_test_split(data, labels, entities, test_size=test_size)
    d_test_2 = []
    l_test_2 = []
    c_test_2 = []

    train_dict = {}
    for d in d_train:
        train_dict[d] = 1

    for d,l,c in zip(d_test, l_test, c_test):
        if d in train_dict:
            continue
        d_test_2.append(d)
        l_test_2.append(l)
        c_test_2.append(c)

    return (d_train, d_test_2, l_train, l_test_2, c_train, c_test_2)

# utility to extract entities from preprocessed files 
Example 4
Project: snape   Author: mbernico   File: make_dataset.py    Apache License 2.0 6 votes
def write_dataset(df, file_name, out_path="." + os.path.sep):
    """
    Writes generated dataset to file

    :param df: dataframe to write
    :param file_name: beginning of filename
    :param out_path: the path to write the dataset
    :return: None
    """
    # todo: Mike, do we want to take a param for overwriting existing files?
    df_train, df_testkey = train_test_split(df, test_size=.2)

    df_train.to_csv(out_path + file_name + "_train.csv", index=False)
    df_test = df_testkey.drop(['y'], axis=1)
    df_test.to_csv(out_path + file_name + "_test.csv", index=False)
    df_testkey.to_csv(out_path + file_name + "_testkey.csv", index=False) 
Example 5
Project: libact   Author: ntucllab   File: multilabel_plot.py    BSD 2-Clause "Simplified" License 6 votes
def split_train_test(test_size):
    # choose a dataset with unbalanced class instances
    data = make_multilabel_classification(
        n_samples=300, n_classes=10, allow_unlabeled=False)
    X = StandardScaler().fit_transform(data[0])
    Y = data[1]

    X_trn, X_tst, Y_trn, Y_tst = train_test_split(X, Y, test_size=test_size)

    trn_ds = Dataset(X_trn, Y_trn[:5].tolist() + [None] * (len(Y_trn) - 5))
    tst_ds = Dataset(X_tst, Y_tst.tolist())

    fully_labeled_trn_ds = Dataset(X_trn, Y_trn)

    return trn_ds, tst_ds, fully_labeled_trn_ds 
Example 6
Project: dockerizeme   Author: dockerizeme   File: snippet.py    Apache License 2.0 6 votes
def run(sc):
	def zero_matrix(n, m):
		return np.zeros(n*m, dtype = int).reshape(n, m)
	
	def vote_increment(y_est):
		increment = zero_matrix(y_est.size, n_ys)
		increment[np.arange(y_est.size), y_est] = 1
		return increment # test point x class matrix with 1s marking the estimator prediction

	X, y = make_classification()
	X_train, X_test, y_train, y_test = train_test_split(X, y)

	n_test = X_test.shape[0]
	n_ys = np.unique(y_train).size
	
	model = DecisionTreeClassifier()
	# Partition the training data into random sub-samples with replacement.
	samples = sc.parallelize(Bootstrap(y.size))
	# Train a model for each sub-sample and apply it to the test data.
	# Tuple parameters in lambdas are Python 2 only; index into the (train, test) pair instead.
	vote_tally = samples.map(lambda split:
		model.fit(X[split[0]], y[split[0]]).predict(X_test)
	).map(vote_increment).fold(zero_matrix(n_test, n_ys), np.add) # Take the learner majority vote.
	y_estimate_vote = np.argmax(vote_tally, axis = 1)
	return accuracy_score(y_test, y_estimate_vote) 
Example 7
Project: datacleaning-benchmark   Author: sjyk   File: EvalUtils.py    MIT License 6 votes
def generateDirtyTrain(X,
	                   y,
	                   noisemodelX=None, 
	                   noisemodely=None, 
	                   test_size=0.2):

  X_train, X_test, y_train, y_test = train_test_split(X, 
  	                                                  y, 
  	                                                  test_size=test_size)
  if noisemodelX is not None:
  	nmx = noisemodelX.reshape(np.shape(X_train))
  	X_train = nmx.apply(X_train)[0]

  if noisemodely is not None:
  	nmx = noisemodely.reshape(np.shape(y_train))
  	y_train = nmx.apply(y_train)[0]

  return X_train, X_test, y_train, y_test 
Example 8
Project: Bayesian-Deep-Learning   Author: guilherme-pombo   File: bayesian_neural_net.py    MIT License 6 votes
def create_data(plot=False):
    """
    Create training data
    :return:
    """
    X, Y = make_moons(noise=0.2, random_state=0, n_samples=1000)
    X = scale(X)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.5)

    if plot:
        fig, ax = plt.subplots()
        ax.scatter(X[Y == 0, 0], X[Y == 0, 1], label='Class 0')
        ax.scatter(X[Y == 1, 0], X[Y == 1, 1], color='r', label='Class 1')
        sns.despine(); ax.legend()
        ax.set(xlabel='X', ylabel='Y', title='Classification data set');

        plt.show()

    return X, Y, X_train, X_test, Y_train, Y_test 
Example 9
Project: RSV   Author: Healthcast   File: skeleton.py    GNU General Public License v2.0 6 votes
def test_knn():
    iris = datasets.load_iris()
    iris_X = iris.data[:,:2]
    iris_y = iris.target
    X_train, X_test, y_train, y_test = train_test_split(iris_X, iris_y, test_size=.5, \
                                                        random_state=0)
    #n: number of neighbors
    #weights: uniform or distance
    clf = neighbors.KNeighborsClassifier(15, weights='uniform')
    clf.fit(X_train, y_train)
    x_min, x_max = X_train[:, 0].min() - 1, X_train[:, 0].max() + 1
    y_min, y_max = X_train[:, 1].min() - 1, X_train[:, 1].max() + 1

    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02), np.arange(y_min, y_max, 0.02))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.pcolormesh(xx, yy, Z)

    plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.title("3-Class classification (k = %i, weights = '%s')"%(15, 'uniform'))
    plt.show() 
Example 10
Project: RSV   Author: Healthcast   File: skeleton.py    GNU General Public License v2.0 6 votes
def test_knn():
    iris = datasets.load_iris()
    iris_X = iris.data[:,:2]
    iris_y = iris.target
    X_train, X_test, y_train, y_test = train_test_split(iris_X, iris_y, test_size=.5, \
                                                        random_state=0)
    #n: number of neighbors
    #weights: uniform or distance
    clf = neighbors.KNeighborsClassifier(15, weights='uniform')
    clf.fit(X_train, y_train)
    x_min, x_max = X_train[:, 0].min() - 1, X_train[:, 0].max() + 1
    y_min, y_max = X_train[:, 1].min() - 1, X_train[:, 1].max() + 1

    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02), np.arange(y_min, y_max, 0.02))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.pcolormesh(xx, yy, Z)

    plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.title("3-Class classification (k = %i, weights = '%s')"%(15, 'uniform'))
    plt.show() 
Example 11
Project: RPGOne   Author: RTHMaK   File: test_custom_decay.py    Apache License 2.0 6 votes
def testIrisExponentialDecay(self):
        random.seed(42)

        iris = datasets.load_iris()
        X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                            iris.target,
                                                            test_size=0.2,
                                                            random_state=42)
        # setup exponential decay function
        def exp_decay(global_step):
            return tf.train.exponential_decay(
                learning_rate=0.1, global_step=global_step,
                decay_steps=100, decay_rate=0.001)
        classifier = skflow.TensorFlowDNNClassifier(hidden_units=[10, 20, 10],
                                                    n_classes=3, steps=800,
                                                    learning_rate=exp_decay)
        classifier.fit(X_train, y_train)
        score = metrics.accuracy_score(y_test, classifier.predict(X_test))

        self.assertGreater(score, 0.7, "Failed with score = {0}".format(score)) 
Example 12
Project: RPGOne   Author: RTHMaK   File: test_estimators.py    Apache License 2.0 6 votes
def testIrisMomentum(self):
        random.seed(42)

        iris = datasets.load_iris()
        X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                            iris.target,
                                                            test_size=0.2,
                                                            random_state=42)
        # setup exponential decay function
        def exp_decay(global_step):
            return tf.train.exponential_decay(
                learning_rate=0.1, global_step=global_step,
                decay_steps=100, decay_rate=0.001)
        custom_optimizer = lambda x: tf.train.MomentumOptimizer(x, 0.9)
        classifier = skflow.TensorFlowDNNClassifier(hidden_units=[10, 20, 10],
                                                    n_classes=3, steps=800,
                                                    learning_rate=exp_decay,
                                                    optimizer=custom_optimizer)
        classifier.fit(X_train, y_train)
        score = metrics.accuracy_score(y_test, classifier.predict(X_test))

        self.assertGreater(score, 0.7, "Failed with score = {0}".format(score)) 
Example 13
Project: image-recognition   Author: zw76859420   File: face_dl.py    GNU General Public License v3.0 6 votes
def load(self , img_rows=64 , img_cols=64 , img_channels=3 , nb_classes=2):
        images , labels = load_dataset(self.pathname)
        train_images , valid_images , train_labels , valid_labels = train_test_split(images , labels , test_size=0.2 , random_state=random.randint(0 , 100))

        train_images = train_images.reshape(train_images.shape[0] , img_rows , img_cols , img_channels)
        valid_images = valid_images.reshape(valid_images.shape[0] , img_rows , img_cols , img_channels)

        self.input_shape = (img_rows , img_cols , img_channels)

        train_labels = np_utils.to_categorical(train_labels , num_classes=nb_classes)
        valid_labels = np_utils.to_categorical(valid_labels , num_classes=nb_classes)

        # astype returns a new array, so assign the result back
        train_images = train_images.astype('float32')
        valid_images = valid_images.astype('float32')

        train_images = train_images / 255
        valid_images = valid_images / 255

        self.train_images = train_images
        self.valid_images = valid_images
        self.train_labels = train_labels
        self.valid_labels = valid_labels 
Example 14
Project: apachecn_ml   Author: ys1305   File: sklearn-RS-demo-cf-item-test.py    GNU General Public License v3.0 6 votes
def splitData(self, dataFile, test_size):
        # Load the dataset
        header = ['user_id', 'item_id', 'rating', 'timestamp']
        df = pd.read_csv(dataFile, sep='\t', names=header)

        self.n_users = df.user_id.unique().shape[0]
        self.n_items = df.item_id.unique().shape[0]

        print('Number of users = ' + str(self.n_users) +
              ' | Number of items = ' + str(self.n_items))

        # Split the dataset: users + movies
        self.train_data, self.test_data = cv.train_test_split(
            df, test_size=test_size)
        print('Successfully split the training and test sets', file=sys.stderr)
        print('len(train) = %s' % np.shape(self.train_data)[0], file=sys.stderr)
        print('len(test) = %s' % np.shape(self.test_data)[0], file=sys.stderr)
Example 15
Project: deep_image_model   Author: tobegit3hub   File: iris_run_config.py    Apache License 2.0 6 votes
def main(unused_argv):
  # Load dataset.
  iris = datasets.load_iris()
  x_train, x_test, y_train, y_test = cross_validation.train_test_split(
      iris.data, iris.target, test_size=0.2, random_state=42)

  # You can define your configurations by providing a RunConfig object to
  # the estimator to control session configurations, e.g. num_cores
  # and gpu_memory_fraction
  run_config = tf.contrib.learn.estimators.RunConfig(
      num_cores=3, gpu_memory_fraction=0.6)

  # Build 3 layer DNN with 10, 20, 10 units respectively.
  feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input(
      x_train)
  classifier = tf.contrib.learn.DNNClassifier(feature_columns=feature_columns,
                                              hidden_units=[10, 20, 10],
                                              n_classes=3,
                                              config=run_config)

  # Fit and predict.
  classifier.fit(x_train, y_train, steps=200)
  predictions = list(classifier.predict(x_test, as_iterable=True))
  score = metrics.accuracy_score(y_test, predictions)
  print('Accuracy: {0:f}'.format(score)) 
Example 16
Project: deep_image_model   Author: tobegit3hub   File: iris.py    Apache License 2.0 6 votes
def main(unused_argv):
  # Load dataset.
  iris = learn.datasets.load_dataset('iris')
  x_train, x_test, y_train, y_test = cross_validation.train_test_split(
      iris.data, iris.target, test_size=0.2, random_state=42)

  # Build 3 layer DNN with 10, 20, 10 units respectively.
  feature_columns = learn.infer_real_valued_columns_from_input(x_train)
  classifier = learn.DNNClassifier(
      feature_columns=feature_columns, hidden_units=[10, 20, 10], n_classes=3)

  # Fit and predict.
  classifier.fit(x_train, y_train, steps=200)
  predictions = list(classifier.predict(x_test, as_iterable=True))
  score = metrics.accuracy_score(y_test, predictions)
  print('Accuracy: {0:f}'.format(score)) 
Example 17
Project: deep_image_model   Author: tobegit3hub   File: iris_custom_decay_dnn.py    Apache License 2.0 6 votes
def main(unused_argv):
  iris = datasets.load_iris()
  x_train, x_test, y_train, y_test = train_test_split(
      iris.data, iris.target, test_size=0.2, random_state=42)

  feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input(
      x_train)
  classifier = tf.contrib.learn.DNNClassifier(feature_columns=feature_columns,
                                              hidden_units=[10, 20, 10],
                                              n_classes=3,
                                              optimizer=optimizer_exp_decay)

  classifier.fit(x_train, y_train, steps=800)
  predictions = list(classifier.predict(x_test, as_iterable=True))
  score = metrics.accuracy_score(y_test, predictions)
  print('Accuracy: {0:f}'.format(score)) 
Example 18
Project: sfcc   Author: kv-kunalvyas   File: auxiliary.py    MIT License 5 votes
def plotLearningCurves(train, classifier):
    #P.show()
    X = train.values[:, 1::]
    y = train.values[:, 0]

    train_sizes, train_scores, test_scores = learning_curve(
            classifier, X, y, cv=10, n_jobs=-1, train_sizes=np.linspace(.1, 1., 10), verbose=0)

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.figure()
    plt.title("Learning Curves")
    plt.legend(loc="best")
    plt.xlabel("Training samples")
    plt.ylabel("Error Rate")
    plt.ylim((0, 1))
    plt.gca().invert_yaxis()
    plt.grid()

    # Plot the average training and test score lines at each training set size
    plt.plot(train_sizes, train_scores_mean, 'o-', color="b", label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="r", label="Test score")

    # Plot the std deviation as a transparent range at each training set size
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std,
                     alpha=0.1, color="b")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std,
                     alpha=0.1, color="r")

    # Draw the plot and reset the y-axis
    plt.draw()
    plt.gca().invert_yaxis()

    # shuffle and split training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25)
    classifier.fit(X_train, y_train)
    plt.show() 
Example 19
Project: Mussy-Robot   Author: arnomoonens   File: training.py    MIT License 5 votes
def training(data):

    
    svc_1 = SVC(kernel='linear')
    
    
    #we create the target vector: -1 for sad images, 0 for normal, and 1 for happy images;
    #the data consists of 15 sad images, then 15 happy images, then 15 normal images
    zero=[int(i) for i in numpy.zeros(15)]
    one=[int(i) for i in numpy.ones(15)]
    minus1=[int(i) for i in numpy.repeat(-1,15)]
    target=numpy.concatenate((minus1,one,zero,),axis=0)
   
    #we test whether the classifier works correctly with CROSS-VALIDATION
    #5 fold cross validation
    from sklearn.cross_validation import train_test_split

    
    
    X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.20, random_state=0)
    
    from sklearn import neighbors
    n_neighbors =3 
    for weights in ['uniform', 'distance']:
        # we create an instance of Neighbours Classifier and fit the data.
        KNeigh = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
        KNeigh.fit(X_train,y_train)
        print(KNeigh.predict(X_test))
        
    print(y_test)
    #evaluate_cross_validation(KNeigh, X_train, y_train, 10)
    #svc is better!!!
    svc_1.fit(X_train,y_train)
    evaluate_cross_validation(svc_1, X_train, y_train, 10)
    joblib.dump(svc_1,'svc_1.pkl') 
Example 20
Project: LifelongVAE   Author: jramapuram   File: svhn_class.py    MIT License 5 votes
def train_validation_spit(train_dataset, train_labels):
    train_dataset, validation_dataset, train_labels, validation_labels = train_test_split(train_dataset, train_labels, test_size=0.1, random_state = 42)
    return train_dataset, validation_dataset, train_labels, validation_labels 
Example 21
Project: ml-helper-funcs   Author: numb3r33   File: feature_selection.py    MIT License 5 votes
def split_examples(X, y):
	X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=214)
	
	return X_train, X_test, y_train, y_test 
Example 22
Project: ml-helper-funcs   Author: numb3r33   File: custom_metrics_knn.py    MIT License 5 votes
def split_dataset(X, y):
	X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=241)
	
	return X_train, X_test, y_train, y_test 
Example 23
Project: twitter-svm   Author: josh-byster   File: calculations.py    MIT License 5 votes
def regularSVM(X,Y,c,pctTest,shouldReturnMetrics):
    #svm = LinearSVC(C=c);
    svm=linear_model.LogisticRegression(C=c);
    cv=X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X,Y, test_size=pctTest, random_state=None)
    svm.fit(X_train,Y_train)
    y_pred=svm.predict(X_test)
    channels=svm.classes_
    channels.sort()
    getWrongValues(y_pred,Y_test,channels,shouldReturnMetrics,num=len(X))
    return svm 
Example 24
Project: twitter-svm   Author: josh-byster   File: calculations.py    MIT License 5 votes
def testOverN(X,Y,c,pctTest,channels,shouldReturnMetrics=False,increment=100):
    for i in range(100, len(X), 50):
        start = time.time()
        svm = LinearSVC(C=c);
        cv=X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X[:i],Y[:i], test_size=pctTest, random_state=None)
        svm.fit(X_train,Y_train)
        y_pred=svm.predict(X_test)
        print(str(i) + "," + str(metrics.accuracy_score(Y_test, y_pred, normalize=True, sample_weight=None))+","+str(time.time()-start)) 
Example 25
Project: JAABF   Author: drr3d   File: classify.py    GNU General Public License v3.0 5 votes
def train(self, X, y, max_df=1.0, minword=1, maxfeature=10000, **algoparam):
        #Convert a collection of text documents to a matrix of token counts
        #This implementation produces a sparse representation of the counts using scipy.sparse.coo_matrix.
        #If you do not provide an a-priori dictionary and you do not use an analyzer that does some kind of feature selection
        #   then the number of features will be equal to the vocabulary size found by analyzing the data
        # the count vectorizer produces a "bag of words" and for the term frequencies
        self._tf_vectorizer = countVectorizer(max_df=max_df, min_df=minword, max_features=maxfeature,\
                                        stop_words = self.stoplist, ngram_range=(1,3))

        self._tf_transformer = tfidfTransformer(norm='l2', use_idf=False, smooth_idf=True, sublinear_tf=False)

        from sklearn.cross_validation import train_test_split
        if self.validation_split:
            if type(self.validation_split) is float:
                # split into xx% for train and x% for test
                X_train, self.X_test, y_train, self.y_test = train_test_split(X, y, test_size=self.validation_split, \
                                                                                random_state=337, stratify=y)
            else:
                raise RuntimeError("validation_split must float...")
        else:
             X_train = X
             y_train = y

        algo = self.solver_algo(**algoparam)
        model = JAABFEstimator(vectorizer=self._tf_vectorizer, \
                            transformer=self._tf_transformer, classifier=algo)
        
        return model.fit(X_train,y_train) 
Example 26
Project: face_landmark_dnn   Author: junhwanjang   File: train_mobilenets.py    MIT License 5 votes
def main():
#        Define X and y
# #        Load data
        PATH = "./data/64_64_1/offset_1.3/"
        X = np.load(PATH + "basic_dataset_img.npz")
        y = np.load(PATH + "basic_dataset_pts.npz")
        X = X['arr_0']
        y = y['arr_0'].reshape(-1, 136)
        

        print("Define X and Y")
        print("=======================================")
        
        # Split train / test dataset
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
        print("Success of getting train / test dataset")
        print("=======================================")
        print("X_train: ", X_train.shape)
        print("y_train: ", y_train.shape)
        print("X_test: ", X_test.shape)
        print("y_test: ", y_test.shape)
        print("=======================================")

        model.compile(loss=smoothL1, optimizer=keras.optimizers.Adam(lr=1e-3), metrics=['mape'])
        print(model.summary())
        # checkpoint
        filepath="./mobilenet_checkpoints/smooth_L1-{epoch:02d}-{val_mean_absolute_percentage_error:.5f}.hdf5"
        checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
        callbacks_list = [checkpoint]
        history = model.fit(X_train, y_train, batch_size=64, epochs=10000, shuffle=True,\
                            verbose=1, validation_data=(X_test, y_test), callbacks=callbacks_list)

        # Save model
        model.save("./model/face_landmark_dnn.h5")
        print("=======================================")
        print("Save Final Model")
        print("=======================================") 
Example 27
Project: face_landmark_dnn   Author: junhwanjang   File: train_basic_models.py    MIT License 5 votes
def main():
#        Define X and y
# #        Load data
        PATH = "./data/64_64_1/offset_1.3/"
        X = np.load(PATH + "basic_dataset_img.npz")
        y = np.load(PATH + "basic_dataset_pts.npz")
        X = X['arr_0']
        y = y['arr_0'].reshape(-1, 136)
        

        print("Define X and Y")
        print("=======================================")
        
        # Split train / test dataset
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
        print("Success of getting train / test dataset")
        print("=======================================")
        print("X_train: ", X_train.shape)
        print("y_train: ", y_train.shape)
        print("X_test: ", X_test.shape)
        print("y_test: ", y_test.shape)
        print("=======================================")

        model.compile(loss=smoothL1, optimizer=keras.optimizers.Adam(lr=1e-3), metrics=['mape'])
        print(model.summary())
        # checkpoint
        filepath="./basic_checkpoints/smooth_L1-{epoch:02d}-{val_mean_absolute_percentage_error:.5f}.hdf5"
        checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
        callbacks_list = [checkpoint]
        history = model.fit(X_train, y_train, batch_size=64, epochs=10000, shuffle=True,\
                            verbose=1, validation_data=(X_test, y_test), callbacks=callbacks_list)

        # Save model
        model.save("./model/face_landmark_dnn.h5")
        print("=======================================")
        print("Save Final Model")
        print("=======================================") 
Example 28
Project: kaggle_rossmann   Author: datanuggets   File: svm_1.py    GNU General Public License v2.0 5 votes
def main():
    print "Loading train set..."
    train_df = load_dataset('/Users/Carlos_Vaquero/Desktop/Rossmann/train.csv')
    data_columns = train_df.columns.diff(['Sales', 'Customers'])
    target_column = 'Sales'


    # this must be changed ! Has to be sequential not random...
    print "Splitting train and verification sets..."
    train_index, validation_index = train_test_split(
        train_df.index,
        test_size=0.1,
        random_state=RANDOM_STATE
    )

    X = train_df.loc[train_index[1:10], data_columns[1:10]]


    #print 'train_index', train_index
    #print 'data_columns', data_columns


    y=train_df.loc[train_index[1:10], target_column]
    print('X', X)

    print(' ')

    print('Y', y)

    print("Training svm...")
    svr = svm.SVR(C=1.0, epsilon=0.2).fit(
        X=train_df.loc[train_index, data_columns],
        y=train_df.loc[train_index, target_column],
    ) 
Example 29
Project: DCASE2017-task1   Author: ronggong   File: xgb_classification.py    GNU Affero General Public License v3.0 5 votes
def train_test(clf, X, y, labels):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    save_results(y_test, y_pred, labels) 
Example 30
Project: SofPythonBot   Author: UtkucanBykl   File: base.py    GNU General Public License v3.0 5 votes
def learning(self):
        self.vect = TfidfVectorizer(min_df=1)
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(self.frame_x, self.frame_y, test_size=0.2, random_state=4)
        self.x_trainvect = self.vect.fit_transform(self.x_train)
        self.x_trainvect.toarray()
        self.vect1 = TfidfVectorizer(min_df=1)
        self.x_trainvect = self.vect1.fit_transform(self.x_train)
        a = self.x_trainvect.toarray()
        self.vect1.inverse_transform(a[0]) 
Example 31
Project: Fraud-Corruption-Detection-Data-Science-Pipeline-DSSG2015   Author: eredmiles   File: model_pipeline_script.py    MIT License 5 votes
def plot_confusion(classifier,threshold =0.4):
    x_train,x_test,y_train,y_test = train_test_split(df_new,y,test_size = 0.2)
    y_pred = []
    try:
        prob_score = clf_grid.predict_proba(x_train)
    except:
        prob_score = clf_grid.predict_proba(np.float_(x_train))
    a = prob_score[:,1]
    for idx,item in enumerate(a):
        if item>= threshold:
            item = 1
        else:
            item =0
        y_pred.append(item)
    # Plotting                                                                                                              

    class_name = classifier.__repr__()
    class_name = re.sub(r'\([^)]*\)','',class_name)
    print ("")
    print ("")
    print("Legends")
    print ('1 - Substantiated')
    print ('0 - Unfounded')
    print("")
    print("Confusion Matrix: "+ class_name+ " (threshold- " +str(threshold)+")"  )
    sns.heatmap(metrics.confusion_matrix(y_pred, y_train), annot=True, cmap="YlGnBu",fmt ="d")
    plt.xlabel('Predicted')
    plt.ylabel('True') 
Example 32
Project: Fraud-Corruption-Detection-Data-Science-Pipeline-DSSG2015   Author: eredmiles   File: model_pipeline_script.py    MIT License 5 votes
def feature_direction(idx,dataframe,label,threshold):
    y_pred = [];
    clf_rf = RandomForestClassifier(n_estimators=100, max_depth=80,
                               min_samples_split=5)
    x_train, x_test, y_train, y_test = train_test_split(dataframe, label, test_size=0.2)
    col_names = list(dataframe.columns.values)
    maximum_val = x_train[:,idx].max()
    minimum_val = x_train[:,idx].min()
    feature_name = col_names[idx]
    for i,col in enumerate(x_train):
        if i != idx:
            x_train[:,i] = np.mean(x_train[:,i])
    
    clf_feature = clf_rf.fit(x_train,y_train)
    proba_score_feature=clf_feature.predict_proba(x_train)    
    score = proba_score_feature[:,1]
    for item in score:
        if item >=threshold:
            y_pred.append(1)
        else:
            y_pred.append(0)
    plt.scatter(x_train[:,idx],y_pred)
    plt.xlabel(feature_name)
    plt.ylabel('prediction')
    plt.suptitle('Response Curve-  ' + feature_name)

    return top_features 
Example 33
Project: tf-ft   Author: KleinYuan   File: train.py    MIT License 5 votes
def feed_trainer(self, x, y, data_split_ratio):
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(x, y, test_size=(data_split_ratio[1] + data_split_ratio[2]))
        self.x_test, self.x_val, self.y_test, self.y_val = train_test_split(self.x_test, self.y_test, test_size=(data_split_ratio[2] / (data_split_ratio[1] + data_split_ratio[2])))

        self.graph = self.graph_model.get_graph()
        self.x_placeholder, self.y_placeholder, self.keep_prob_placeholder = self.graph_model.get_placeholders()
        self.writer = self.graph_model.get_writer()
        self.summary = self.graph_model.get_summary()
        self.ops = self.graph_model.get_ops()
        self.loss = self.graph_model.get_loss() 
Example 34
Project: EMNaiveBayes   Author: betterenvi   File: EMNaiveBayes.py    MIT License 5 votes
def _calc_model_accuracy(self, model, X_onehot, Y):
        X_train_onehot, X_test_onehot, Y_train, Y_test = train_test_split(
            X_onehot, Y, test_size=0.2, random_state=0)
        model.fit(X_train_onehot, Y_train)
        pred = model.predict(X_test_onehot)
        accuracy = metrics.accuracy_score(Y_test, pred)
        return accuracy 
Example 35
Project: RSV   Author: Healthcast   File: skeleton.py    GNU General Public License v2.0 5 votes
def test_SVM():
    iris = datasets.load_iris()
    iris_X = iris.data[:,:2]
    iris_y = iris.target
    X_train, X_test, y_train, y_test = train_test_split(iris_X, iris_y, test_size=.5, \
                                                        random_state=0)
    #modify kernel
    #kernels:'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'
    #modify C to control the soft margin: large C, more soft
    svc = svm.SVC(kernel='linear', C=10.0)
    svc.fit(X_train, y_train)

    #plot the results
    x_min, x_max = X_train[:, 0].min() - 1, X_train[:, 0].max() + 1
    y_min, y_max = X_train[:, 1].min() - 1, X_train[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02), np.arange(y_min, y_max,\
    0.02))

    print(len(np.c_[xx.ravel(), yy.ravel()]))
    Z = svc.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
        
    plt.contourf(xx, yy, Z )
    plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=plt.cm.Paired)

    plt.show() 
Example 36
Project: RSV   Author: Healthcast   File: skeleton.py    GNU General Public License v2.0 5 votes
def test_RF():
    
    iris = datasets.load_iris()
    iris_X = iris.data[:,:2]
    iris_y = iris.target
    X_train, X_test, y_train, y_test = train_test_split(iris_X, iris_y, test_size=.5, \
                                                        random_state=0)
    #test max_depth: the max depth of the tree
    #test n_estimators: how many trees used
    #test max_features: how many good features used in each split
    clf = RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)
    clf.fit(X_train, y_train)
    x_min, x_max = X_train[:, 0].min() - 1, X_train[:, 0].max() + 1
    y_min, y_max = X_train[:, 1].min() - 1, X_train[:, 1].max() + 1

    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02), np.arange(y_min, y_max, 0.02))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.pcolormesh(xx, yy, Z)


    plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.title("3-Class classification (k = %i, weights = '%s')"%(15, 'uniform'))
    plt.show() 
Example 37
Project: RSV   Author: Healthcast   File: methods.py    GNU General Public License v2.0 5 votes
def apply_evaluation(paras,  clf, data):
    X = data["X"]
    y = data["y"]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, \
                                                        random_state=0)

    clf.fit(X_train, y_train)
    r = clf.predict(X_test)
    plot_results(r, clf, data, paras)
    

    if paras['eva'] == 'accuracy':
        print "The accuracy:"
        print metrics.accuracy_score(y_test, r)
    elif paras['eva'] == 'precision':
        print "The precision:"
        print metrics.precision_score(y_test, r)
    elif paras['eva'] == 'recall':
        print "The recall:"
        print metrics.recall_score(y_test, r)
    elif paras['eva'] == 'confusion':
        print "The confusion matrix:"
        print metrics.confusion_matrix(y_test, r)
    elif paras['eva'] == 'report':
        print "The report:"
        print metrics.classification_report(y_test, r)
    elif paras['eva'] == 'roc' and paras['clf'] == 'svm':
        scores = clf.decision_function(X_test)
        print "The auc:"
        fpr, tpr, thresholds = metrics.roc_curve(y_test, scores)
        roc_auc = metrics.auc(fpr, tpr)
        print str(roc_auc)
        plt.figure()
        plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.show() 
Example 38
Project: RSV   Author: Healthcast   File: skeleton.py    GNU General Public License v2.0 5 votes
def test_SVM():
    iris = datasets.load_iris()
    iris_X = iris.data[:,:2]
    iris_y = iris.target
    X_train, X_test, y_train, y_test = train_test_split(iris_X, iris_y, test_size=.5, \
                                                        random_state=0)
    #modify kernel
    #kernels:'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'
    #modify C to control the soft margin: large C, more soft
    svc = svm.SVC(kernel='linear', C=10.0)
    svc.fit(X_train, y_train)

    #plot the results
    x_min, x_max = X_train[:, 0].min() - 1, X_train[:, 0].max() + 1
    y_min, y_max = X_train[:, 1].min() - 1, X_train[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02), np.arange(y_min, y_max,\
    0.02))

    print(len(np.c_[xx.ravel(), yy.ravel()]))
    Z = svc.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
        
    plt.contourf(xx, yy, Z )
    plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=plt.cm.Paired)

    plt.show() 
Example 39
Project: RSV   Author: Healthcast   File: skeleton.py    GNU General Public License v2.0 5 votes
def test_RF():
    
    iris = datasets.load_iris()
    iris_X = iris.data[:,:2]
    iris_y = iris.target
    X_train, X_test, y_train, y_test = train_test_split(iris_X, iris_y, test_size=.5, \
                                                        random_state=0)
    #test max_depth: the max depth of the tree
    #test n_estimators: how many trees used
    #test max_features: how many good features used in each split
    clf = RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)
    clf.fit(X_train, y_train)
    x_min, x_max = X_train[:, 0].min() - 1, X_train[:, 0].max() + 1
    y_min, y_max = X_train[:, 1].min() - 1, X_train[:, 1].max() + 1

    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02), np.arange(y_min, y_max, 0.02))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.pcolormesh(xx, yy, Z)


    plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.title("3-Class classification (k = %i, weights = '%s')"%(15, 'uniform'))
    plt.show() 
Example 40
Project: stratosphere-lstm   Author: mendozawow   File: neon_lstm.py    MIT License 5 votes
def split_data(data, split_pct=0.1):
        '''
        Splits data into training and testing.
        '''
        shuffle(data)
        return train_test_split(data, test_size=split_pct) 
Example 41
Project: stratosphere-lstm   Author: mendozawow   File: dga_lstm.py    MIT License 5 votes
def split_data(data, split_pct=0.1):
        '''
        Splits data into training and testing.
        '''
        return train_test_split(data, test_size=split_pct) 
Example 42
Project: stratosphere-lstm   Author: mendozawow   File: dga_lstm.py    MIT License 5 votes
def split_data(data, split_pct=0.1):
        '''
        Splits data into training and testing.
        '''
        shuffle(data)
        return train_test_split(data, test_size=split_pct) 
Example 43
Project: Rbfc   Author: b14ckfir3   File: general_functions.py    GNU General Public License v3.0 5 votes
def split_data_set(data_set, targets, test_size=.5):
    return train_test_split(data_set, targets, test_size=test_size) 
Example 44
Project: RPGOne   Author: RTHMaK   File: test_early_stopping.py    Apache License 2.0 5 votes
def testIrisES(self):
        random.seed(42)

        iris = datasets.load_iris()
        X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                            iris.target,
                                                            test_size=0.2,
                                                            random_state=42)

        X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2)
        val_monitor = skflow.monitors.ValidationMonitor(X_val, y_val, n_classes=3)

        # classifier without early stopping - overfitting
        classifier1 = skflow.TensorFlowDNNClassifier(hidden_units=[10, 20, 10],
                                                     n_classes=3, steps=1000)
        classifier1.fit(X_train, y_train)
        score1 = metrics.accuracy_score(y_test, classifier1.predict(X_test))

        # classifier with early stopping - improved accuracy on testing set
        classifier2 = skflow.TensorFlowDNNClassifier(hidden_units=[10, 20, 10],
                                                     n_classes=3, steps=1000)

        classifier2.fit(X_train, y_train, val_monitor)
        score2 = metrics.accuracy_score(y_test, classifier2.predict(X_test))

        # self.assertGreater(score2, score1, "No improvement using early stopping.") 
Example 45
Project: image-recognition   Author: zw76859420   File: knife_train.py    GNU General Public License v3.0 5 votes
def load(self , img_rows=128 , img_cols=128 , img_channels=3 , nb_classes=2):
        images , labels = load_dataset(self.pathname)
        train_images , valid_images , train_labels , valid_labels = train_test_split(images , labels , test_size=0.3 , random_state=random.randint(0 , 100))
        valid_images , test_images , valid_labels , test_labels = train_test_split(valid_images , valid_labels , test_size=0.5 , random_state=random.randint(0 , 100))

        train_images = train_images.reshape(train_images.shape[0] , img_rows , img_cols , img_channels)
        valid_images = valid_images.reshape(valid_images.shape[0] , img_rows , img_cols , img_channels)
        test_images = test_images.reshape(test_images.shape[0], img_rows, img_cols, img_channels)
        # print(valid_images.shape)

        self.input_shape = (img_rows , img_cols , img_channels)

        train_labels = np_utils.to_categorical(train_labels , num_classes=nb_classes)
        valid_labels = np_utils.to_categorical(valid_labels , num_classes=nb_classes)
        test_labels = np_utils.to_categorical(test_labels , num_classes=nb_classes)
        # print(test_labels)

        # astype returns a new array, so assign the result back
        train_images = train_images.astype('float32')
        valid_images = valid_images.astype('float32')
        test_images = test_images.astype('float32')

        train_images = train_images / 255
        valid_images = valid_images / 255
        test_images = test_images / 255

        self.train_images = train_images
        self.valid_images = valid_images
        self.train_labels = train_labels
        self.valid_labels = valid_labels
        self.test_images = test_images
        self.test_labels = test_labels 
Example 46
Project: FaceLock   Author: Donny-Hikari   File: train.py    MIT License 5 votes
def read(self, img_rows=IMAGE_SIZE, img_cols=IMAGE_SIZE, img_channels=3, nb_classes=2):
        
        images, labels = extract_data(self.TRAIN_DATA)
        labels = np.reshape(labels, [-1])
        X_train, X_test, y_train, y_test = train_test_split(images, labels, test_size=0.3, random_state=random.randint(0, 100))
        X_valid, X_test, y_valid, y_test = train_test_split(images, labels, test_size=0.5, random_state=random.randint(0, 100))
        if K.image_dim_ordering() == 'th':
            X_train = X_train.reshape(X_train.shape[0], img_channels, img_rows, img_cols)
            X_valid = X_valid.reshape(X_valid.shape[0], img_channels, img_rows, img_cols)
            X_test = X_test.reshape(X_test.shape[0], img_channels, img_rows, img_cols)
            input_shape = (img_channels, img_rows, img_cols)
        else:
            X_train = X_train.reshape(X_train.shape[0], img_rows, img_cols, img_channels)
            X_valid = X_valid.reshape(X_valid.shape[0], img_rows, img_cols, img_channels)
            X_test = X_test.reshape(X_test.shape[0], img_rows, img_cols, img_channels)
            input_shape = (img_rows, img_cols, img_channels)

        print('X_train shape:', X_train.shape)
        print(X_train.shape[0], 'train samples')
        print(X_valid.shape[0], 'valid samples')
        print(X_test.shape[0], 'test samples')

        Y_train = np_utils.to_categorical(y_train, nb_classes)
        Y_valid = np_utils.to_categorical(y_valid, nb_classes)
        Y_test = np_utils.to_categorical(y_test, nb_classes)

        X_train = X_train.astype('float32')
        X_valid = X_valid.astype('float32')
        X_test = X_test.astype('float32')
        X_train /= 255
        X_valid /= 255
        X_test /= 255

        self.X_train = X_train
        self.X_valid = X_valid
        self.X_test = X_test
        self.Y_train = Y_train
        self.Y_valid = Y_valid
        self.Y_test = Y_test 
Example 47
Project: elephant   Author: alanyuchenhou   File: estimator.py    MIT License 5 votes
def estimate(self, y, batch_size, test_size, metric, steps=math.inf):
        x, x_test, y, y_test = cross_validation.train_test_split(self.x, y, test_size=test_size)
        x_train, x_validate, y_train, y_validate = cross_validation.train_test_split(x, y, test_size=0.1)
        monitor = learn.monitors.ValidationMonitor(x_validate, y_validate, every_n_steps=(len(x_train) // batch_size),
                                                   early_stopping_rounds=1)
        estimator = learn.Estimator(self._build_model)
        estimator.fit(x_train, y_train, steps=steps, batch_size=batch_size, monitors=[monitor])
        y_predicted = estimator.predict(x_test)
        if metric == 'MAE':
            return metrics.mean_absolute_error(y_test, y_predicted)
        elif metric == 'MSE':
            return metrics.mean_squared_error(y_test, y_predicted)
        else:
            assert False 
Example 48
Project: apachecn_ml   Author: ys1305   File: RS-sklearn-rating.py    GNU General Public License v3.0 5 votes
def splitData(dataFile, test_size):
    # Load the dataset
    header = ['user_id', 'item_id', 'rating', 'timestamp']
    df = pd.read_csv(dataFile, sep='\t', names=header)

    n_users = df.user_id.unique().shape[0]
    n_items = df.item_id.unique().shape[0]

    print('Number of users = ' + str(n_users) + ' | Number of movies = ' +
          str(n_items))
    train_data, test_data = cv.train_test_split(df, test_size=test_size)
    print("数据量:", len(train_data), len(test_data))
    return df, n_users, n_items, train_data, test_data 
Example 49
Project: jingjuSingingPhraseMatching   Author: ronggong   File: xgb_classification.py    GNU Affero General Public License v3.0 5 votes
def train_test(clf, X, y, labels):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    save_results(y_test, y_pred, labels) 
Example 50
Project: deep_image_model   Author: tobegit3hub   File: multiple_gpu.py    Apache License 2.0 5 votes
def main(unused_argv):
  iris = datasets.load_iris()
  x_train, x_test, y_train, y_test = cross_validation.train_test_split(
      iris.data, iris.target, test_size=0.2, random_state=42)

  classifier = learn.Estimator(model_fn=my_model)
  classifier.fit(x_train, y_train, steps=1000)

  y_predicted = [
      p['class'] for p in classifier.predict(x_test, as_iterable=True)]
  score = metrics.accuracy_score(y_test, y_predicted)
  print('Accuracy: {0:f}'.format(score)) 
Example 51
Project: deep_image_model   Author: tobegit3hub   File: boston.py    Apache License 2.0 5 votes
def main(unused_argv):
  # Load dataset
  boston = learn.datasets.load_dataset('boston')
  x, y = boston.data, boston.target

  # Split dataset into train / test
  x_train, x_test, y_train, y_test = cross_validation.train_test_split(
      x, y, test_size=0.2, random_state=42)

  # Scale data (training set) to 0 mean and unit standard deviation.
  scaler = preprocessing.StandardScaler()
  x_train = scaler.fit_transform(x_train)

  # Build 2 layer fully connected DNN with 10, 10 units respectively.
  feature_columns = learn.infer_real_valued_columns_from_input(x_train)
  regressor = learn.DNNRegressor(
      feature_columns=feature_columns, hidden_units=[10, 10])

  # Fit
  regressor.fit(x_train, y_train, steps=5000, batch_size=1)

  # Predict and score
  y_predicted = list(
      regressor.predict(scaler.transform(x_test), as_iterable=True))
  score = metrics.mean_squared_error(y_predicted, y_test)

  print('MSE: {0:f}'.format(score)) 
Example 52
Project: deep_image_model   Author: tobegit3hub   File: iris_val_based_early_stopping.py    Apache License 2.0 5 votes
def main(unused_argv):
  iris = datasets.load_iris()
  x_train, x_test, y_train, y_test = train_test_split(
      iris.data, iris.target, test_size=0.2, random_state=42)

  x_train, x_val, y_train, y_val = train_test_split(
      x_train, y_train, test_size=0.2, random_state=42)
  val_monitor = learn.monitors.ValidationMonitor(
      x_val, y_val, early_stopping_rounds=200)

  model_dir = '/tmp/iris_model'
  clean_folder(model_dir)

  # classifier with early stopping on training data
  classifier1 = learn.DNNClassifier(
      feature_columns=learn.infer_real_valued_columns_from_input(x_train),
      hidden_units=[10, 20, 10], n_classes=3, model_dir=model_dir)
  classifier1.fit(x=x_train, y=y_train, steps=2000)
  predictions1 = list(classifier1.predict(x_test, as_iterable=True))
  score1 = metrics.accuracy_score(y_test, predictions1)

  model_dir = '/tmp/iris_model_val'
  clean_folder(model_dir)

  # classifier with early stopping on validation data, save frequently for
  # monitor to pick up new checkpoints.
  classifier2 = learn.DNNClassifier(
      feature_columns=learn.infer_real_valued_columns_from_input(x_train),
      hidden_units=[10, 20, 10], n_classes=3, model_dir=model_dir,
      config=tf.contrib.learn.RunConfig(save_checkpoints_secs=1))
  classifier2.fit(x=x_train, y=y_train, steps=2000, monitors=[val_monitor])
  predictions2 = list(classifier2.predict(x_test, as_iterable=True))
  score2 = metrics.accuracy_score(y_test, predictions2)

  # In many applications, the score is improved by using early stopping
  print('score1: ', score1)
  print('score2: ', score2)
  print('score2 > score1: ', score2 > score1) 
Example 53
Project: deep_image_model   Author: tobegit3hub   File: hdf5_classification.py    Apache License 2.0 5 votes
def main(unused_argv):
  # Load dataset.
  iris = learn.datasets.load_dataset('iris')
  x_train, x_test, y_train, y_test = cross_validation.train_test_split(
      iris.data, iris.target, test_size=0.2, random_state=42)

  # Note that we are saving and loading iris data in h5 format as a simple
  # demonstration here.
  h5f = h5py.File('/tmp/test_hdf5.h5', 'w')
  h5f.create_dataset('X_train', data=x_train)
  h5f.create_dataset('X_test', data=x_test)
  h5f.create_dataset('y_train', data=y_train)
  h5f.create_dataset('y_test', data=y_test)
  h5f.close()

  h5f = h5py.File('/tmp/test_hdf5.h5', 'r')
  x_train = np.array(h5f['X_train'])
  x_test = np.array(h5f['X_test'])
  y_train = np.array(h5f['y_train'])
  y_test = np.array(h5f['y_test'])

  # Build 3 layer DNN with 10, 20, 10 units respectively.
  feature_columns = learn.infer_real_valued_columns_from_input(x_train)
  classifier = learn.DNNClassifier(
      feature_columns=feature_columns, hidden_units=[10, 20, 10], n_classes=3)

  # Fit and predict.
  classifier.fit(x_train, y_train, steps=200)
  score = metrics.accuracy_score(y_test, classifier.predict(x_test))
  print('Accuracy: {0:f}'.format(score)) 
Example 54
Project: deep_image_model   Author: tobegit3hub   File: iris_custom_model.py    Apache License 2.0 5 votes
def main(unused_argv):
  iris = datasets.load_iris()
  x_train, x_test, y_train, y_test = cross_validation.train_test_split(
      iris.data, iris.target, test_size=0.2, random_state=42)

  classifier = learn.Estimator(model_fn=my_model)
  classifier.fit(x_train, y_train, steps=1000)

  y_predicted = [
      p['class'] for p in classifier.predict(x_test, as_iterable=True)]
  score = metrics.accuracy_score(y_test, y_predicted)
  print('Accuracy: {0:f}'.format(score)) 
Example 55
Project: deep_segment   Author: JoshuaEbenezer   File: ISIC_dataset.py    GNU General Public License v3.0 5 votes
def train_val_split(train_list, train_labels, seed, val_split = 0.20):
    train_list, val_list, train_label, val_label = train_test_split(train_list, train_labels, test_size=val_split, stratify=train_labels, random_state=seed)
    return train_list, val_list, train_label, val_label 
Example 56
Project: sonic_contest   Author: flyyufelix   File: train_level_classifier.py    MIT License 5 votes
def train_covnet(nb_epoch=3, size=(224,224)):

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True # pylint: disable=E1101

    img_rows, img_cols = size
    batch_size = 16
    random_state = random_seed
    num_class = 27
    color_type = 1

    train_data, train_target, train_target_vec, train_id = read_and_normalize_and_shuffle_train_data(img_rows, img_cols, color_type,random_seed)

    X_train, X_valid, Y_train, Y_valid = train_test_split(train_data, train_target, test_size=0.1)

    model = resnet50_model(img_rows, img_cols, color_type, num_class)

    model.fit(
        X_train,
        Y_train,
        epochs=nb_epoch,
        batch_size=batch_size,
        validation_data=(X_valid, Y_valid)
    )

    save_model(model, 'level_classifier', save_weights=True) 
Example 57
Project: SANS_THIR16   Author: endgameinc   File: dga_classifier.py    MIT License 5 votes
def cross_validate(fts, labels, clf, nfolds):
    scores = []
    true_labels = []
    for fold in range(nfolds):
        X_train, X_test, y_train, y_test = train_test_split(fts, labels, test_size=.2)
        clf.fit(X_train, y_train)

        scores.append(clf.predict_proba(X_test)[:,1])
        true_labels.append(y_test)
    ret = {}
    ret['fpr'], ret['tpr'], ret['thr'] = roc_curve(np.array(true_labels).ravel(), np.array(scores).ravel())
    ret['auc'] = auc(ret['fpr'], ret['tpr'])
    print(ret['auc'])
    return ret 
Example 58
Project: BRISE   Author: dpukhkaiev   File: regression.py    MIT License 5 votes
def __init__(self, file_name, train_size, target, features, indices):
        del self.dict[:]
        del self.indices[:]
        for i in indices:
            self.indices.append(i)

        subset_target = []
        subset_features = []
        for i in self.indices:
            subset_target.append(target[i])
            subset_features.append(features[i])

        self.train_size = train_size

        '''
        kf = cross_validation.KFold(n=len(data), n_folds=10, shuffle=True )
        for train_index, test_index in kf:
            for i in train_index:
                self.feature_train.append(features[i])
                self.target_train.append(target[i])
            for i in test_index:
                self.feature_test.append(features[i])
                self.target_test.append(target[i])
        '''
        # print subset_target
        # print "***************"
        # print subset_features
        self.feature_train, self.feature_test, self.target_train, self.target_test = \
        cross_validation.train_test_split(subset_features, subset_target, train_size=train_size)
        old_indices = []
        return 
Example 59
Project: samples   Author: tsaqib   File: categorical_dnn.py    MIT License 5 votes
def _shuffle_split(self):
        # iloc returns a new frame, so assign the shuffled rows back
        self._raw_data = self._raw_data.iloc[np.random.permutation(len(self._raw_data))]
        self._testdata, self._traindata = train_test_split(self._raw_data, test_size=self._training_size)

        # TF Learn / TensorFlow only takes int32 / int64 at the moment as oppose to int8
        self._train_label = [int(row) for row in self._traindata[self._raw_data.columns[self._datadim - 1]]]
        self._test_label = [int(row) for row in self._testdata[self._raw_data.columns[self._datadim - 1]]]
        self._traindata = self._traindata.ix[:, range(self._datadim - 1)]
        self._testdata = self._testdata.ix[:, range(self._datadim - 1)] 
Example 60
Project: frankenstein   Author: hunterowens   File: test.py    Apache License 2.0 5 votes vote down vote up
def train(classifier, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)
    classifier.fit(X_train, y_train)
    print "Accuracy: %s" % classifier.score(X_test, y_test)
    return classifier 
Example 62
Project: Recipes   Author: Lasagne   File: utils.py    MIT License 4 votes vote down vote up
def load_pickle_data_cv():
    fo_1 = open('data/cifar-10-batches-py/data_batch_1', 'rb')
    fo_2 = open('data/cifar-10-batches-py/data_batch_2', 'rb')
    fo_3 = open('data/cifar-10-batches-py/data_batch_3', 'rb')
    fo_4 = open('data/cifar-10-batches-py/data_batch_4', 'rb')
    fo_5 = open('data/cifar-10-batches-py/data_batch_5', 'rb')
    dict_1 = cPickle.load(fo_1)
    fo_1.close()
    dict_2 = cPickle.load(fo_2)
    fo_2.close()
    dict_3 = cPickle.load(fo_3)
    fo_3.close()
    dict_4 = cPickle.load(fo_4)
    fo_4.close()
    dict_5 = cPickle.load(fo_5)
    fo_5.close()
    data_1 = dict_1['data']
    data_2 = dict_2['data']
    data_3 = dict_3['data']
    data_4 = dict_4['data']
    data_5 = dict_5['data']
    labels_1 = dict_1['labels']
    labels_2 = dict_2['labels']
    labels_3 = dict_3['labels']
    labels_4 = dict_4['labels']
    labels_5 = dict_5['labels']

    X_train = np.vstack((data_1, data_2, data_3, data_4, data_5))
    y_train = np.hstack((labels_1, labels_2, labels_3, labels_4, labels_5)).astype('int32')

    X_train, y_train = shuffle(X_train, y_train)

    X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.1)

    X_train = X_train.reshape(X_train.shape[0], 3, PIXELS, PIXELS).astype('float32')
    X_test = X_test.reshape(X_test.shape[0], 3, PIXELS, PIXELS).astype('float32')

    # subtract per-pixel mean
    pixel_mean = np.mean(X_train, axis=0)
    print pixel_mean
    np.save('data/pixel_mean.npy', pixel_mean)
    X_train -= pixel_mean
    X_test -= pixel_mean

    return X_train, X_test, y_train, y_test 
Example 63
Project: pohmm-keystroke   Author: vmonaco   File: plotting.py    BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
def gen_roc():
    from sklearn import svm, datasets
    from sklearn.metrics import roc_curve, auc
    from sklearn.cross_validation import train_test_split
    from sklearn.preprocessing import label_binarize
    from sklearn.multiclass import OneVsRestClassifier

    # Import some data to play with
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target

    # Binarize the output
    y = label_binarize(y, classes=[0, 1, 2])
    n_classes = y.shape[1]

    # Add noisy features to make the problem harder
    random_state = np.random.RandomState(0)
    n_samples, n_features = X.shape
    X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]

    # shuffle and split training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5,
                                                        random_state=0)

    # Learn to predict each class against the other
    classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True,
                                             random_state=random_state))
    y_score = classifier.fit(X_train, y_train).decision_function(X_test)

    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    thresh = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], thresh[i] = roc_curve(y_test[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    far = fpr[0]
    frr = 1 - tpr[0]
    roc = pd.DataFrame({'threshold': thresh[0], 'far': far, 'frr': frr})
    roc['threshold'] = (roc['threshold'] - roc['threshold'].min()) / (roc['threshold'].max() - roc['threshold'].min())
    return roc 
Example 64
Project: robot-navigation   Author: ronaldahmed   File: utils.py    MIT License 4 votes vote down vote up
def get_folds_vDev(dir='data/',  val=0.1, force=False):
	pickle_file = 'folds_vDev.pickle'
	filename = os.path.join(dir,pickle_file)
	folds = []
	if force or not os.path.exists(filename):
		# Make pickle object
		dataByMap = get_data()
		map_names = dataByMap.keys()
		n_names = len(map_names)
		# Iteration over folds
		for i in range(n_names):
			# reset arrays
			train_set = []
			valid_set = []
			complete_set = []	# for universal vocab
			#
			test_single_set = dataByMap[map_names[i]].samples
			test_multi_set  = dataByMap[map_names[i]].get_multi_sentence_samples()
			for j in range(n_names):
				if j != i:
					# shuffle data before splitting
					data = np.array(dataByMap[map_names[j]].samples)	# shuffle in separate array, preserve order for multi_sentence building
					np.random.shuffle(data)
					# split into training and validation sets
					train_samples,valid_samples = train_test_split(	data,
																	test_size=val,
																	random_state = SEED)
					train_set.extend(train_samples)
					valid_set.extend(valid_samples)
					complete_set.extend(data)
			# Reformat to word index
			#vocabulary = getVocabulary(train_set)
			vocabulary = getVocabulary(complete_set) # universal vocabulary
			train_set 			= reformat_wordid(train_set		,vocabulary)
			valid_set 			= reformat_wordid(valid_set		,vocabulary)
			test_single_set 	= reformat_wordid(test_single_set,vocabulary)
			#   for multi sentences
			temp = []
			for parag in test_multi_set:
				temp.append(reformat_wordid(parag,vocabulary))
			test_multi_set = temp
			# shuffle between maps
			np.random.shuffle(train_set)
			np.random.shuffle(valid_set)
			np.random.shuffle(test_single_set)
			np.random.shuffle(test_multi_set)
			#END-FOR-TRAIN-VAL-SPLIT
			folds.append( Fold(train_set,valid_set,test_single_set,test_multi_set,vocabulary) )
		#END-FOR-FOLDS
		print('Pickling %s.' % filename)
		try:
			with open(filename, 'wb') as f:
				pickle.dump(folds, f, pickle.HIGHEST_PROTOCOL)
		except Exception as e:
			print('Unable to save data to', filename, ':', e)
	else:
		with open(filename, 'rb') as f:
			folds = pickle.load(f)
			print('%s read from pickle...' % filename)
	return folds 
Example 65
Project: Botnets   Author: sabersf   File: botnet_tf.py    MIT License 4 votes vote down vote up
def create_inp_two_sectors(first, second):
    #first sector and the second sector
    first_ent = sector_columns(first)
    second_ent = sector_columns(second)
    #we have to keep track of the items we picked so we don't pick them again
    first_flag = [False for i in range(len(first_ent))]
    second_flag = [False for i in range(len(second_ent))]
    #Min number of items we have for both entities
    #We want to pick the same number from both sectors
    min_num = min(len(first_ent), len(second_ent))
    #keep count on the number of items we picked from each sector
    count_first = 0
    count_second = 0
    #Input and output for the DNN algorithms we want to use later
    Y = []
    X = []
    #Continue this loop until we have the same number from both sectors
    while (count_first + count_second)  < (min_num*2):
        #pick a random sector: first or second
        priority = random.randint(0,1)
        if priority == 0:
            #The first sector it is! we have to set Y to 0
            if count_first >= min_num:
                continue
            else:
                found = False
                while(found == False):
                    i = random.randint(0,len(first_ent) - 1)
                    if first_flag[i] == False:
                        found = True
                        first_flag[i] = True
                        X.append(first_ent[i])
                Y.append([0])
                count_first += 1
        else:
            #The second sector is picked! We have to set Y to 1
            if count_second >= min_num:
                continue
            else:
                found = False
                while(found == False):
                    i = random.randint(0,len(second_ent) - 1)
                    if second_flag[i] == False:
                        found = True
                        second_flag[i] = True
                        X.append(second_ent[i])
                Y.append([1])
                count_second += 1
    X = np.array(X)
    Y = np.array(Y)
    return train_test_split(X, Y, test_size=0.1, random_state=42)


# In[180]:

#Define a dnn model using TFlearn 
Example 66
Project: HDLTex   Author: kk7nc   File: Data_helper.py    MIT License 4 votes vote down vote up
def loadData():
    WOS.download_and_extract()
    fname = os.path.join(path_WOS,"WebOfScience/WOS5736/X.txt")
    fnamek = os.path.join(path_WOS,"WebOfScience/WOS5736/YL1.txt")
    fnameL2 = os.path.join(path_WOS,"WebOfScience/WOS5736/YL2.txt")
    with open(fname) as f:
        content = f.readlines()
        content = [text_cleaner(x) for x in content]
    with open(fnamek) as fk:
        contentk = fk.readlines()
    contentk = [x.strip() for x in contentk]
    with open(fnameL2) as fk:
        contentL2 = fk.readlines()
        contentL2 = [x.strip() for x in contentL2]
    Label = np.matrix(contentk, dtype=int)
    Label = np.transpose(Label)
    number_of_classes_L1 = np.max(Label)+1  # number of classes in Level 1

    Label_L2 = np.matrix(contentL2, dtype=int)
    Label_L2 = np.transpose(Label_L2)
    np.random.seed(7)
    print(Label.shape)
    print(Label_L2.shape)
    Label = np.column_stack((Label, Label_L2))

    number_of_classes_L2 = np.zeros(number_of_classes_L1,dtype=int)

    X_train, X_test, y_train, y_test  = train_test_split(content, Label, test_size=0.2,random_state= 0)

    vectorizer_x = CountVectorizer()
    X_train = vectorizer_x.fit_transform(X_train).toarray()
    X_test = vectorizer_x.transform(X_test).toarray()

    L2_Train = []
    L2_Test = []
    content_L2_Train = []
    content_L2_Test = []

    for i in range(0, number_of_classes_L1):
        L2_Train.append([])
        L2_Test.append([])
        content_L2_Train.append([])
        content_L2_Test.append([])


    for i in range(0, X_train.shape[0]):
        L2_Train[y_train[i, 0]].append(y_train[i, 1])
        number_of_classes_L2[y_train[i, 0]] = max(number_of_classes_L2[y_train[i, 0]],(y_train[i, 1]+1))
        content_L2_Train[y_train[i, 0]].append(X_train[i])

    for i in range(0, X_test.shape[0]):
        L2_Test[y_test[i, 0]].append(y_test[i, 1])
        content_L2_Test[y_test[i, 0]].append(X_test[i])

    for i in range(0, number_of_classes_L1):
        L2_Train[i] = np.array(L2_Train[i])
        L2_Test[i] = np.array(L2_Test[i])
        content_L2_Train[i] = np.array(content_L2_Train[i])
        content_L2_Test[i] = np.array(content_L2_Test[i])
    return (X_train,y_train,X_test,y_test,content_L2_Train,L2_Train,content_L2_Test,L2_Test,number_of_classes_L2) 
Example 67
Project: fairtest   Author: columbia   File: holdout.py    Apache License 2.0 4 votes vote down vote up
def __init__(self, data, budget=1, conf=0.95, train_size=0.5,
                 random_state=0):
        """
        Prepares a dataset for FairTest investigations. Encodes categorical
        features as numbers and separates the data into a training set and a
        holdout set.

        Parameters
        ----------
        data :
            the dataset to use
        budget :
            the maximal number of adaptive investigations that will be performed
        conf :
            overall family-wide confidence
        train_size :
            the number (or fraction) of data samples to use as a training set
        random_state :
            a random seed to be used for the random train-test split
        """
        if data is not None:
            if not isinstance(data, pd.DataFrame):
                raise ValueError('data should be a Pandas DataFrame')

            data = data.copy()

            if budget < 1:
                raise ValueError("budget parameter should be a positive "
                                 "integer")

            if not 0 < conf < 1:
                raise ValueError('conf should be in (0,1), Got %s' % conf)

            # encode categorical features
            encoders = {}
            for column in data.columns:
                if data.dtypes[column] == np.object:
                    encoders[column] = LabelEncoder()
                    data[column] = encoders[column].fit_transform(data[column])
                    logging.info('Encoding Feature %s' % column)

            train_data, test_data = cv_split(data, train_size=train_size,
                                             random_state=random_state)

            logging.info('Training Size %d' % len(train_data))

            holdout = Holdout(test_data, budget, conf)

            self.train_data = train_data
            self.holdout = holdout
            self.encoders = encoders 
Example 68
Project: Flu-Prediction   Author: RK900   File: Flu-Tree.py    GNU General Public License v3.0 4 votes vote down vote up
def predictFluSeq(seqs): # Seqs is the file path of your FASTA files
    #returns cross-val scores and MSE
    X0 = []

    # adding to X and y

    for i in range(0, len(seqs) - 1):
        X0.append(seqs[i].seq)

    y0 = []
    for j in range(1, len(seqs)):
        y0.append(seqs[j].seq)

    from Encoding_v2 import encoding

    # Encoding letters into numbers

    X = []
    for k in range(len(X0)):
        encoded_X = encoding(X0[k])
        X.append(encoded_X)

    y = []
    for l in range(len(y0)):
        encoded_y = encoding(y0[l])
        y.append(encoded_y)

    from sklearn import ensemble, cross_validation, metrics

    # Cross-Validation
    rfr = ensemble.RandomForestRegressor()
    rfrscores = cross_validation.cross_val_score(rfr, X, y, cv=2)

    cv_score = ("Random Forests cross-validation score", rfrscores)
    avg_cv_score = ("Average Cross-Val Accuracy: %0.2f (+/- %0.2f)" % (rfrscores.mean()*100, rfrscores.std() *100))

    # Mean Squared Error
    X_train,X_test,y_train,y_test = cross_validation.train_test_split(X,y,test_size=0.5,random_state=50)

    rfr.fit(X_train,y_train)
    y_predicted = rfr.predict(X_test)
    mse_score = ('Random Forests MSE:', metrics.mean_squared_error(y_test,y_predicted))

    return cv_score, avg_cv_score, mse_score 
Example 69
Project: message-author-classifier   Author: IvayloAtanasov   File: vectorize.py    MIT License 4 votes vote down vote up
def vectorize_and_get_classifier(trainset_limit=0):
    authors = pickle.load(open(os.path.join(BASE_PATH, 'authors.pkl'), 'rb'))
    messages = pickle.load(open(os.path.join(BASE_PATH, 'messages.pkl'), 'rb'))

    # print messages
    #pp = pprint.PrettyPrinter(indent=4)
    #pp.pprint(messages)

    # split into testing and training sets
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(messages, authors, test_size=0.1, random_state=42)

    bulgarian_stopwords = stopwords.words('bulgarian')

    # build tf-idf vectorizer
    #   ignore bulgarian stopwords
    #   ignore words with document frequency > 0.5
    # TODO: almost no words that are frequent through our dataset. max_df=0.01 barely has effect :) is it a problem?
    vectorizer = TfidfVectorizer(stop_words=bulgarian_stopwords, max_df=0.5)
    # build tf-idf matrix
    # ref: https://en.wikipedia.org/wiki/Tf%E2%80%93idf
    # learn vocabulary on training set, and return training matrix
    features_train = vectorizer.fit_transform(features_train)
    # return testing matrix with the vocabulary learned from the training set
    features_test = vectorizer.transform(features_test).toarray()

    # limit data volume, useful for development
    if trainset_limit != 0:
        features_train = features_train[:trainset_limit].toarray()
        labels_train = labels_train[:trainset_limit]

    # print tf-idf matrix length
    print('tf-idf matrix length: ' + str(len(vectorizer.get_feature_names())))

    # use DT classifier
    clf = DecisionTreeClassifier()
    clf.fit(features_train, labels_train)

    # print classifier accuracy
    print('dt accuracy: ' + str(clf.score(features_test, labels_test)))
    # find most important word index in list
    most_important_feature_index = numpy.argmax(clf.feature_importances_)
    # print the most important feature and its importance coefficient
    feature = vectorizer.get_feature_names()[most_important_feature_index]
    feature_importance = clf.feature_importances_[most_important_feature_index]
    print('most important feature: ' + feature + ' with importance index of ' + str(feature_importance))

    return clf, vectorizer 
Example 70
Project: Identificador-Fraude-Enron   Author: luisneto98   File: email_preprocess.py    MIT License 4 votes vote down vote up
def preprocess(words_file = "../tools/word_data.pkl", authors_file="../tools/email_authors.pkl"):
    """ 
        this function takes a pre-made list of email texts (by default word_data.pkl)
        and the corresponding authors (by default email_authors.pkl) and performs
        a number of preprocessing steps:
            -- splits into training/testing sets (10% testing)
            -- vectorizes into tfidf matrix
            -- selects/keeps most helpful features

        after this, the features and labels are put into numpy arrays, which play nicely with sklearn functions

        4 objects are returned:
            -- training/testing features
            -- training/testing labels

    """

    ### the words (features) and authors (labels), already largely preprocessed
    ### this preprocessing will be repeated in the text learning mini-project
    authors_file_handler = open(authors_file, "r")
    authors = pickle.load(authors_file_handler)
    authors_file_handler.close()

    words_file_handler = open(words_file, "r")
    word_data = cPickle.load(words_file_handler)
    words_file_handler.close()

    ### test_size is the percentage of events assigned to the test set
    ### (remainder go into training)
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(word_data, authors, test_size=0.1, random_state=42)



    ### text vectorization--go from strings to lists of numbers
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed  = vectorizer.transform(features_test)



    ### feature selection, because text is super high dimensional and 
    ### can be really computationally chewy as a result
    selector = SelectPercentile(f_classif, percentile=1)
    selector.fit(features_train_transformed, labels_train)
    features_train_transformed = selector.transform(features_train_transformed).toarray()
    features_test_transformed  = selector.transform(features_test_transformed).toarray()

    ### info on the data
    print ("no. of Chris training emails:", sum(labels_train))
    print ("no. of Sara training emails:", len(labels_train)-sum(labels_train))
    
    return features_train_transformed, features_test_transformed, labels_train, labels_test 
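
A usage sketch for the function above; the default pickle paths are the project's, and the classifier here is only illustrative:

from sklearn.naive_bayes import GaussianNB

features_train, features_test, labels_train, labels_test = preprocess()
clf = GaussianNB()
clf.fit(features_train, labels_train)
print("test accuracy:", clf.score(features_test, labels_test))
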
Example 71
Project: kaggle_dsb   Author: syagev   File: data.py    Apache License 2.0 4 votes vote down vote up
def load():

    tps = glob.glob(dataset_dir+"/*true.jpg")
    fps_2 = glob.glob(dataset_dir+"/*false.jpg")
    fps = np.random.choice(fps_2,10000)
    images_tps = [[imread(x)] for x in tps]
    images_fps = [[imread(x)] for x in fps]
    labels = np.concatenate((np.ones((len(images_tps))),np.zeros((len(images_fps))))).astype("ubyte")
    images = np.concatenate((images_tps,images_fps)).astype("float32")
    train_X, test_X, train_y, test_y = train_test_split(images,labels, test_size=0.4, random_state=1337)
    half = int(0.5*len(test_X))
    val_X = test_X[:half]
    val_y = test_y[:half]
    test_X = test_X[half:]
    test_y = test_y[half:]
    label_to_names = {0:"false",1:"true"}

    # training set, batches 1-4
    # train_X = np.zeros((40000, 3, 32, 32), dtype="float32")
    # train_y = np.zeros((40000, 1), dtype="ubyte").flatten()
    # n_samples = 10000 # number of samples per batch
    # for i in range(0,4):
    #     f = open(os.path.join(dataset_dir, "data_batch_"+str(i+1)+""), "rb")
    #     cifar_batch = pickle.load(f)
    #     f.close()
    #     train_X[i*n_samples:(i+1)*n_samples] = (cifar_batch['data'].reshape(-1, 3, 32, 32) / 255.).astype("float32")
    #     train_y[i*n_samples:(i+1)*n_samples] = np.array(cifar_batch['labels'], dtype='ubyte')
    #
    # # validation set, batch 5
    # f = open(os.path.join(dataset_dir, "data_batch_5"), "rb")
    # cifar_batch_5 = pickle.load(f)
    # f.close()
    # val_X = (cifar_batch_5['data'].reshape(-1, 3, 32, 32) / 255.).astype("float32")
    # val_y = np.array(cifar_batch_5['labels'], dtype='ubyte')
    #
    # # labels
    # f = open(os.path.join(dataset_dir, "batches.meta"), "rb")
    # cifar_dict = pickle.load(f)
    # label_to_names = {k:v for k, v in zip(range(10), cifar_dict['label_names'])}
    # f.close()
    #
    # # test set
    # f = open(os.path.join(dataset_dir, "test_batch"), "rb")
    # cifar_test = pickle.load(f)
    # f.close()
    # test_X = (cifar_test['data'].reshape(-1, 3, 32, 32) / 255.).astype("float32")
    # test_y = np.array(cifar_test['labels'], dtype='ubyte')
    #
    #
    # print("training set size: data = {}, labels = {}".format(train_X.shape, train_y.shape))
    # print("validation set size: data = {}, labels = {}".format(val_X.shape, val_y.shape))
    # print("test set size: data = {}, labels = {}".format(test_X.shape, test_y.shape))
    #
    return train_X, train_y, val_X, val_y, test_X, test_y, label_to_names 
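
The 60/20/20 train/validation/test partition built above can also be produced with two chained train_test_split calls, which avoids slicing with a computed midpoint; a sketch using the same variable names:

train_X, rest_X, train_y, rest_y = train_test_split(images, labels,
                                                    test_size=0.4, random_state=1337)
val_X, test_X, val_y, test_y = train_test_split(rest_X, rest_y,
                                                test_size=0.5, random_state=1337)
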
Example 72
Project: patient-viz   Author: nyuvis   File: train.py    MIT License 4 votes vote down vote up
def buildmodel(cohort, model, validPercentage, seed, modeloutput, overwrite):
    trainsety, trainsetx, testsety, testsetx, header = parsedata(cohort)

    if model == 'reg':
        # c_list can come from a config file eventually.
        c_list = [0.01, 0.1, 1, 10, 100]
        total = int(np.floor(100.0/validPercentage))
        score_array = np.zeros((total, len(c_list)), dtype='float')
        for run_ix in range(0,total):
            X_train, X_valid, y_train, y_valid = cross_validation.train_test_split(trainsetx, trainsety, test_size=(validPercentage/100.0), random_state=seed+run_ix)
            for (c_ix, c) in enumerate(c_list):
                #more parametrization of model can come from some config file eventually.
                model_c  = linear_model.LogisticRegression(penalty='l1', C=c, fit_intercept=True, class_weight='auto')
                model_c.fit(X_train,y_train)
                Ypred_valid = model_c.predict_proba(X_valid)
                # evaluation metric could come from a config file eventually. AUC is commonly used, so we use it here
                fprs, tprs, thresholdss = roc_curve(y_valid, Ypred_valid[:,1])
                score_c = auc(fprs,tprs)
                score_array [run_ix, c_ix] = score_c

        mean_scores = score_array.mean(axis=0)
        mean_scores_ix = np.argmax(mean_scores)
        best_c = c_list[mean_scores_ix]
        #now train on the entire train set, using best c:
        model_best_c  = linear_model.LogisticRegression(penalty='l1', C=best_c, fit_intercept=True, class_weight='auto')
        model_best_c.fit(trainsetx,trainsety)
        #----
        Ypred_test = model_best_c.predict_proba(testsetx)
        fprs, tprs, thresholdss = roc_curve(testsety, Ypred_test[:,1])
        Ypred_train = model_best_c.predict_proba(trainsetx)
        fprt, tprt, thresholdst = roc_curve(trainsety, Ypred_train[:,1])
        print('score on unseen test set is: ', auc(fprs,tprs), file=sys.stderr)
        print('training score on this set was: ', auc(fprt,tprt), file=sys.stderr)
        print("best average score during cross validation was:", mean_scores[mean_scores_ix], "with c =", best_c, file=sys.stderr)
        #----
        print('saving the model in directory: ', modeloutput, file=sys.stderr)
        if not os.path.exists(modeloutput):
            os.makedirs(modeloutput)
        save_name = getsavefile(modeloutput + "/reg_model_scklearn", ".pkl", overwrite)
        cPickle.dump(model_best_c, open(save_name, 'wb'), -1)
        save_name = getsavefile(modeloutput + "/reg_model_weights", ".txt", overwrite)
        np.savetxt(save_name, model_best_c.coef_, delimiter=',', header=','.join(header), comments='')
        save_name = getsavefile(modeloutput + "/reg_model_bias", ".txt", overwrite)
        np.savetxt(save_name, model_best_c.intercept_)
    elif model == 'SVM' or model == 'randForest':
        print('{0} model not implemented yet'.format(model), file=sys.stderr)
        exit(1)
    else:
        print('unknown model {0}'.format(model), file=sys.stderr)
        exit(1) 
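
The nested loops above amount to a repeated-random-split search over c_list scored by AUC. With the same legacy API this could also be expressed with GridSearchCV and ShuffleSplit; a sketch, assuming trainsetx, trainsety, c_list, total, validPercentage and seed as defined in the function:

from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import ShuffleSplit
from sklearn import linear_model

cv = ShuffleSplit(len(trainsety), n_iter=total,
                  test_size=validPercentage / 100.0, random_state=seed)
search = GridSearchCV(linear_model.LogisticRegression(penalty='l1', class_weight='auto'),
                      param_grid={'C': c_list}, scoring='roc_auc', cv=cv)
search.fit(trainsetx, trainsety)
best_c = search.best_params_['C']
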
Example 73
Project: convenience_py   Author: ronrest   File: csv2train_test.py    Apache License 2.0 4 votes vote down vote up
def csv2train_test(file, y_col=None, test=0.3,
           sep=",",
           skip_header=0, skip_footer=0,
           missing_values={"NA", "NAN", "N/A"},
           filling_values=np.nan,
           seed=None):
    """
    Takes a csv file and creates a tuple of arrays containing the data.
        X_train, Y_train, X_test, Y_test
    or if no y_col is specified, then:
        X_train, X_test

    NOTE: The rows are automatically shuffled.

    :param file: {string}
        file path to the csv file
    :param y_col: {int}(default=None)
        The column in the data containing the output labels. If this file
        doesn't contain any output labels, then use None.
    :param test: {float greater than 0.0 and less than 1.0}(default=0.3)
        proportion of the data to assign to the test set.
    :param sep: {str}(default=",")
        delimiter used to separate columns.
    :param skip_header: {int}(default=0)
        Skip this many rows from the top
    :param skip_footer: {int}(default=0)
        Skip this many rows from the end.
    :param missing_values: {set of strings} (default={"NA", "NAN", "N/A"})
        The set of characters to recognise as missing values
    :param filling_values: (default = np.nan)
        what to replace missing values with.
    :param seed: {int or None}(default = None)
        Set the random seed if you want reproducible results
    :return: {tuple of numpy arrays}
        If a y_col is specified, then it returns
            X_train, Y_train, X_test, Y_test
        Otherwise it returns:
            X_train, X_test
    """
    # ==========================================================================
    data = csv2arrays(file=file, y_col=y_col, shuffle=False,
               sep=sep,
               skip_header=skip_header, skip_footer=skip_footer,
               missing_values=missing_values,
               filling_values=filling_values,
               seed=seed)

    if y_col is None:
        return train_test_split(data, test_size=test, random_state=seed)
    else:
        X_train, X_test, \
        Y_train, Y_test = train_test_split(data[0], data[1], test_size=test,
                                           random_state=seed)
        return X_train, Y_train, X_test, Y_test 
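
A usage sketch; the file name and label column below are hypothetical:

# assume a CSV whose last column (index 4) holds the labels
X_train, Y_train, X_test, Y_test = csv2train_test("iris.csv", y_col=4,
                                                  test=0.25, seed=42)
print(X_train.shape, X_test.shape)
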
Example 74
Project: luna16   Author: gzuidhof   File: data.py    BSD 2-Clause "Simplified" License 4 votes vote down vote up
def load():

    tps = glob.glob(dataset_dir+"/*true.jpg")
    fps_2 = glob.glob(dataset_dir+"/*false.jpg")
    fps = np.random.choice(fps_2,10000)
    images_tps = [[imread(x)] for x in tps]
    images_fps = [[imread(x)] for x in fps]
    labels = np.concatenate((np.ones((len(images_tps))),np.zeros((len(images_fps))))).astype("ubyte")
    images = np.concatenate((images_tps,images_fps)).astype("float32")
    train_X, test_X, train_y, test_y = train_test_split(images,labels, test_size=0.4, random_state=1337)
    half = int(0.5*len(test_X))
    val_X = test_X[:half]
    val_y = test_y[:half]
    test_X = test_X[half:]
    test_y = test_y[half:]
    label_to_names = {0:"false",1:"true"}

    # training set, batches 1-4
    # train_X = np.zeros((40000, 3, 32, 32), dtype="float32")
    # train_y = np.zeros((40000, 1), dtype="ubyte").flatten()
    # n_samples = 10000 # number of samples per batch
    # for i in range(0,4):
    #     f = open(os.path.join(dataset_dir, "data_batch_"+str(i+1)+""), "rb")
    #     cifar_batch = pickle.load(f)
    #     f.close()
    #     train_X[i*n_samples:(i+1)*n_samples] = (cifar_batch['data'].reshape(-1, 3, 32, 32) / 255.).astype("float32")
    #     train_y[i*n_samples:(i+1)*n_samples] = np.array(cifar_batch['labels'], dtype='ubyte')
    #
    # # validation set, batch 5
    # f = open(os.path.join(dataset_dir, "data_batch_5"), "rb")
    # cifar_batch_5 = pickle.load(f)
    # f.close()
    # val_X = (cifar_batch_5['data'].reshape(-1, 3, 32, 32) / 255.).astype("float32")
    # val_y = np.array(cifar_batch_5['labels'], dtype='ubyte')
    #
    # # labels
    # f = open(os.path.join(dataset_dir, "batches.meta"), "rb")
    # cifar_dict = pickle.load(f)
    # label_to_names = {k:v for k, v in zip(range(10), cifar_dict['label_names'])}
    # f.close()
    #
    # # test set
    # f = open(os.path.join(dataset_dir, "test_batch"), "rb")
    # cifar_test = pickle.load(f)
    # f.close()
    # test_X = (cifar_test['data'].reshape(-1, 3, 32, 32) / 255.).astype("float32")
    # test_y = np.array(cifar_test['labels'], dtype='ubyte')
    #
    #
    # print("training set size: data = {}, labels = {}".format(train_X.shape, train_y.shape))
    # print("validation set size: data = {}, labels = {}".format(val_X.shape, val_y.shape))
    # print("test set size: data = {}, labels = {}".format(test_X.shape, test_y.shape))
    #
    return train_X, train_y, val_X, val_y, test_X, test_y, label_to_names 
Example 75
Project: keras-cnn-text-classify   Author: GINK03   File: model.py    MIT License 4 votes vote down vote up
def init_train():
  print('Loading data')
  Xs = []
  Ys = []
  voc = {}
  maxlen = 0
  maxwords = 0
  buff = set()
  TARGET_DIR = 'data/*'
  idx_name = {}
  for i, filename in enumerate(glob(TARGET_DIR)):
    idx_name[i] = filename
    for line in open(filename).read().split('\n'):
       a = list(line)
       maxlen = max(maxlen, len(a))
       [buff.add(w) for w in a]
  maxwords = len(buff)
  voc[maxwords] = '___MAX___'
  voc['___META_MAXWORD___'] = maxwords
  voc['___META_MAXLEN___'] = maxlen
  print("maxwords %d"%maxwords)
  print("maxlen %d"%maxlen)
  print("idx name len %d"%len(idx_name))
  for i, filename in enumerate(glob(TARGET_DIR)):
    for line in set(filter(lambda x:x!='', open(filename).read().split('\n'))):
      X = [maxwords]*maxlen
      line = line.strip()
      for idx, ch in enumerate(list(line)):
        if voc.get(ch) == None:
          voc[ch] = len(voc)
        convert = voc[ch]
        X[idx] = convert
      Xs.append(X)
      y = [0.]*len(idx_name)
      y[i] = 1.
      Ys.append(y)
  X_train, X_test, y_train, y_test = train_test_split( Xs, Ys, test_size=0.1, random_state=42)
  open('vod.pkl', 'wb').write(pickle.dumps(voc))
  open('idx_name.pkl', 'wb').write(pickle.dumps(idx_name))
  sequence_length = maxlen
  vocabulary_size = maxwords
  embedding_dim   = 256*1
  filter_sizes    = [3,4,5,1,2]
  num_filters     = 512*1
  drop            = 0.5

  nb_epoch   = 10
  batch_size = 30
  return sequence_length, embedding_dim, filter_sizes, vocabulary_size, num_filters, drop, idx_name, \
  	X_train, X_test, y_train, y_test, batch_size, nb_epoch, Xs, Ys 
Example 76
Project: RSV   Author: Healthcast   File: skeleton.py    GNU General Public License v2.0 4 votes vote down vote up
def test_performance():
    iris = datasets.load_iris()
    iris_X = iris.data
    iris_y = iris.target
    iris_y = iris_y/2

    #simple test
    X_train, X_test, y_train, y_test = train_test_split(iris_X, iris_y, test_size=.5, \
                                                        random_state=0)
    svc = svm.SVC(kernel='linear')
    scores = svc.fit(X_train, y_train).decision_function(X_test)
    r = svc.predict(X_test)

    #test performance methods: accuracy, confusion matrix, precision, recall
    #ROC, AUC, classification report
    
    #accuracy: number of correct predictions / number of all cases
    print "\ntest accuracy:"
    print metrics.accuracy_score(y_test, r)

    #precision: tp/(tp+fp)
    print "\ntest precision:"
    print metrics.precision_score(y_test, r)

    #recall: tp/(tp+fn)
    print "\ntest recall:"
    print metrics.recall_score(y_test, r)

    #confusion matrix:
    print "\nconfusion matrix:"
    print metrics.confusion_matrix(y_test, r)

    #test roc curve and auc
    fpr, tpr, thresholds = metrics.roc_curve(y_test, scores, pos_label=1)
    roc_auc = metrics.auc(fpr, tpr)
    print "\ntest auc : " + str(roc_auc)
    plt.figure()
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.show()

    #classification report
    print "\ntest callsification report:"
    print metrics.classification_report(y_test, r) 
Example 78
Project: SDLib   Author: Coder-Yu   File: SemiSAD.py    GNU General Public License v3.0 4 votes vote down vote up
def predict(self):
            ClassifierN = 0
            classifier = GaussianNB()
            X_train,X_test,y_train,y_test = train_test_split(self.training,self.trainingLabels,test_size=0.75,random_state=33)
            classifier.fit(X_train, y_train)
            # predict UnLabledData
            #pred_labelsForTrainingUn = classifier.predict(X_test)
            print 'Enhanced classifier...'
            while 1:
                if len(X_test)<=5: # min
                    break         #min
                proba_labelsForTrainingUn = classifier.predict_proba(X_test)
                X_test_labels = np.hstack((X_test, proba_labelsForTrainingUn))
                X_test_labels0_sort = sorted(X_test_labels,key=lambda x:x[5],reverse=True)
                if X_test_labels0_sort[4][5]>X_test_labels0_sort[4][6]:
                    a = map(lambda x: x[:5], X_test_labels0_sort)
                    b = a[0:5]
                    classifier.partial_fit(b, ['0','0','0','0','0'], classes=['0', '1'],sample_weight=np.ones(len(b), dtype=np.float) * self.Lambda)
                    X_test_labels = X_test_labels0_sort[5:]
                    X_test = a[5:]
                if len(X_test)<6: # min
                    break         #min

                X_test_labels0_sort = sorted(X_test_labels, key=lambda x: x[5], reverse=True)
                if X_test_labels0_sort[4][5]<=X_test_labels0_sort[4][6]: #min
                    a = map(lambda x: x[:5], X_test_labels0_sort)
                    b = a[0:5]
                    classifier.partial_fit(b, ['1', '1', '1', '1', '1'], classes=['0', '1'],sample_weight=np.ones(len(b), dtype=np.float) * 1)
                    X_test_labels = X_test_labels0_sort[5:]  # min
                    X_test = a[5:]
                if len(X_test)<6:
                    break
            # while 1 :
            #     p1 = pred_labelsForTrainingUn
            #     # fit the unlabeled data into the classifier with sample weight λ
            #     classifier.partial_fit(X_test, pred_labelsForTrainingUn,classes=['0','1'], sample_weight=np.ones(len(X_test),dtype=np.float)*self.Lambda)
            #     pred_labelsForTrainingUn = classifier.predict(X_test)
            #     p2 = pred_labelsForTrainingUn
            #     # check whether the classifier has stabilized
            #     if list(p1)==list(p2) :
            #         ClassifierN += 1
            #     elif ClassifierN > 0:
            #         ClassifierN = 0
            #     if ClassifierN == 20:
            #         break
            pred_labels = classifier.predict(self.test)
            print 'naive_bayes with EM algorithm:'
            return pred_labels 
Example 79
Project: Face_Recognition   Author: AkiraXD0712   File: training.py    Apache License 2.0 4 votes vote down vote up
def read(self, input_dir):
        images, labels, nb_classes = extract_data(input_dir)

        # shuffle and split data between train and test sets
        x_train, x_test, y_train, y_test = train_test_split(
            images,
            labels,
            test_size=0.3,
            random_state=random.randint(0, 100)
        )
        x_valid, x_test, y_valid, y_test = train_test_split(
            images,
            labels,
            test_size=0.5,
            random_state=random.randint(0, 100)
        )

        print('x_train shape:', x_train.shape)
        print(x_train.shape[0], 'train samples')
        print(x_valid.shape[0], 'valid samples')
        print(x_test.shape[0], 'test samples')

        # # convert class vectors to binary class matrices
        # y_train = np_utils.to_categorical(y_train, nb_classes)
        # y_valid = np_utils.to_categorical(y_valid, nb_classes)
        # y_test = np_utils.to_categorical(y_test, nb_classes)

        x_train = x_train.astype('float32')
        x_valid = x_valid.astype('float32')
        x_test = x_test.astype('float32')
        x_train /= 255
        x_valid /= 255
        x_test /= 255

        self.x_train = x_train
        self.x_valid = x_valid
        self.x_test = x_test
        self.y_train = y_train
        self.y_valid = y_valid
        self.y_test = y_test

        return nb_classes 
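
Both calls above resample from the full images/labels arrays, so the validation and test sets can overlap with the training set. A leakage-free alternative (a sketch, not the project's code) is to split the 30% held out by the first call:

        x_train, x_rest, y_train, y_rest = train_test_split(
            images, labels, test_size=0.3, random_state=random.randint(0, 100))
        x_valid, x_test, y_valid, y_test = train_test_split(
            x_rest, y_rest, test_size=0.5, random_state=random.randint(0, 100))
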
Example 80
Project: memory-networks   Author: suriyadeepan   File: data.py    GNU General Public License v3.0 4 votes vote down vote up
def fetch(task_id=1, batch_size=32):

    # task data
    train, test = load_task(datadir, task_id)
    data = train + test

    # metadata
    vocab = sorted(reduce(lambda x, y: x | y, (set(list(chain.from_iterable(s)) + q + a) for s, q, a in data)))
    word_idx = dict((c, i + 1) for i, c in enumerate(vocab))

    # sizes
    max_story_size = max(map(len, (s for s, _, _ in data)))
    mean_story_size = int(np.mean([ len(s) for s, _, _ in data ]))
    sentence_size = max(map(len, chain.from_iterable(s for s, _, _ in data)))
    query_size = max(map(len, (q for _, q, _ in data)))
    memory_size = min(50, max_story_size)
    vocab_size = len(word_idx) + 1 # +1 for nil word
    sentence_size = max(query_size, sentence_size) # for the position

    # train/validation/test sets
    S, Q, A = vectorize_data(train, word_idx, sentence_size, memory_size)
    trainS, valS, trainQ, valQ, trainA, valA = cross_validation.train_test_split(S, Q, A, test_size=.1, random_state=None)
    testS, testQ, testA = vectorize_data(test, word_idx, sentence_size, memory_size)

    # params
    n_train = trainS.shape[0]
    n_test = testS.shape[0]
    n_val = valS.shape[0]

    batches = zip(range(0, n_train-batch_size, batch_size), range(batch_size, n_train, batch_size))
    batches = [(start, end) for start, end in batches]

    data = {
        'trS' : trainS,
        'trQ' : trainQ,
        'trA' : trainA,
        'teS' : testS,
        'teQ' : testQ,
        'teA' : testA,
        'vaS' : valS,
        'vaQ' : valQ,
        'vaA' : valA,
        'batches' : batches
        }


    metadata = {
            'vocab_size' : vocab_size,
            'vocab' : vocab,
            'word_idx' : word_idx,
            'sentence_size' : sentence_size,
            'memory_size' : memory_size
            }

    return data, metadata
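
train_test_split accepts any number of equally sized indexables and splits them with a single shared permutation, which is why the call above can carve S, Q and A into aligned train/validation sets in one step. A minimal sketch:

import numpy as np
from sklearn.cross_validation import train_test_split

S = np.arange(10).reshape(10, 1)
Q = np.arange(10, 20).reshape(10, 1)
A = np.arange(20, 30).reshape(10, 1)
trS, vaS, trQ, vaQ, trA, vaA = train_test_split(S, Q, A, test_size=.1)
# rows stay aligned: trS[i], trQ[i] and trA[i] come from the same original index
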