Python sklearn.cross_validation.train_test_split() Examples

The following are 28 code examples of sklearn.cross_validation.train_test_split(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module sklearn.cross_validation, or try the search function.
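Note that sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in 0.20; train_test_split now lives in sklearn.model_selection with the same behavior. As a minimal, illustrative sketch of the modern call (not taken from any of the projects below):

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split  # replaces sklearn.cross_validation

# hold out 20% of the rows; stratify=y preserves the class proportions
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)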
Example #1
Source File: analysis.py    From smallrnaseq with GNU General Public License v3.0    7 votes
def classify(X, y, cl, name=''):
    """Classification using gene features"""

    from sklearn.metrics import classification_report, accuracy_score
    np.random.seed()
    ind = np.random.permutation(len(X))

    from sklearn.cross_validation import train_test_split
    Xtrain, Xtest, ytrain, ytest  = train_test_split(X, y, test_size=0.4)
    #print X
    cl.fit(Xtrain, ytrain)
    ypred = cl.predict(Xtest)

    print(classification_report(ytest, ypred))
    #print accuracy_score(ytest, ypred)
    from sklearn import cross_validation
    yl = pd.Categorical(y).codes  # .codes is the current name for the integer category codes
    sc = cross_validation.cross_val_score(cl, X, yl, scoring='roc_auc', cv=5)
    print("AUC: %0.2f (+/- %0.2f)" % (sc.mean(), sc.std() * 2))
    return cl 
Example #2
Source File: label_digits.py    From libact with BSD 2-Clause "Simplified" License    7 votes
def split_train_test(n_classes):
    from sklearn.datasets import load_digits

    n_labeled = 5
    digits = load_digits(n_class=n_classes)  # consider binary case
    X = digits.data
    y = digits.target
    print(np.shape(X))

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
    # resample until the first n_labeled training points cover every class
    while len(np.unique(y_train[:n_labeled])) < n_classes:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.33)

    trn_ds = Dataset(X_train, np.concatenate(
        [y_train[:n_labeled], [None] * (len(y_train) - n_labeled)]))
    tst_ds = Dataset(X_test, y_test)

    return trn_ds, tst_ds, digits 
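The rejection loop in split_train_test resamples until the first n_labeled training points cover every class. A deterministic alternative is to seed the labeled pool with one index per class up front; the sketch below is illustrative (seed_labeled_pool is a hypothetical helper, not part of libact) and assumes every class appears in the training split:

import numpy as np
from sklearn.model_selection import train_test_split

def seed_labeled_pool(X, y, n_classes, test_size=0.33, random_state=0):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    # move one representative of each class to the front, then append the rest
    first_per_class = [int(np.flatnonzero(y_train == c)[0]) for c in range(n_classes)]
    rest = [i for i in range(len(y_train)) if i not in first_per_class]
    order = np.array(first_per_class + rest)
    return X_train[order], y_train[order], X_test, y_test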
Example #3
Source File: data_preparation_tools.py    From corpus-to-graph-ml with MIT License    6 votes
def split_to_test_and_train(data, labels, entities, test_size=DEFAULT_TEST_SIZE):
    d_train, d_test, l_train, l_test, c_train, c_test = train_test_split(data, labels, entities, test_size=test_size)
    d_test_2 = []
    l_test_2 = []
    c_test_2 = []

    train_dict = {}
    for d in d_train:
        train_dict[d] = 1

    for d, l, c in zip(d_test, l_test, c_test):
        if d in train_dict:  # membership test works in both Python 2 and 3
            continue
        d_test_2.append(d)
        l_test_2.append(l)
        c_test_2.append(c)

    return (d_train, d_test_2, l_train, l_test_2, c_train, c_test_2)

# utility to extract entities from preprocessed files
Example #4
Source File: p119_squential_backward_selection.py    From PythonMachineLearningExamples with MIT License    6 votes
def fit(self, X, y):
        X_train, X_test, y_train, y_test = \
            train_test_split(X, y, test_size=self.test_size,
                             random_state=self.random_state)
        dim = X_train.shape[1]
        self.indices_ = tuple(range(dim))
        self.subsets_ = [self.indices_]
        score = self._calc_score(X_train, y_train,
                                 X_test, y_test, self.indices_)
        self.scores_ = [score]
        while dim > self.k_features:
            scores = []
            subsets = []
            for p in combinations(self.indices_, r=dim-1):
                score = self._calc_score(X_train, y_train,
                                         X_test, y_test, p)
                scores.append(score)
                subsets.append(p)
            best = np.argmax(scores)
            self.indices_ = subsets[best]
            self.subsets_.append(self.indices_)
            dim -= 1
            self.scores_.append(scores[best])
        self.k_score_ = self.scores_[-1]
        return self 
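_calc_score is defined elsewhere in this sequential backward selection class; it is typically a thin wrapper that fits the wrapped estimator on the selected feature columns and scores it on the held-out split. A sketch along those lines (names assumed, not verbatim from the source):

def _calc_score(self, X_train, y_train, X_test, y_test, indices):
    # fit on the candidate feature subset, score on the held-out split
    self.estimator.fit(X_train[:, indices], y_train)
    y_pred = self.estimator.predict(X_test[:, indices])
    return self.scoring(y_test, y_pred)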
Example #5
Source File: sklearn-RS-demo-cf-item-test.py    From AiLearning with GNU General Public License v3.0    6 votes
def splitData(self, dataFile, test_size):
        # load the dataset
        header = ['user_id', 'item_id', 'rating', 'timestamp']
        df = pd.read_csv(dataFile, sep='\t', names=header)

        self.n_users = df.user_id.unique().shape[0]
        self.n_items = df.item_id.unique().shape[0]

        print('Number of users = ' + str(self.n_users) +
              ' | Number of items = ' + str(self.n_items))

        # split the dataset: users + movies
        self.train_data, self.test_data = cv.train_test_split(
            df, test_size=test_size)
        print('Train/test split succeeded', file=sys.stderr)
        print('len(train) = %s' % np.shape(self.train_data)[0], file=sys.stderr)
        print('len(test) = %s' % np.shape(self.test_data)[0], file=sys.stderr) 
Example #6
Source File: prepare_data.py    From personal-photos-model with Apache License 2.0    6 votes
def _shuffle_images_for_target(self, data, target):
    """
    Takes all the non-paired images for a given person, slices them into training, validation, and
    training sets, and shuffles within each of these sets.
    """
    # train_test_split can only partition into two sets, so we have to partition into two sets, then
    # further partition the validation set into a test set.
    (train_data, other_data, train_target, other_target) = train_test_split(data, target,
      train_size=0.7, test_size=0.3, random_state=0)
    self._train["data"].extend(train_data)
    self._train["target"].extend(train_target)

    (validation_data, test_data, validation_target, test_target) = train_test_split(other_data,
      other_target, train_size=0.9, test_size=0.1, random_state=0)
    self._validation["data"].extend(validation_data)
    self._validation["target"].extend(validation_target)
    self._test["data"].extend(test_data)
    self._test["target"].extend(test_target) 
Example #7
Source File: data_loader.py    From datastories-semeval2017-task4 with MIT License    6 votes
def load_train_val_test(self, only_test=False):
        X_train, X_rest, y_train, y_rest = train_test_split(self.X, self.y,
                                                            test_size=0.3,
                                                            stratify=self.y,
                                                            random_state=42)
        X_val, X_test, y_val, y_test = train_test_split(X_rest, y_rest,
                                                        test_size=0.5,
                                                        stratify=y_rest,
                                                        random_state=42)

        if not only_test:
            print("\nPreparing training set...")
            training = prepare_dataset(X_train, y_train, self.pipeline,
                                       self.y_one_hot)
            print("\nPreparing validation set...")
            validation = prepare_dataset(X_val, y_val, self.pipeline,
                                         self.y_one_hot)
        print("\nPreparing test set...")
        testing = prepare_dataset(X_test, y_test, self.pipeline,
                                  self.y_one_hot)

        if only_test:
            return testing
        else:
            return training, validation, testing 
Example #8
Source File: iris_run_config.py    From deep_image_model with Apache License 2.0    6 votes
def main(unused_argv):
  # Load dataset.
  iris = datasets.load_iris()
  x_train, x_test, y_train, y_test = cross_validation.train_test_split(
      iris.data, iris.target, test_size=0.2, random_state=42)

  # You can define your own configuration by providing a RunConfig object to
  # the estimator to control session parameters, e.g. num_cores
  # and gpu_memory_fraction.
  run_config = tf.contrib.learn.estimators.RunConfig(
      num_cores=3, gpu_memory_fraction=0.6)

  # Build 3 layer DNN with 10, 20, 10 units respectively.
  feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input(
      x_train)
  classifier = tf.contrib.learn.DNNClassifier(feature_columns=feature_columns,
                                              hidden_units=[10, 20, 10],
                                              n_classes=3,
                                              config=run_config)

  # Fit and predict.
  classifier.fit(x_train, y_train, steps=200)
  predictions = list(classifier.predict(x_test, as_iterable=True))
  score = metrics.accuracy_score(y_test, predictions)
  print('Accuracy: {0:f}'.format(score)) 
Example #9
Source File: iris.py    From deep_image_model with Apache License 2.0    6 votes
def main(unused_argv):
  # Load dataset.
  iris = learn.datasets.load_dataset('iris')
  x_train, x_test, y_train, y_test = cross_validation.train_test_split(
      iris.data, iris.target, test_size=0.2, random_state=42)

  # Build 3 layer DNN with 10, 20, 10 units respectively.
  feature_columns = learn.infer_real_valued_columns_from_input(x_train)
  classifier = learn.DNNClassifier(
      feature_columns=feature_columns, hidden_units=[10, 20, 10], n_classes=3)

  # Fit and predict.
  classifier.fit(x_train, y_train, steps=200)
  predictions = list(classifier.predict(x_test, as_iterable=True))
  score = metrics.accuracy_score(y_test, predictions)
  print('Accuracy: {0:f}'.format(score)) 
Example #10
Source File: iris_custom_decay_dnn.py    From deep_image_model with Apache License 2.0    6 votes
def main(unused_argv):
  iris = datasets.load_iris()
  x_train, x_test, y_train, y_test = train_test_split(
      iris.data, iris.target, test_size=0.2, random_state=42)

  feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input(
      x_train)
  classifier = tf.contrib.learn.DNNClassifier(feature_columns=feature_columns,
                                              hidden_units=[10, 20, 10],
                                              n_classes=3,
                                              optimizer=optimizer_exp_decay)

  classifier.fit(x_train, y_train, steps=800)
  predictions = list(classifier.predict(x_test, as_iterable=True))
  score = metrics.accuracy_score(y_test, predictions)
  print('Accuracy: {0:f}'.format(score)) 
Example #11
Source File: faces.py    From ConvNetPy with MIT License    6 votes
def load_data():
    global training_data, testing_data

    lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)

    xs = lfw_people.data
    ys = lfw_people.target

    inputs = []
    labels = list(ys)

    for face in xs:
        V = Vol(50, 37, 1, 0.0)
        V.w = list(face)
        inputs.append(augment(V, 30))

    x_tr, x_te, y_tr, y_te = train_test_split(inputs, labels, test_size=0.25)

    training_data = zip(x_tr, y_tr)
    testing_data = zip(x_te, y_te)

    print('Dataset made...') 
Example #12
Source File: test_display.py    From diogenes with MIT License    6 votes
def test_get_top_features(self):
        M, labels = uft.generate_test_matrix(1000, 15, random_state=0)
        M = utils.cast_np_sa_to_nd(M)
        M_train, M_test, labels_train, labels_test = train_test_split(
                M, 
                labels)
        clf = RandomForestClassifier(random_state=0)
        clf.fit(M_train, labels_train)

        ctrl_feat_importances = clf.feature_importances_
        ctrl_col_names = ['f{}'.format(i) for i in range(15)]
        ctrl_feat_ranks = np.argsort(ctrl_feat_importances)[::-1][:10]
        ctrl = utils.convert_to_sa(
                zip(ctrl_col_names, ctrl_feat_importances),
                col_names=('feat_name', 'score'))[ctrl_feat_ranks]

        res = dsp.get_top_features(clf, M, verbose=False)
        self.assertTrue(uft.array_equal(ctrl, res))

        res = dsp.get_top_features(clf, col_names=['f{}'.format(i) for i in range(15)], verbose=False)
        self.assertTrue(uft.array_equal(ctrl, res)) 
Example #13
Source File: data_loader.py    From datastories-semeval2017-task4 with MIT License    5 votes
def load_final(self):
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y,
                                                            test_size=0.1,
                                                            stratify=self.y,
                                                            random_state=27)
        print("\nPreparing training set...")
        training = prepare_dataset(X_train, y_train, self.pipeline,
                                   self.y_one_hot)
        print("\nPreparing test set...")
        testing = prepare_dataset(X_test, y_test, self.pipeline,
                                  self.y_one_hot)
        return training, testing 
Example #14
Source File: recipe_classification.py    From Flavor-Network with GNU General Public License v3.0    5 votes
def logistic_test(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10)
    model = LogisticRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print('First round:', metrics.accuracy_score(y_test, y_pred))
    # tune parameter C
    crange = [0.01, 0.1, 1, 10, 100]
    for num in crange:
        model = LogisticRegression(C=num)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print('C=', num, ', score=', metrics.accuracy_score(y_test, y_pred)) 
Example #15
Source File: stack5.py    From semeval2017-scienceie with Apache License 2.0    5 votes
def model_withValidation(X_train_total, Y_train_total, X_test=None, Y_test=None,
                         words_test=None, indices2labels=None, hiddenDim=250,
                         filename_x="none", filename_y="none"):

    X_train, X_dev, Y_train, Y_dev = train_test_split(
        X_train_total, Y_train_total, test_size=0.10, random_state=0)

    model = Sequential()
    model.add(Dense(output_dim=hiddenDim, input_dim=X_train.shape[1]))
    model.add(BatchNormalization())
    model.add(Activation("relu"))
    model.add(Dense(3))
    model.add(Activation("softmax"))

    model.compile(loss='categorical_crossentropy', optimizer='adamax', metrics=['accuracy'])

    weightsPath = "./tmp/myfooo2%s.dat" % (time.time())
    checkpointer = ModelCheckpoint(filepath=weightsPath, verbose=1, save_best_only=True)

    model.fit(X_train, Y_train, verbose=2, nb_epoch=100, batch_size=32,
              validation_data=(X_dev, Y_dev), callbacks=[checkpointer])

    model.load_weights(weightsPath)
    loss, acc = model.evaluate(X_test, Y_test, batch_size=32)

    print("loss : %0.5f Accuracy :%0.5f" % (loss, acc))

    cf = confusion_matrix(Y_test[:, 1], model.predict_classes(X_test))
    print(cf)
    predictions = model.predict_classes(X_test)
    print("-->", predictions)

    return model, predictions 
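One detail worth noting in this example: ModelCheckpoint with save_best_only=True monitors validation loss by default, so model.load_weights(weightsPath) restores the epoch that scored best on the (X_dev, Y_dev) split produced by train_test_split before the final evaluation on the test set.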
Example #16
Source File: logistic_regression_updated.py    From DataSciencePython with MIT License    5 votes
def cv_loop(X, y, model, N):
    mean_auc = 0.
    for i in range(N):
        X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(
                                       X, y, test_size=.20,
                                       random_state=i*SEED)
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_cv)[:, 1]
        auc = metrics.roc_auc_score(y_cv, preds)  # roc_auc_score is the current name of sklearn's AUC metric
        print("AUC (fold %d/%d): %f" % (i + 1, N, auc))
        mean_auc += auc
    return mean_auc/N 
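The loop above is Monte Carlo (shuffle-split) cross-validation written by hand; the library's own utilities express the same idea more compactly. A self-contained sketch using the modern module, with synthetic data for illustration:

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import ShuffleSplit, cross_val_score

X, y = make_classification(n_samples=500, random_state=0)
model = LogisticRegression()

# five random 80/20 splits, scored by ROC AUC, like cv_loop above
cv = ShuffleSplit(n_splits=5, test_size=0.20, random_state=0)
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv)
print("AUC: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))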
Example #17
Source File: classifier.py    From TextDetector with GNU General Public License v3.0    5 votes
def data_load(self, datadir):
        mytime = timeLog('../timelogs/data_load')
        mytime.start()
        print('Loading data ....')
        P, L = data_load(datadir)  # calls the module-level data_load() helper of the same name
        P = numpy.uint8(P)
        L = numpy.uint8(L)
        P_train, P_test, L_train, L_test = train_test_split(P, L, train_size=0.8, test_size=0.2, random_state=22)
        self.feature = P_train
        self.label = L_train
        self.feature_test = P_test
        self.label_test = L_test
        mytime.end()
        mytime.final() 
Example #18
Source File: classification.py    From text-analytics-with-python with Apache License 2.0    5 votes
def prepare_datasets(corpus, labels, test_data_proportion=0.3):
    # honor the test_data_proportion argument rather than a hard-coded split
    train_X, test_X, train_Y, test_Y = train_test_split(corpus, labels,
                                                        test_size=test_data_proportion,
                                                        random_state=42)
    return train_X, test_X, train_Y, test_Y 
Example #19
Source File: main.py    From Python-DevOps with MIT License    5 votes
def train_bayes(corpus,tokenizing=True,cleaning=True,normalizing=True,stem=True,vector='tfidf',split=0.2):
    multinomial,labels,vectorize = None, None, None
    if vector.lower().find('tfidf') < 0 and vector.lower().find('bow') < 0:  # require either 'tfidf' or 'bow'
        raise Exception('Invalid vectorization technique')
    if isinstance(corpus, str):
        trainset = sklearn.datasets.load_files(container_path = corpus, encoding = 'UTF-8')
        trainset.data, trainset.target = separate_dataset(trainset)
        data, target = trainset.data, trainset.target
        labels = trainset.target_names
    if isinstance(corpus, list) or isinstance(corpus, tuple):
        corpus = np.array(corpus)
        data, target = corpus[:,0].tolist(),corpus[:,1].tolist()
        labels = np.unique(target).tolist()
        target = LabelEncoder().fit_transform(target)
    c = list(zip(data, target))
    random.shuffle(c)
    data, target = zip(*c)
    data, target = list(data), list(target)
    if stem:
        for i in range(len(data)): data[i] = ' '.join([stemming(k) for k in data[i].split()])
    if cleaning:
        for i in range(len(data)): data[i] = clearstring(data[i],tokenizing)
    if vector.lower().find('tfidf') >= 0:
        vectorize = TfidfVectorizer().fit(data)
        vectors = vectorize.transform(data)
    else:
        vectorize = CountVectorizer().fit(data)
        vectors = vectorize.transform(data)
    multinomial = MultinomialNB()
    if split:
        train_X, test_X, train_Y, test_Y = train_test_split(vectors, target, test_size = split)
        multinomial.partial_fit(train_X, train_Y,classes=np.unique(target))
        predicted = multinomial.predict(test_X)
        print(metrics.classification_report(test_Y, predicted, target_names = labels))
    else:
        multinomial.partial_fit(vectors,target,classes=np.unique(target))
        predicted = multinomial.predict(vectors)
        print(metrics.classification_report(target, predicted, target_names = labels))
    return USER_BAYES(multinomial,labels,vectorize) 
Example #20
Source File: test_vae.py    From smrt with BSD 3-Clause "New" or "Revised" License    5 votes
def test_autoencoder():
    mnist = input_data.read_data_sets("MNIST_data", one_hot=True)
    all_data = np.asarray(mnist.train.images)

    seed = 42
    X_train, X_test = train_test_split(all_data, train_size=0.7, random_state=seed)

    # define
    ae = VariationalAutoEncoder(n_hidden=400, n_latent_factors=20, n_epochs=10,
                                learning_rate=0.01, batch_size=256,
                                display_step=5, activation_function='sigmoid', verbose=2,
                                random_state=seed, layer_type='gaussian')

    # fit
    ae.fit(X_train)

    # show we can get the shape
    _ = ae.topography_.shape

    # train error
    # assert_almost_equal(ae.train_cost_, 0.00380031)

    # assert transform works todo assert vals
    ae.transform(X_train)

    # generate a sample
    ae.generate()

    # get the error:
    # mse = ((X_test - reconstructed) ** 2).sum(axis=1).sum() / X_test.shape[0]

    # assert_almost_equal(mse, 4.40549573864)

    # try creating a few synthetic ones using the generate_from_sample method
    synth = ae.generate_from_sample(X_test[:5])
    assert synth.shape[0] == 5 
Example #21
Source File: model.py    From DeepNews with Apache License 2.0    5 votes
def split_test_train(self, X, y, nb_val_samples=100):
        """
        split X,y data into training and testing
        """
        X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=nb_val_samples, random_state=seed)
        return (X_train, X_test, Y_train, Y_test) 
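Note that nb_val_samples is an integer here: train_test_split interprets an int test_size as an absolute number of test samples, whereas a float between 0.0 and 1.0 is treated as a proportion of the dataset.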
Example #22
Source File: functions.py    From topicModelling with GNU General Public License v3.0    5 votes
def perform_class(X, y, iterations=1):
    scores = []
    for i in range(iterations):
        # vary the random_state per iteration so each split differs
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42 + i)
        parameters = {'C': [0.01, 0.1, 1, 10, 100]}
        clf_acc = GridSearchCV(svm.LinearSVC(), parameters, n_jobs=3, cv=3, refit=True, scoring='accuracy')
        clf_acc.fit(X_train, y_train)
        scores.append([metrics.accuracy_score(y_test, clf_acc.predict(X_test)), metrics.f1_score(y_test, clf_acc.predict(X_test),average='micro')])
    acc = np.mean([x[0] for x in scores]), np.std([x[0] for x in scores])
    mif = np.mean([x[1] for x in scores]), np.std([x[1] for x in scores])
    return acc, mif 
Example #23
Source File: models.py    From lentil with Apache License 2.0    5 votes
def fit(self):
        """
        Estimate model parameters that fit the interaction history in self.history
        """
        X = self.feature_matrix_from_interactions(self.history)
        Y = np.array(self.history['outcome'].apply(lambda x: 1 if x else 0).values)

        Cs = [0.1, 1., 10.]
        def val_log_likelihood(C):
            """
            Compute average log-likelihood of IRT model with a specific
            regularization constant on a validation set

            :param float C: Coefficient of L2 regularization term
            :rtype: float
            :return: Average log-likelihood on validation set
            """
            train_idxes, val_idxes = cross_validation.train_test_split(
                np.arange(0, len(self.history), 1), train_size=0.7)
            model = LogisticRegression(penalty='l2', C=C)
            X_train = self.feature_matrix_from_interactions(self.history.ix[train_idxes])  # .ix is deprecated in newer pandas; .iloc works here since the indexes are positional
            model.fit(X_train, Y[train_idxes])
            X_val = self.feature_matrix_from_interactions(self.history.ix[val_idxes])
            log_probas = model.predict_log_proba(X_val)
            idx_of_zero = 1 if model.classes_[1]==0 else 0
            return np.mean(log_probas[np.arange(0, len(val_idxes), 1), idx_of_zero ^ Y[val_idxes]])

        self.model = LogisticRegression(penalty='l2', C=(
            1. if not self.select_regularization_constant else max(Cs, key=val_log_likelihood)))

        self.model.fit(X, Y) 
Example #24
Source File: solution.py    From Kaggle with MIT License    5 votes
def optimize_logisticRegression():
    train_data = pd.read_csv(r"data/train.csv")
    print("Data info:\n", train_data.info())
    print("Data description:\n", train_data.describe())
    #display_data(train_data)  # quick look at the raw data
    #display_with_process(train_data)  # lightly processed view, to sanity-check assumptions
    process_data = fe_preprocessData(train_data, 'process_train_data')  # preprocess the data to be trained
    train_data = process_data.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')  # use a regex to pull out the desired columns
    train_np = train_data.as_matrix()  # convert to a matrix
    '''train the model'''
    X = train_np[:, 1:]
    y = train_np[:, 0]
    #=X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)
    #=model = linear_model.LogisticRegression(C=1.0,tol=1e-6).fit(X_train,y_train)
    model = linear_model.LogisticRegression(C=1.0, tol=1e-6).fit(X, y)
    print(pd.DataFrame({"columns": list(train_data.columns)[1:], "coef_": list(model.coef_.T)}))

    '''predict on the test set'''
    test_data = pd.read_csv(r"data/test.csv")
    process_test_data = fe_preprocessData(test_data, 'process_test_data')  # preprocess the data
    test_data = process_test_data.filter(regex='Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    test_np = test_data.as_matrix()
    predict = model.predict(test_np)
    result = pd.DataFrame(data={'PassengerId': process_test_data['PassengerId'].as_matrix(), 'Survived': predict.astype(np.int32)})
    result.to_csv(r'optimize_logisticRegression_result/prediction.csv', index=False)
    #clf = linear_model.LogisticRegression(C=1.0,tol=1e-6)
    #print(cross_validation.cross_val_score(clf, X, y, cv=5))
## map pairs of features to polynomial terms 
Example #25
Source File: solution.py    From Kaggle with MIT License    5 votes
def baseline_logisticRegression_crossValidate():
    origin_train_data = pd.read_csv(r"data/train.csv")
    process_data = fe_preprocessData(origin_train_data, 'process_train_data')  # preprocess the data to be trained
    process_data_train, process_data_cv = train_test_split(process_data, test_size=0.2)
    train_data = process_data_train.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')  # use a regex to pull out the desired columns
    train_np = train_data.as_matrix()  # convert to a matrix
    '''train the model'''
    X_train = train_np[:, 1:]
    y_train = train_np[:, 0]
    model = linear_model.LogisticRegression(C=1.0, tol=1e-6).fit(X_train, y_train)
    print(pd.DataFrame({'columns': list(train_data.columns[1:]), 'coef_': list(model.coef_.T)}))

    cv_data = process_data_cv.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    cv_np = cv_data.as_matrix()
    X_cv = cv_np[:, 1:]
    y_cv = cv_np[:, 0]
    predictions = model.predict(X_cv)
    print(np.float32(np.sum(predictions == y_cv)) / np.float32(predictions.shape[0]))

    '''find the mispredicted original records and save them to a file'''
    error_items = origin_train_data.loc[origin_train_data['PassengerId'].isin(process_data_cv[predictions != y_cv]['PassengerId'].values)]
    predictions_item = pd.DataFrame(data=process_data_cv[predictions != y_cv]['PassengerId'])
    predictions_item.columns = ['error_PassengerId']
    error_result = pd.concat([error_items, predictions_item], axis=1)
    error_result.to_csv(r'error.csv', index=False)

    #=print pd.DataFrame({"columns":list(train_data.columns)[1:],"coef_":list(model.coef_.T)})
    #=prediction = model.predict(X_test)
    #=print np.float32(np.sum(prediction == y_test))/np.float32(prediction.shape[0])

    '''predict on the test set'''
    '''test_data = pd.read_csv(r"data/test.csv")
    process_test_data = fe_preprocessData(test_data,'process_test_data',optimize=True)  # preprocess the data
    test_data = process_test_data.filter(regex='Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    test_np = test_data.as_matrix()
    predict = model.predict(test_np)
    result = pd.DataFrame(data={'PassengerId':process_test_data['PassengerId'].as_matrix(),'Survived':predict.astype(np.int32)})
    result.to_csv(r'logisticRegression_result/prediction.csv',index=False)'''
    #clf = linear_model.LogisticRegression(C=1.0,tol=1e-6)
    #print(cross_validation.cross_val_score(clf, X, y, cv=5)) 
Example #26
Source File: solution.py    From Kaggle with MIT License    5 votes
def baseline_svm_crossValidate():
    origin_train_data = pd.read_csv(r"data/train.csv")
    process_data = pre_processData(origin_train_data, 'process_train_data')  # preprocess the data to be trained
    process_data_train, process_data_cv = train_test_split(process_data, test_size=0.2)
    train_data = process_data_train.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')  # use a regex to pull out the desired columns
    train_np = train_data.as_matrix()  # convert to a matrix
    '''train the model'''
    X_train = train_np[:, 1:]
    y_train = train_np[:, 0]
    model = svm.SVC(kernel='rbf', tol=1e-6).fit(X_train, y_train)
    #print pd.DataFrame({"columns":list(train_data.columns)[1:],"coef_":list(model.coef_.T)})
    cv_data = process_data_cv.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    cv_np = cv_data.as_matrix()
    X_cv = cv_np[:, 1:]
    y_cv = cv_np[:, 0]
    predictions = model.predict(X_cv)
    print(np.float32(np.sum(predictions == y_cv)) / np.float32(predictions.shape[0]))

    '''find the mispredicted original records and save them to a file'''
    error_items = origin_train_data.loc[origin_train_data['PassengerId'].isin(process_data_cv[predictions != y_cv]['PassengerId'].values)]
    predictions_item = pd.DataFrame(data=process_data_cv[predictions != y_cv]['PassengerId'])
    predictions_item.columns = ['error_PassengerId']
    # error_items = error_items.reset_index(drop=True)
    error_result = pd.concat([error_items, predictions_item], axis=1)
    error_result.to_csv(r'error.csv', index=False)

    '''predict on the test set'''
    '''test_data = pd.read_csv(r"data/test.csv")
    process_test_data = pre_processData(test_data,'process_test_data',optimize=False)  # preprocess the data
    test_data = process_test_data.filter(regex='Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    test_np = test_data.as_matrix()
    predict = model.predict(test_np)
    result = pd.DataFrame(data={'PassengerId':process_test_data['PassengerId'].as_matrix(),'Survived':predict.astype(np.int32)})
    result.to_csv(r'svm_result/prediction.csv',index=False)'''


# baseline crossValidate: logistic regression model, with cross-validation 
Example #27
Source File: solution.py    From Kaggle with MIT License    5 votes
def baseline_randomForest():
    train_data = pd.read_csv(r"data/train.csv")
    print("Data info:\n", train_data.info())
    print("Data description:\n", train_data.describe())
    #display_data(train_data)  # quick look at the raw data
    #display_with_process(train_data)  # lightly processed view, to sanity-check assumptions
    process_data = pre_processData(train_data, 'process_train_data', optimize=False)  # preprocess the data to be trained
    train_data = process_data.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')  # use a regex to pull out the desired columns
    train_np = train_data.as_matrix()  # convert to a matrix

    '''train the model'''
    X = train_np[:, 1:]
    y = train_np[:, 0]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    model = RandomForestClassifier(n_estimators=100).fit(X, y)
    #predictions = model.predict(X_test)
    #print np.float32(np.sum(predictions == y_test))/np.float32(predictions.shape[0])
    '''predict'''
    test_data = pd.read_csv(r"data/test.csv")
    process_test_data = pre_processData(test_data, 'process_test_data', optimize=False)  # preprocess the data
    test_data = process_test_data.filter(regex='Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    test_np = test_data.as_matrix()
    predict = model.predict(test_np)
    result = pd.DataFrame(data={'PassengerId': process_test_data['PassengerId'].as_matrix(), 'Survived': predict.astype(np.int32)})
    result.to_csv(r'baseline_randomForest_result/prediction.csv', index=False)
# baseline crossValidate: SVM model, with cross-validation 
Example #28
Source File: solution.py    From Kaggle with MIT License    5 votes
def baseline_logisticRegression():
    train_data = pd.read_csv(r"data/train.csv")
    #print("Data info:\n", train_data.info())
    #print("Data description:\n", train_data.describe())
    #display_data(train_data)  # quick look at the raw data
    #display_with_process(train_data)  # lightly processed view, to sanity-check assumptions
    process_data = pre_processData(train_data, 'process_train_data')  # preprocess the data to be trained
    train_data = process_data.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')  # use a regex to pull out the desired columns
    train_np = train_data.as_matrix()  # convert to a matrix
    '''train the model'''
    X = train_np[:, 1:]
    y = train_np[:, 0]
    #=X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)
    #=model = linear_model.LogisticRegression(C=1.0,tol=1e-6).fit(X_train,y_train)
    model = linear_model.LogisticRegression(C=1.0, tol=1e-6).fit(X, y)
    print(pd.DataFrame({"columns": list(train_data.columns)[1:], "coef_": list(model.coef_.T)}))
    #=prediction = model.predict(X_test)
    #=cv_error = pd.DataFrame(data=list(X_test[np.where(prediction!=y_test)]),columns=list(train_data.columns)[1:])
    #=cv_error.to_csv(r'error.csv',index=True)
    #=print np.float32(np.sum(prediction == y_test))/np.float32(prediction.shape[0])

    '''predict on the test set'''
    test_data = pd.read_csv(r"data/test.csv")
    process_test_data = pre_processData(test_data, 'process_test_data')  # preprocess the data
    test_data = process_test_data.filter(regex='Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    test_np = test_data.as_matrix()
    predict = model.predict(test_np)
    result = pd.DataFrame(data={'PassengerId': process_test_data['PassengerId'].as_matrix(), 'Survived': predict.astype(np.int32)})
    result.to_csv(r'baseline_logisticRegression_result/prediction.csv', index=False)
    #clf = linear_model.LogisticRegression(C=1.0,tol=1e-6)
    #print(cross_validation.cross_val_score(clf, X, y, cv=5))


# baseline: SVM model (0.78947)