Python sklearn.cross_validation.train_test_split() Examples

The following are code examples showing how to use sklearn.cross_validation.train_test_split(), collected from open source Python projects.
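
Before the project examples, here is a minimal, self-contained sketch of the basic call pattern; the toy data, variable names, and the 0.25 test fraction are illustrative only and are not taken from any of the projects below. Note that sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in 0.20; in current releases the same function is imported from sklearn.model_selection.

import numpy as np
from sklearn.cross_validation import train_test_split  # use sklearn.model_selection in scikit-learn >= 0.18

# Toy data: 100 samples, 4 features, binary labels.
X = np.random.rand(100, 4)
y = np.random.randint(0, 2, size=100)

# Hold out 25% of the rows for testing; random_state makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

print(X_train.shape, X_test.shape)  # (75, 4) (25, 4)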

Example 1
Project: euclid   Author: njpayne   File: data_work.py    GNU General Public License v2.0 7 votes
def divide_for_training(data):
    ##first use the category for training and use the rest as features except for period code
    ##select_columns = ["names", "of", "columns"]
    #select_columns = header 

    ##select the appropriate columns
    #selected_header, selected_data = select_data_columns(header, data, select_columns)

    #have scikit partition the data into training and test sets
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(data[:, 1:], data[:, :1], test_size=0.15, random_state=0)

    #create the scaler the data based on the training data
    #this is used to scale values to 0 mean and unit variance
    data_scaler = StandardScaler().fit(X_train.astype(np.float32))

    #scale training and test set to mean 0 with unit variance
    X_train = data_scaler.transform(X_train.astype(np.float32))
    X_test = data_scaler.transform(X_test.astype(np.float32))

    return X_train, X_test, y_train, y_test 
Example 2
Project: libact   Author: ntucllab   File: label_digits.py    BSD 2-Clause "Simplified" License 7 votes
def split_train_test(n_classes):
    from sklearn.datasets import load_digits

    n_labeled = 5
    digits = load_digits(n_class=n_classes)  # consider binary case
    X = digits.data
    y = digits.target
    print(np.shape(X))

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
    while len(np.unique(y_train[:n_labeled])) < n_classes:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.33)

    trn_ds = Dataset(X_train, np.concatenate(
        [y_train[:n_labeled], [None] * (len(y_train) - n_labeled)]))
    tst_ds = Dataset(X_test, y_test)

    return trn_ds, tst_ds, digits 
Example 3
Project: corpus-to-graph-ml   Author: CatalystCode   File: data_preparation_tools.py    MIT License 6 votes
def split_to_test_and_train(data, labels, entities, test_size=DEFAULT_TEST_SIZE):
    d_train, d_test, l_train, l_test, c_train, c_test = train_test_split(data, labels, entities, test_size=test_size)
    d_test_2 = []
    l_test_2 = []
    c_test_2 = []

    train_dict = {}
    for d in d_train:
        train_dict[d] = 1

    for d,l,c in zip(d_test, l_test, c_test):
        if d in train_dict:
            continue
        d_test_2.append(d)
        l_test_2.append(l)
        c_test_2.append(c)

    return (d_train, d_test_2, l_train, l_test_2, c_train, c_test_2)

# utility to extract entities from preprocessed files 
Example 4
Project: snape   Author: mbernico   File: make_dataset.py    Apache License 2.0 6 votes
def write_dataset(df, file_name, out_path="." + os.path.sep):
    """
    Writes generated dataset to file

    :param df: dataframe to write
    :param file_name: beginning of filename
    :param out_path: the path to write the dataset
    :return: None
    """
    # todo: Mike, do we want to take a param for overwriting existing files?
    df_train, df_testkey = train_test_split(df, test_size=.2)

    df_train.to_csv(out_path + file_name + "_train.csv", index=False)
    df_test = df_testkey.drop(['y'], axis=1)
    df_test.to_csv(out_path + file_name + "_test.csv", index=False)
    df_testkey.to_csv(out_path + file_name + "_testkey.csv", index=False) 
Example 5
Project: libact   Author: ntucllab   File: multilabel_plot.py    BSD 2-Clause "Simplified" License 6 votes
def split_train_test(test_size):
    # choose a dataset with unbalanced class instances
    data = make_multilabel_classification(
        n_samples=300, n_classes=10, allow_unlabeled=False)
    X = StandardScaler().fit_transform(data[0])
    Y = data[1]

    X_trn, X_tst, Y_trn, Y_tst = train_test_split(X, Y, test_size=test_size)

    trn_ds = Dataset(X_trn, Y_trn[:5].tolist() + [None] * (len(Y_trn) - 5))
    tst_ds = Dataset(X_tst, Y_tst.tolist())

    fully_labeled_trn_ds = Dataset(X_trn, Y_trn)

    return trn_ds, tst_ds, fully_labeled_trn_ds 
Example 6
Project: dockerizeme   Author: dockerizeme   File: snippet.py    Apache License 2.0 6 votes
def run(sc):
	def zero_matrix(n, m):
		return np.zeros(n*m, dtype = int).reshape(n, m)
	
	def vote_increment(y_est):
		increment = zero_matrix(y_est.size, n_ys)
		increment[np.arange(y_est.size), y_est] = 1
		return increment # test point x class matrix with 1s marking the estimator prediction

	X, y = make_classification()
	X_train, X_test, y_train, y_test = train_test_split(X, y)

	n_test = X_test.shape[0]
	n_ys = np.unique(y_train).size
	
	model = DecisionTreeClassifier()
	# Partition the training data into random sub-samples with replacement.
	samples = sc.parallelize(Bootstrap(y.size))
	# Train a model for each sub-sample and apply it to the test data.
	# Tuple parameters in lambdas are Python 2 only; index into the (train, test) pair instead.
	vote_tally = samples.map(lambda split:
		model.fit(X[split[0]], y[split[0]]).predict(X_test)
	).map(vote_increment).fold(zero_matrix(n_test, n_ys), np.add) # Take the learner majority vote.
	y_estimate_vote = np.argmax(vote_tally, axis = 1)
	return accuracy_score(y_test, y_estimate_vote) 
Example 7
Project: datacleaning-benchmark   Author: sjyk   File: EvalUtils.py    MIT License 6 votes
def generateDirtyTrain(X,
	                   y,
	                   noisemodelX=None, 
	                   noisemodely=None, 
	                   test_size=0.2):

  X_train, X_test, y_train, y_test = train_test_split(X, 
  	                                                  y, 
  	                                                  test_size=test_size)
  if noisemodelX is not None:
  	nmx = noisemodelX.reshape(np.shape(X_train))
  	X_train = nmx.apply(X_train)[0]

  if noisemodely is not None:
  	nmx = noisemodely.reshape(np.shape(y_train))
  	y_train = nmx.apply(y_train)[0]

  return X_train, X_test, y_train, y_test 
Example 8
Project: Bayesian-Deep-Learning   Author: guilherme-pombo   File: bayesian_neural_net.py    MIT License 6 votes
def create_data(plot=False):
    """
    Create training data
    :return:
    """
    X, Y = make_moons(noise=0.2, random_state=0, n_samples=1000)
    X = scale(X)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.5)

    if plot:
        fig, ax = plt.subplots()
        ax.scatter(X[Y == 0, 0], X[Y == 0, 1], label='Class 0')
        ax.scatter(X[Y == 1, 0], X[Y == 1, 1], color='r', label='Class 1')
        sns.despine(); ax.legend()
        ax.set(xlabel='X', ylabel='Y', title='Classification data set');

        plt.show()

    return X, Y, X_train, X_test, Y_train, Y_test 
Example 9
Project: RSV   Author: Healthcast   File: skeleton.py    GNU General Public License v2.0 6 votes
def test_knn():
    iris = datasets.load_iris()
    iris_X = iris.data[:,:2]
    iris_y = iris.target
    X_train, X_test, y_train, y_test = train_test_split(iris_X, iris_y, test_size=.5, \
                                                        random_state=0)
    #n: number of neighbors
    #weights: uniform or distance
    clf = neighbors.KNeighborsClassifier(15, weights='uniform')
    clf.fit(X_train, y_train)
    x_min, x_max = X_train[:, 0].min() - 1, X_train[:, 0].max() + 1
    y_min, y_max = X_train[:, 1].min() - 1, X_train[:, 1].max() + 1

    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02), np.arange(y_min, y_max, 0.02))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.pcolormesh(xx, yy, Z)

    plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.title("3-Class classification (k = %i, weights = '%s')"%(15, 'uniform'))
    plt.show() 
Example 10
Project: RSV   Author: Healthcast   File: skeleton.py    GNU General Public License v2.0 6 votes
def test_knn():
    iris = datasets.load_iris()
    iris_X = iris.data[:,:2]
    iris_y = iris.target
    X_train, X_test, y_train, y_test = train_test_split(iris_X, iris_y, test_size=.5, \
                                                        random_state=0)
    #n: number of neighbors
    #weights: uniform or distance
    clf = neighbors.KNeighborsClassifier(15, weights='uniform')
    clf.fit(X_train, y_train)
    x_min, x_max = X_train[:, 0].min() - 1, X_train[:, 0].max() + 1
    y_min, y_max = X_train[:, 1].min() - 1, X_train[:, 1].max() + 1

    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02), np.arange(y_min, y_max, 0.02))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.pcolormesh(xx, yy, Z)

    plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.title("3-Class classification (k = %i, weights = '%s')"%(15, 'uniform'))
    plt.show() 
Example 11
Project: RPGOne   Author: RTHMaK   File: test_custom_decay.py    Apache License 2.0 6 votes
def testIrisExponentialDecay(self):
        random.seed(42)

        iris = datasets.load_iris()
        X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                            iris.target,
                                                            test_size=0.2,
                                                            random_state=42)
        # setup exponential decay function
        def exp_decay(global_step):
            return tf.train.exponential_decay(
                learning_rate=0.1, global_step=global_step,
                decay_steps=100, decay_rate=0.001)
        classifier = skflow.TensorFlowDNNClassifier(hidden_units=[10, 20, 10],
                                                    n_classes=3, steps=800,
                                                    learning_rate=exp_decay)
        classifier.fit(X_train, y_train)
        score = metrics.accuracy_score(y_test, classifier.predict(X_test))

        self.assertGreater(score, 0.7, "Failed with score = {0}".format(score)) 
Example 12
Project: RPGOne   Author: RTHMaK   File: test_estimators.py    Apache License 2.0 6 votes
def testIrisMomentum(self):
        random.seed(42)

        iris = datasets.load_iris()
        X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                            iris.target,
                                                            test_size=0.2,
                                                            random_state=42)
        # setup exponential decay function
        def exp_decay(global_step):
            return tf.train.exponential_decay(
                learning_rate=0.1, global_step=global_step,
                decay_steps=100, decay_rate=0.001)
        custom_optimizer = lambda x: tf.train.MomentumOptimizer(x, 0.9)
        classifier = skflow.TensorFlowDNNClassifier(hidden_units=[10, 20, 10],
                                                    n_classes=3, steps=800,
                                                    learning_rate=exp_decay,
                                                    optimizer=custom_optimizer)
        classifier.fit(X_train, y_train)
        score = metrics.accuracy_score(y_test, classifier.predict(X_test))

        self.assertGreater(score, 0.7, "Failed with score = {0}".format(score)) 
Example 13
Project: image-recognition   Author: zw76859420   File: face_dl.py    GNU General Public License v3.0 6 votes
def load(self , img_rows=64 , img_cols=64 , img_channels=3 , nb_classes=2):
        images , labels = load_dataset(self.pathname)
        train_images , valid_images , train_labels , valid_labels = train_test_split(images , labels , test_size=0.2 , random_state=random.randint(0 , 100))

        train_images = train_images.reshape(train_images.shape[0] , img_rows , img_cols , img_channels)
        valid_images = valid_images.reshape(valid_images.shape[0] , img_rows , img_cols , img_channels)

        self.input_shape = (img_rows , img_cols , img_channels)

        train_labels = np_utils.to_categorical(train_labels , num_classes=nb_classes)
        valid_labels = np_utils.to_categorical(valid_labels , num_classes=nb_classes)

        # astype returns a new array, so assign the result back
        train_images = train_images.astype('float32')
        valid_images = valid_images.astype('float32')

        train_images = train_images / 255
        valid_images = valid_images / 255

        self.train_images = train_images
        self.valid_images = valid_images
        self.train_labels = train_labels
        self.valid_labels = valid_labels 
Example 14
Project: apachecn_ml   Author: ys1305   File: sklearn-RS-demo-cf-item-test.py    GNU General Public License v3.0 6 votes
def splitData(self, dataFile, test_size):
        # Load the dataset
        header = ['user_id', 'item_id', 'rating', 'timestamp']
        df = pd.read_csv(dataFile, sep='\t', names=header)

        self.n_users = df.user_id.unique().shape[0]
        self.n_items = df.item_id.unique().shape[0]

        print('Number of users = ' + str(self.n_users) +
              ' | Number of items = ' + str(self.n_items))

        # Split the dataset: users + movies
        self.train_data, self.test_data = cv.train_test_split(
            df, test_size=test_size)
        print('Successfully split the training and test sets', file=sys.stderr)
        print('len(train) = %s' % np.shape(self.train_data)[0], file=sys.stderr)
        print('len(test) = %s' % np.shape(self.test_data)[0], file=sys.stderr)
Example 15
Project: deep_image_model   Author: tobegit3hub   File: iris_run_config.py    Apache License 2.0 6 votes
def main(unused_argv):
  # Load dataset.
  iris = datasets.load_iris()
  x_train, x_test, y_train, y_test = cross_validation.train_test_split(
      iris.data, iris.target, test_size=0.2, random_state=42)

  # You can define your configurations by providing a RunConfig object to
  # the estimator to control session configurations, e.g. num_cores
  # and gpu_memory_fraction
  run_config = tf.contrib.learn.estimators.RunConfig(
      num_cores=3, gpu_memory_fraction=0.6)

  # Build 3 layer DNN with 10, 20, 10 units respectively.
  feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input(
      x_train)
  classifier = tf.contrib.learn.DNNClassifier(feature_columns=feature_columns,
                                              hidden_units=[10, 20, 10],
                                              n_classes=3,
                                              config=run_config)

  # Fit and predict.
  classifier.fit(x_train, y_train, steps=200)
  predictions = list(classifier.predict(x_test, as_iterable=True))
  score = metrics.accuracy_score(y_test, predictions)
  print('Accuracy: {0:f}'.format(score)) 
Example 16
Project: deep_image_model   Author: tobegit3hub   File: iris.py    Apache License 2.0 6 votes
def main(unused_argv):
  # Load dataset.
  iris = learn.datasets.load_dataset('iris')
  x_train, x_test, y_train, y_test = cross_validation.train_test_split(
      iris.data, iris.target, test_size=0.2, random_state=42)

  # Build 3 layer DNN with 10, 20, 10 units respectively.
  feature_columns = learn.infer_real_valued_columns_from_input(x_train)
  classifier = learn.DNNClassifier(
      feature_columns=feature_columns, hidden_units=[10, 20, 10], n_classes=3)

  # Fit and predict.
  classifier.fit(x_train, y_train, steps=200)
  predictions = list(classifier.predict(x_test, as_iterable=True))
  score = metrics.accuracy_score(y_test, predictions)
  print('Accuracy: {0:f}'.format(score)) 
Example 17
Project: deep_image_model   Author: tobegit3hub   File: iris_custom_decay_dnn.py    Apache License 2.0 6 votes
def main(unused_argv):
  iris = datasets.load_iris()
  x_train, x_test, y_train, y_test = train_test_split(
      iris.data, iris.target, test_size=0.2, random_state=42)

  feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input(
      x_train)
  classifier = tf.contrib.learn.DNNClassifier(feature_columns=feature_columns,
                                              hidden_units=[10, 20, 10],
                                              n_classes=3,
                                              optimizer=optimizer_exp_decay)

  classifier.fit(x_train, y_train, steps=800)
  predictions = list(classifier.predict(x_test, as_iterable=True))
  score = metrics.accuracy_score(y_test, predictions)
  print('Accuracy: {0:f}'.format(score)) 
Example 18
Project: sfcc   Author: kv-kunalvyas   File: auxiliary.py    MIT License 5 votes
def plotLearningCurves(train, classifier):
    #P.show()
    X = train.values[:, 1::]
    y = train.values[:, 0]

    train_sizes, train_scores, test_scores = learning_curve(
            classifier, X, y, cv=10, n_jobs=-1, train_sizes=np.linspace(.1, 1., 10), verbose=0)

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.figure()
    plt.title("Learning Curves")
    plt.legend(loc="best")
    plt.xlabel("Training samples")
    plt.ylabel("Error Rate")
    plt.ylim((0, 1))
    plt.gca().invert_yaxis()
    plt.grid()

    # Plot the average training and test score lines at each training set size
    plt.plot(train_sizes, train_scores_mean, 'o-', color="b", label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="r", label="Test score")

    # Plot the std deviation as a transparent range at each training set size
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std,
                     alpha=0.1, color="b")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std,
                     alpha=0.1, color="r")

    # Draw the plot and reset the y-axis
    plt.draw()
    plt.gca().invert_yaxis()

    # shuffle and split training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25)
    classifier.fit(X_train, y_train)
    plt.show() 
Example 19
Project: Mussy-Robot   Author: arnomoonens   File: training.py    MIT License 5 votes
def training(data):

    
    svc_1 = SVC(kernel='linear')
    
    
    #we create the target vector: -1 for sad images, 0 for normal, and 1 for happy images;
    #the data consists of 15 sad images, then 15 happy images, then 15 normal images
    zero=[int(i) for i in numpy.zeros(15)]
    one=[int(i) for i in numpy.ones(15)]
    minus1=[int(i) for i in numpy.repeat(-1,15)]
    target=numpy.concatenate((minus1,one,zero,),axis=0)
   
    #we test whether the classifier works correctly with CROSS-VALIDATION
    #5 fold cross validation
    from sklearn.cross_validation import train_test_split

    
    
    X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.20, random_state=0)
    
    from sklearn import neighbors
    n_neighbors =3 
    for weights in ['uniform', 'distance']:
        # we create an instance of Neighbours Classifier and fit the data.
        KNeigh = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
        KNeigh.fit(X_train,y_train)
        print(KNeigh.predict(X_test))
        
    print(y_test)
    #evaluate_cross_validation(KNeigh, X_train, y_train, 10)
    #svc is better!!!
    svc_1.fit(X_train,y_train)
    evaluate_cross_validation(svc_1, X_train, y_train, 10)
    joblib.dump(svc_1,'svc_1.pkl') 
Example 20
Project: LifelongVAE   Author: jramapuram   File: svhn_class.py    MIT License 5 votes
def train_validation_spit(train_dataset, train_labels):
    train_dataset, validation_dataset, train_labels, validation_labels = train_test_split(train_dataset, train_labels, test_size=0.1, random_state = 42)
    return train_dataset, validation_dataset, train_labels, validation_labels 
Example 21
Project: ml-helper-funcs   Author: numb3r33   File: feature_selection.py    MIT License 5 votes
def split_examples(X, y):
	X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=214)
	
	return X_train, X_test, y_train, y_test 
Example 22
Project: ml-helper-funcs   Author: numb3r33   File: custom_metrics_knn.py    MIT License 5 votes
def split_dataset(X, y):
	X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=241)
	
	return X_train, X_test, y_train, y_test 
Example 23
Project: twitter-svm   Author: josh-byster   File: calculations.py    MIT License 5 votes
def regularSVM(X,Y,c,pctTest,shouldReturnMetrics):
    #svm = LinearSVC(C=c);
    svm=linear_model.LogisticRegression(C=c);
    cv=X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X,Y, test_size=pctTest, random_state=None)
    svm.fit(X_train,Y_train)
    y_pred=svm.predict(X_test)
    channels=svm.classes_
    channels.sort()
    getWrongValues(y_pred,Y_test,channels,shouldReturnMetrics,num=len(X))
    return svm 
Example 24
Project: twitter-svm   Author: josh-byster   File: calculations.py    MIT License 5 votes
def testOverN(X,Y,c,pctTest,channels,shouldReturnMetrics=False,increment=100):
    for i in range(100, len(X), 50):
        start = time.time()
        svm = LinearSVC(C=c);
        cv=X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X[:i],Y[:i], test_size=pctTest, random_state=None)
        svm.fit(X_train,Y_train)
        y_pred=svm.predict(X_test)
        print(str(i) + "," + str(metrics.accuracy_score(Y_test, y_pred, normalize=True, sample_weight=None))+","+str(time.time()-start)) 
Example 25
Project: JAABF   Author: drr3d   File: classify.py    GNU General Public License v3.0 5 votes
def train(self, X, y, max_df=1.0, minword=1, maxfeature=10000, **algoparam):
        #Convert a collection of text documents to a matrix of token counts
        #This implementation produces a sparse representation of the counts using scipy.sparse.coo_matrix.
        #If you do not provide an a-priori dictionary and you do not use an analyzer that does some kind of feature selection
        #   then the number of features will be equal to the vocabulary size found by analyzing the data
        # the count vectorizer produces a "bag of words" and for the term frequencies
        self._tf_vectorizer = countVectorizer(max_df=max_df, min_df=minword, max_features=maxfeature,\
                                        stop_words = self.stoplist, ngram_range=(1,3))

        self._tf_transformer = tfidfTransformer(norm='l2', use_idf=False, smooth_idf=True, sublinear_tf=False)

        from sklearn.cross_validation import train_test_split
        if self.validation_split:
            if type(self.validation_split) is float:
                # split into xx% for train and x% for test
                X_train, self.X_test, y_train, self.y_test = train_test_split(X, y, test_size=self.validation_split, \
                                                                                random_state=337, stratify=y)
            else:
                raise RuntimeError("validation_split must float...")
        else:
             X_train = X
             y_train = y

        algo = self.solver_algo(**algoparam)
        model = JAABFEstimator(vectorizer=self._tf_vectorizer, \
                            transformer=self._tf_transformer, classifier=algo)
        
        return model.fit(X_train,y_train) 
Example 26
Project: face_landmark_dnn   Author: junhwanjang   File: train_mobilenets.py    MIT License 5 votes
def main():
#        Define X and y
# #        Load data
        PATH = "./data/64_64_1/offset_1.3/"
        X = np.load(PATH + "basic_dataset_img.npz")
        y = np.load(PATH + "basic_dataset_pts.npz")
        X = X['arr_0']
        y = y['arr_0'].reshape(-1, 136)
        

        print("Define X and Y")
        print("=======================================")
        
        # Split train / test dataset
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
        print("Success of getting train / test dataset")
        print("=======================================")
        print("X_train: ", X_train.shape)
        print("y_train: ", y_train.shape)
        print("X_test: ", X_test.shape)
        print("y_test: ", y_test.shape)
        print("=======================================")

        model.compile(loss=smoothL1, optimizer=keras.optimizers.Adam(lr=1e-3), metrics=['mape'])
        print(model.summary())
        # checkpoint
        filepath="./mobilenet_checkpoints/smooth_L1-{epoch:02d}-{val_mean_absolute_percentage_error:.5f}.hdf5"
        checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
        callbacks_list = [checkpoint]
        history = model.fit(X_train, y_train, batch_size=64, epochs=10000, shuffle=True,\
                            verbose=1, validation_data=(X_test, y_test), callbacks=callbacks_list)

        # Save model
        model.save("./model/face_landmark_dnn.h5")
        print("=======================================")
        print("Save Final Model")
        print("=======================================") 
Example 27
Project: face_landmark_dnn   Author: junhwanjang   File: train_basic_models.py    MIT License 5 votes
def main():
#        Define X and y
# #        Load data
        PATH = "./data/64_64_1/offset_1.3/"
        X = np.load(PATH + "basic_dataset_img.npz")
        y = np.load(PATH + "basic_dataset_pts.npz")
        X = X['arr_0']
        y = y['arr_0'].reshape(-1, 136)
        

        print("Define X and Y")
        print("=======================================")
        
        # Split train / test dataset
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
        print("Success of getting train / test dataset")
        print("=======================================")
        print("X_train: ", X_train.shape)
        print("y_train: ", y_train.shape)
        print("X_test: ", X_test.shape)
        print("y_test: ", y_test.shape)
        print("=======================================")

        model.compile(loss=smoothL1, optimizer=keras.optimizers.Adam(lr=1e-3), metrics=['mape'])
        print(model.summary())
        # checkpoint
        filepath="./basic_checkpoints/smooth_L1-{epoch:02d}-{val_mean_absolute_percentage_error:.5f}.hdf5"
        checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
        callbacks_list = [checkpoint]
        history = model.fit(X_train, y_train, batch_size=64, epochs=10000, shuffle=True,\
                            verbose=1, validation_data=(X_test, y_test), callbacks=callbacks_list)

        # Save model
        model.save("./model/face_landmark_dnn.h5")
        print("=======================================")
        print("Save Final Model")
        print("=======================================") 
Example 28
Project: kaggle_rossmann   Author: datanuggets   File: svm_1.py    GNU General Public License v2.0 5 votes
def main():
    print "Loading train set..."
    train_df = load_dataset('/Users/Carlos_Vaquero/Desktop/Rossmann/train.csv')
    data_columns = train_df.columns.diff(['Sales', 'Customers'])
    target_column = 'Sales'


    # this must be changed ! Has to be sequential not random...
    print "Splitting train and verification sets..."
    train_index, validation_index = train_test_split(
        train_df.index,
        test_size=0.1,
        random_state=RANDOM_STATE
    )

    X = train_df.loc[train_index[1:10], data_columns[1:10]]


    #print 'train_index', train_index
    #print 'data_columns', data_columns


    y=train_df.loc[train_index[1:10], target_column]
    print('X', X)

    print(' ')

    print('Y', y)

    print("Training svm...")
    svr = svm.SVR(C=1.0, epsilon=0.2).fit(
        X=train_df.loc[train_index, data_columns],
        y=train_df.loc[train_index, target_column],
    ) 
Example 29
Project: DCASE2017-task1   Author: ronggong   File: xgb_classification.py    GNU Affero General Public License v3.0 5 votes
def train_test(clf, X, y, labels):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    save_results(y_test, y_pred, labels) 
Example 30
Project: SofPythonBot   Author: UtkucanBykl   File: base.py    GNU General Public License v3.0 5 votes
def learning(self):
        self.vect = TfidfVectorizer(min_df=1)
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(self.frame_x, self.frame_y, test_size=0.2, random_state=4)
        self.x_trainvect = self.vect.fit_transform(self.x_train)
        self.x_trainvect.toarray()
        self.vect1 = TfidfVectorizer(min_df=1)
        self.x_trainvect = self.vect1.fit_transform(self.x_train)
        a = self.x_trainvect.toarray()
        self.vect1.inverse_transform(a[0]) 
Example 31
Project: Fraud-Corruption-Detection-Data-Science-Pipeline-DSSG2015   Author: eredmiles   File: model_pipeline_script.py    MIT License 5 votes
def plot_confusion(classifier,threshold =0.4):
    x_train,x_test,y_train,y_test = train_test_split(df_new,y,test_size = 0.2)
    y_pred = []
    try:
        prob_score = clf_grid.predict_proba(x_train)
    except:
        prob_score = clf_grid.predict_proba(np.float_(x_train))
    a = prob_score[:,1]
    for idx,item in enumerate(a):
        if item>= threshold:
            item = 1
        else:
            item =0
        y_pred.append(item)
    # Plotting                                                                                                              

    class_name = classifier.__repr__()
    class_name = re.sub(r'\([^)]*\)','',class_name)
    print ("")
    print ("")
    print("Legends")
    print ('1 - Substantiated')
    print ('0 - Unfounded')
    print("")
    print("Confusion Matrix: "+ class_name+ " (threshold- " +str(threshold)+")"  )
    sns.heatmap(metrics.confusion_matrix(y_pred, y_train), annot=True, cmap="YlGnBu",fmt ="d")
    plt.xlabel('Predicted')
    plt.ylabel('True') 
Example 32
Project: Fraud-Corruption-Detection-Data-Science-Pipeline-DSSG2015   Author: eredmiles   File: model_pipeline_script.py    MIT License 5 votes
def feature_direction(idx,dataframe,label,threshold):
    y_pred = [];
    clf_rf = RandomForestClassifier(n_estimators=100, max_depth=80,
                               min_samples_split=5)
    x_train, x_test, y_train, y_test = train_test_split(dataframe, label, test_size=0.2)
    col_names = list(dataframe.columns.values)
    maximum_val = x_train[:,idx].max()
    minimum_val = x_train[:,idx].min()
    feature_name = col_names[idx]
    for i,col in enumerate(x_train):
        if i != idx:
            x_train[:,i] = np.mean(x_train[:,i])
    
    clf_feature = clf_rf.fit(x_train,y_train)
    proba_score_feature=clf_feature.predict_proba(x_train)    
    score = proba_score_feature[:,1]
    for item in score:
        if item >=threshold:
            y_pred.append(1)
        else:
            y_pred.append(0)
    plt.scatter(x_train[:,idx],y_pred)
    plt.xlabel(feature_name)
    plt.ylabel('prediction')
    plt.suptitle('Response Curve-  ' + feature_name)

    return top_features 
Example 33
Project: tf-ft   Author: KleinYuan   File: train.py    MIT License 5 votes
def feed_trainer(self, x, y, data_split_ratio):
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(x, y, test_size=(data_split_ratio[1] + data_split_ratio[2]))
        self.x_test, self.x_val, self.y_test, self.y_val = train_test_split(self.x_test, self.y_test, test_size=(data_split_ratio[2] / (data_split_ratio[1] + data_split_ratio[2])))

        self.graph = self.graph_model.get_graph()
        self.x_placeholder, self.y_placeholder, self.keep_prob_placeholder = self.graph_model.get_placeholders()
        self.writer = self.graph_model.get_writer()
        self.summary = self.graph_model.get_summary()
        self.ops = self.graph_model.get_ops()
        self.loss = self.graph_model.get_loss() 
Example 34
Project: EMNaiveBayes   Author: betterenvi   File: EMNaiveBayes.py    MIT License 5 votes
def _calc_model_accuracy(self, model, X_onehot, Y):
        X_train_onehot, X_test_onehot, Y_train, Y_test = train_test_split(
            X_onehot, Y, test_size=0.2, random_state=0)
        model.fit(X_train_onehot, Y_train)
        pred = model.predict(X_test_onehot)
        accuracy = metrics.accuracy_score(Y_test, pred)
        return accuracy 
Example 35
Project: RSV   Author: Healthcast   File: skeleton.py    GNU General Public License v2.0 5 votes
def test_SVM():
    iris = datasets.load_iris()
    iris_X = iris.data[:,:2]
    iris_y = iris.target
    X_train, X_test, y_train, y_test = train_test_split(iris_X, iris_y, test_size=.5, \
                                                        random_state=0)
    #modify kernel
    #kernels:'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'
    #modify C to control the soft margin: large C, more soft
    svc = svm.SVC(kernel='linear', C=10.0)
    svc.fit(X_train, y_train)

    #plot the results
    x_min, x_max = X_train[:, 0].min() - 1, X_train[:, 0].max() + 1
    y_min, y_max = X_train[:, 1].min() - 1, X_train[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02), np.arange(y_min, y_max,\
    0.02))

    print(len(np.c_[xx.ravel(), yy.ravel()]))
    Z = svc.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
        
    plt.contourf(xx, yy, Z )
    plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=plt.cm.Paired)

    plt.show() 
Example 36
Project: RSV   Author: Healthcast   File: skeleton.py    GNU General Public License v2.0 5 votes
def test_RF():
    
    iris = datasets.load_iris()
    iris_X = iris.data[:,:2]
    iris_y = iris.target
    X_train, X_test, y_train, y_test = train_test_split(iris_X, iris_y, test_size=.5, \
                                                        random_state=0)
    #test max_depth: the max depth of the tree
    #test n_estimators: how many trees used
    #test max_features: how many good features used in each split
    clf = RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)
    clf.fit(X_train, y_train)
    x_min, x_max = X_train[:, 0].min() - 1, X_train[:, 0].max() + 1
    y_min, y_max = X_train[:, 1].min() - 1, X_train[:, 1].max() + 1

    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02), np.arange(y_min, y_max, 0.02))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.pcolormesh(xx, yy, Z)


    plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.title("3-Class classification (k = %i, weights = '%s')"%(15, 'uniform'))
    plt.show() 
Example 37
Project: RSV   Author: Healthcast   File: methods.py    GNU General Public License v2.0 5 votes
def apply_evaluation(paras,  clf, data):
    X = data["X"]
    y = data["y"]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, \
                                                        random_state=0)

    clf.fit(X_train, y_train)
    r = clf.predict(X_test)
    plot_results(r, clf, data, paras)
    

    if paras['eva'] == 'accuracy':
        print "The accuracy:"
        print metrics.accuracy_score(y_test, r)
    elif paras['eva'] == 'precision':
        print "The precision:"
        print metrics.precision_score(y_test, r)
    elif paras['eva'] == 'recall':
        print "The recall:"
        print metrics.recall_score(y_test, r)
    elif paras['eva'] == 'confusion':
        print "The confusion matrix:"
        print metrics.confusion_matrix(y_test, r)
    elif paras['eva'] == 'report':
        print "The report:"
        print metrics.classification_report(y_test, r)
    elif paras['eva'] == 'roc' and paras['clf'] == 'svm':
        scores = clf.decision_function(X_test)
        print "The auc:"
        fpr, tpr, thresholds = metrics.roc_curve(y_test, scores)
        roc_auc = metrics.auc(fpr, tpr)
        print str(roc_auc)
        plt.figure()
        plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.show() 
Example 38
Project: RSV   Author: Healthcast   File: skeleton.py    GNU General Public License v2.0 5 votes
def test_SVM():
    iris = datasets.load_iris()
    iris_X = iris.data[:,:2]
    iris_y = iris.target
    X_train, X_test, y_train, y_test = train_test_split(iris_X, iris_y, test_size=.5, \
                                                        random_state=0)
    #modify kernel
    #kernels:'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'
    #modify C to control the soft margin: large C, more soft
    svc = svm.SVC(kernel='linear', C=10.0)
    svc.fit(X_train, y_train)

    #plot the results
    x_min, x_max = X_train[:, 0].min() - 1, X_train[:, 0].max() + 1
    y_min, y_max = X_train[:, 1].min() - 1, X_train[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02), np.arange(y_min, y_max,\
    0.02))

    print(len(np.c_[xx.ravel(), yy.ravel()]))
    Z = svc.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
        
    plt.contourf(xx, yy, Z )
    plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=plt.cm.Paired)

    plt.show() 
Example 39
Project: RSV   Author: Healthcast   File: skeleton.py    GNU General Public License v2.0 5 votes
def test_RF():
    
    iris = datasets.load_iris()
    iris_X = iris.data[:,:2]
    iris_y = iris.target
    X_train, X_test, y_train, y_test = train_test_split(iris_X, iris_y, test_size=.5, \
                                                        random_state=0)
    #test max_depth: the max depth of the tree
    #test n_estimators: how many trees used
    #test max_features: how many good features used in each split
    clf = RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)
    clf.fit(X_train, y_train)
    x_min, x_max = X_train[:, 0].min() - 1, X_train[:, 0].max() + 1
    y_min, y_max = X_train[:, 1].min() - 1, X_train[:, 1].max() + 1

    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02), np.arange(y_min, y_max, 0.02))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.pcolormesh(xx, yy, Z)


    plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.title("3-Class classification (k = %i, weights = '%s')"%(15, 'uniform'))
    plt.show() 
Example 40
Project: stratosphere-lstm   Author: mendozawow   File: neon_lstm.py    MIT License 5 votes
def split_data(data, split_pct=0.1):
        '''
        Splits data into training and testing.
        '''
        shuffle(data)
        return train_test_split(data, test_size=split_pct) 
Example 41
Project: stratosphere-lstm   Author: mendozawow   File: dga_lstm.py    MIT License 5 votes
def split_data(data, split_pct=0.1):
        '''
        Splits data into training and testing.
        '''
        return train_test_split(data, test_size=split_pct) 
Example 42
Project: stratosphere-lstm   Author: mendozawow   File: dga_lstm.py    MIT License 5 votes
def split_data(data, split_pct=0.1):
        '''
        Splits data into training and testing.
        '''
        shuffle(data)
        return train_test_split(data, test_size=split_pct) 
Example 43
Project: Rbfc   Author: b14ckfir3   File: general_functions.py    GNU General Public License v3.0 5 votes
def split_data_set(data_set, targets, test_size=.5):
    return train_test_split(data_set, targets, test_size=test_size) 
Example 44
Project: RPGOne   Author: RTHMaK   File: test_early_stopping.py    Apache License 2.0 5 votes
def testIrisES(self):
        random.seed(42)

        iris = datasets.load_iris()
        X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                            iris.target,
                                                            test_size=0.2,
                                                            random_state=42)

        X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2)
        val_monitor = skflow.monitors.ValidationMonitor(X_val, y_val, n_classes=3)

        # classifier without early stopping - overfitting
        classifier1 = skflow.TensorFlowDNNClassifier(hidden_units=[10, 20, 10],
                                                     n_classes=3, steps=1000)
        classifier1.fit(X_train, y_train)
        score1 = metrics.accuracy_score(y_test, classifier1.predict(X_test))

        # classifier with early stopping - improved accuracy on testing set
        classifier2 = skflow.TensorFlowDNNClassifier(hidden_units=[10, 20, 10],
                                                     n_classes=3, steps=1000)

        classifier2.fit(X_train, y_train, val_monitor)
        score2 = metrics.accuracy_score(y_test, classifier2.predict(X_test))

        # self.assertGreater(score2, score1, "No improvement using early stopping.") 
Example 45
Project: image-recognition   Author: zw76859420   File: knife_train.py    GNU General Public License v3.0 5 votes
def load(self , img_rows=128 , img_cols=128 , img_channels=3 , nb_classes=2):
        images , labels = load_dataset(self.pathname)
        train_images , valid_images , train_labels , valid_labels = train_test_split(images , labels , test_size=0.3 , random_state=random.randint(0 , 100))
        valid_images , test_images , valid_labels , test_labels = train_test_split(valid_images , valid_labels , test_size=0.5 , random_state=random.randint(0 , 100))

        train_images = train_images.reshape(train_images.shape[0] , img_rows , img_cols , img_channels)
        valid_images = valid_images.reshape(valid_images.shape[0] , img_rows , img_cols , img_channels)
        test_images = test_images.reshape(test_images.shape[0], img_rows, img_cols, img_channels)
        # print(valid_images.shape)

        self.input_shape = (img_rows , img_cols , img_channels)

        train_labels = np_utils.to_categorical(train_labels , num_classes=nb_classes)
        valid_labels = np_utils.to_categorical(valid_labels , num_classes=nb_classes)
        test_labels = np_utils.to_categorical(test_labels , num_classes=nb_classes)
        # print(test_labels)

        # astype returns a new array, so assign the result back
        train_images = train_images.astype('float32')
        valid_images = valid_images.astype('float32')
        test_images = test_images.astype('float32')

        train_images = train_images / 255
        valid_images = valid_images / 255
        test_images = test_images / 255

        self.train_images = train_images
        self.valid_images = valid_images
        self.train_labels = train_labels
        self.valid_labels = valid_labels
        self.test_images = test_images
        self.test_labels = test_labels 
Example 46
Project: FaceLock   Author: Donny-Hikari   File: train.py    MIT License 5 votes
def read(self, img_rows=IMAGE_SIZE, img_cols=IMAGE_SIZE, img_channels=3, nb_classes=2):
        
        images, labels = extract_data(self.TRAIN_DATA)
        labels = np.reshape(labels, [-1])
        X_train, X_test, y_train, y_test = train_test_split(images, labels, test_size=0.3, random_state=random.randint(0, 100))
        X_valid, X_test, y_valid, y_test = train_test_split(images, labels, test_size=0.5, random_state=random.randint(0, 100))
        if K.image_dim_ordering() == 'th':
            X_train = X_train.reshape(X_train.shape[0], img_channels, img_rows, img_cols)
            X_valid = X_valid.reshape(X_valid.shape[0], img_channels, img_rows, img_cols)
            X_test = X_test.reshape(X_test.shape[0], img_channels, img_rows, img_cols)
            input_shape = (img_channels, img_rows, img_cols)
        else:
            X_train = X_train.reshape(X_train.shape[0], img_rows, img_cols, img_channels)
            X_valid = X_valid.reshape(X_valid.shape[0], img_rows, img_cols, img_channels)
            X_test = X_test.reshape(X_test.shape[0], img_rows, img_cols, img_channels)
            input_shape = (img_rows, img_cols, img_channels)

        print('X_train shape:', X_train.shape)
        print(X_train.shape[0], 'train samples')
        print(X_valid.shape[0], 'valid samples')
        print(X_test.shape[0], 'test samples')

        Y_train = np_utils.to_categorical(y_train, nb_classes)
        Y_valid = np_utils.to_categorical(y_valid, nb_classes)
        Y_test = np_utils.to_categorical(y_test, nb_classes)

        X_train = X_train.astype('float32')
        X_valid = X_valid.astype('float32')
        X_test = X_test.astype('float32')
        X_train /= 255
        X_valid /= 255
        X_test /= 255

        self.X_train = X_train
        self.X_valid = X_valid
        self.X_test = X_test
        self.Y_train = Y_train
        self.Y_valid = Y_valid
        self.Y_test = Y_test 
Example 47
Project: elephant   Author: alanyuchenhou   File: estimator.py    MIT License 5 votes
def estimate(self, y, batch_size, test_size, metric, steps=math.inf):
        x, x_test, y, y_test = cross_validation.train_test_split(self.x, y, test_size=test_size)
        x_train, x_validate, y_train, y_validate = cross_validation.train_test_split(x, y, test_size=0.1)
        monitor = learn.monitors.ValidationMonitor(x_validate, y_validate, every_n_steps=(len(x_train) // batch_size),
                                                   early_stopping_rounds=1)
        estimator = learn.Estimator(self._build_model)
        estimator.fit(x_train, y_train, steps=steps, batch_size=batch_size, monitors=[monitor])
        y_predicted = estimator.predict(x_test)
        if metric == 'MAE':
            return metrics.mean_absolute_error(y_test, y_predicted)
        elif metric == 'MSE':
            return metrics.mean_squared_error(y_test, y_predicted)
        else:
            assert False 
Example 48
Project: apachecn_ml   Author: ys1305   File: RS-sklearn-rating.py    GNU General Public License v3.0 5 votes
def splitData(dataFile, test_size):
    # Load the dataset
    header = ['user_id', 'item_id', 'rating', 'timestamp']
    df = pd.read_csv(dataFile, sep='\t', names=header)

    n_users = df.user_id.unique().shape[0]
    n_items = df.item_id.unique().shape[0]

    print('Number of users = ' + str(n_users) + ' | Number of movies = ' +
          str(n_items))
    train_data, test_data = cv.train_test_split(df, test_size=test_size)
    print("数据量:", len(train_data), len(test_data))
    return df, n_users, n_items, train_data, test_data 
Example 49
Project: jingjuSingingPhraseMatching   Author: ronggong   File: xgb_classification.py    GNU Affero General Public License v3.0 5 votes
def train_test(clf, X, y, labels):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    save_results(y_test, y_pred, labels) 
Example 50
Project: deep_image_model   Author: tobegit3hub   File: multiple_gpu.py    Apache License 2.0 5 votes
def main(unused_argv):
  iris = datasets.load_iris()
  x_train, x_test, y_train, y_test = cross_validation.train_test_split(
      iris.data, iris.target, test_size=0.2, random_state=42)

  classifier = learn.Estimator(model_fn=my_model)
  classifier.fit(x_train, y_train, steps=1000)

  y_predicted = [
      p['class'] for p in classifier.predict(x_test, as_iterable=True)]
  score = metrics.accuracy_score(y_test, y_predicted)
  print('Accuracy: {0:f}'.format(score)) 
Example 51
Project: deep_image_model   Author: tobegit3hub   File: boston.py    Apache License 2.0 5 votes
def main(unused_argv):
  # Load dataset
  boston = learn.datasets.load_dataset('boston')
  x, y = boston.data, boston.target

  # Split dataset into train / test
  x_train, x_test, y_train, y_test = cross_validation.train_test_split(
      x, y, test_size=0.2, random_state=42)

  # Scale data (training set) to 0 mean and unit standard deviation.
  scaler = preprocessing.StandardScaler()
  x_train = scaler.fit_transform(x_train)

  # Build 2 layer fully connected DNN with 10, 10 units respectively.
  feature_columns = learn.infer_real_valued_columns_from_input(x_train)
  regressor = learn.DNNRegressor(
      feature_columns=feature_columns, hidden_units=[10, 10])

  # Fit
  regressor.fit(x_train, y_train, steps=5000, batch_size=1)

  # Predict and score
  y_predicted = list(
      regressor.predict(scaler.transform(x_test), as_iterable=True))
  score = metrics.mean_squared_error(y_predicted, y_test)

  print('MSE: {0:f}'.format(score)) 
Example 52
Project: deep_image_model   Author: tobegit3hub   File: iris_val_based_early_stopping.py    Apache License 2.0 5 votes
def main(unused_argv):
  iris = datasets.load_iris()
  x_train, x_test, y_train, y_test = train_test_split(
      iris.data, iris.target, test_size=0.2, random_state=42)

  x_train, x_val, y_train, y_val = train_test_split(
      x_train, y_train, test_size=0.2, random_state=42)
  val_monitor = learn.monitors.ValidationMonitor(
      x_val, y_val, early_stopping_rounds=200)

  model_dir = '/tmp/iris_model'
  clean_folder(model_dir)

  # classifier with early stopping on training data
  classifier1 = learn.DNNClassifier(
      feature_columns=learn.infer_real_valued_columns_from_input(x_train),
      hidden_units=[10, 20, 10], n_classes=3, model_dir=model_dir)
  classifier1.fit(x=x_train, y=y_train, steps=2000)
  predictions1 = list(classifier1.predict(x_test, as_iterable=True))
  score1 = metrics.accuracy_score(y_test, predictions1)

  model_dir = '/tmp/iris_model_val'
  clean_folder(model_dir)

  # classifier with early stopping on validation data, save frequently for
  # monitor to pick up new checkpoints.
  classifier2 = learn.DNNClassifier(
      feature_columns=learn.infer_real_valued_columns_from_input(x_train),
      hidden_units=[10, 20, 10], n_classes=3, model_dir=model_dir,
      config=tf.contrib.learn.RunConfig(save_checkpoints_secs=1))
  classifier2.fit(x=x_train, y=y_train, steps=2000, monitors=[val_monitor])
  predictions2 = list(classifier2.predict(x_test, as_iterable=True))
  score2 = metrics.accuracy_score(y_test, predictions2)

  # In many applications, the score is improved by using early stopping
  print('score1: ', score1)
  print('score2: ', score2)
  print('score2 > score1: ', score2 > score1) 
Example 53
Project: deep_image_model   Author: tobegit3hub   File: hdf5_classification.py    Apache License 2.0 5 votes
def main(unused_argv):
  # Load dataset.
  iris = learn.datasets.load_dataset('iris')
  x_train, x_test, y_train, y_test = cross_validation.train_test_split(
      iris.data, iris.target, test_size=0.2, random_state=42)

  # Note that we are saving and loading iris data in h5 format as a simple
  # demonstration here.
  h5f = h5py.File('/tmp/test_hdf5.h5', 'w')
  h5f.create_dataset('X_train', data=x_train)
  h5f.create_dataset('X_test', data=x_test)
  h5f.create_dataset('y_train', data=y_train)
  h5f.create_dataset('y_test', data=y_test)
  h5f.close()

  h5f = h5py.File('/tmp/test_hdf5.h5', 'r')
  x_train = np.array(h5f['X_train'])
  x_test = np.array(h5f['X_test'])
  y_train = np.array(h5f['y_train'])
  y_test = np.array(h5f['y_test'])

  # Build 3 layer DNN with 10, 20, 10 units respectively.
  feature_columns = learn.infer_real_valued_columns_from_input(x_train)
  classifier = learn.DNNClassifier(
      feature_columns=feature_columns, hidden_units=[10, 20, 10], n_classes=3)

  # Fit and predict.
  classifier.fit(x_train, y_train, steps=200)
  score = metrics.accuracy_score(y_test, classifier.predict(x_test))
  print('Accuracy: {0:f}'.format(score)) 
Example 54
Project: deep_image_model   Author: tobegit3hub   File: iris_custom_model.py    Apache License 2.0 5 votes
def main(unused_argv):
  iris = datasets.load_iris()
  x_train, x_test, y_train, y_test = cross_validation.train_test_split(
      iris.data, iris.target, test_size=0.2, random_state=42)

  classifier = learn.Estimator(model_fn=my_model)
  classifier.fit(x_train, y_train, steps=1000)

  y_predicted = [
      p['class'] for p in classifier.predict(x_test, as_iterable=True)]
  score = metrics.accuracy_score(y_test, y_predicted)
  print('Accuracy: {0:f}'.format(score)) 
Example 55
Project: deep_segment   Author: JoshuaEbenezer   File: ISIC_dataset.py    GNU General Public License v3.0 5 votes
def train_val_split(train_list, train_labels, seed, val_split = 0.20):
    train_list, val_list, train_label, val_label = train_test_split(train_list, train_labels, test_size=val_split, stratify=train_labels, random_state=seed)
    return train_list, val_list, train_label, val_label 
Example 56
Project: sonic_contest   Author: flyyufelix   File: train_level_classifier.py    MIT License 5 votes
def train_covnet(nb_epoch=3, size=(224,224)):

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True # pylint: disable=E1101

    img_rows, img_cols = size
    batch_size = 16
    random_state = random_seed
    num_class = 27
    color_type = 1

    train_data, train_target, train_target_vec, train_id = read_and_normalize_and_shuffle_train_data(img_rows, img_cols, color_type,random_seed)

    X_train, X_valid, Y_train, Y_valid = train_test_split(train_data, train_target, test_size=0.1)

    model = resnet50_model(img_rows, img_cols, color_type, num_class)

    model.fit(
        X_train,
        Y_train,
        epochs=nb_epoch,
        batch_size=batch_size,
        validation_data=(X_valid, Y_valid)
    )

    save_model(model, 'level_classifier', save_weights=True) 
Example 57
Project: SANS_THIR16   Author: endgameinc   File: dga_classifier.py    MIT License 5 votes
def cross_validate(fts, labels, clf, nfolds):
    scores = []
    true_labels = []
    for fold in range(nfolds):
        X_train, X_test, y_train, y_test = train_test_split(fts, labels, test_size=.2)
        clf.fit(X_train, y_train)

        scores.append(clf.predict_proba(X_test)[:,1])
        true_labels.append(y_test)
    ret = {}
    ret['fpr'], ret['tpr'], ret['thr'] = roc_curve(np.array(true_labels).ravel(), np.array(scores).ravel())
    ret['auc'] = auc(ret['fpr'], ret['tpr'])
    print(ret['auc'])
    return ret 
Example 58
Project: BRISE   Author: dpukhkaiev   File: regression.py    MIT License 5 votes
def __init__(self, file_name, train_size, target, features, indices):
        del self.dict[:]
        del self.indices[:]
        for i in indices:
            self.indices.append(i)

        subset_target = []
        subset_features = []
        for i in self.indices:
            subset_target.append(target[i])
            subset_features.append(features[i])

        self.train_size = train_size

        '''
        kf = cross_validation.KFold(n=len(data), n_folds=10, shuffle=True )
        for train_index, test_index in kf:
            for i in train_index:
                self.feature_train.append(features[i])
                self.target_train.append(target[i])
            for i in test_index:
                self.feature_test.append(features[i])
                self.target_test.append(target[i])
        '''
        # print subset_target
        # print "***************"
        # print subset_features
        self.feature_train, self.feature_test, self.target_train, self.target_test = \
        cross_validation.train_test_split(subset_features, subset_target, train_size=train_size)
        old_indices = []
        return 
Example 59
Project: samples   Author: tsaqib   File: categorical_dnn.py    MIT License 5 votes
def _shuffle_split(self):
        # iloc returns a new frame, so assign the shuffled rows back
        self._raw_data = self._raw_data.iloc[np.random.permutation(len(self._raw_data))]
        self._testdata, self._traindata = train_test_split(self._raw_data, test_size=self._training_size)

        # TF Learn / TensorFlow only takes int32 / int64 at the moment as oppose to int8
        self._train_label = [int(row) for row in self._traindata[self._raw_data.columns[self._datadim - 1]]]
        self._test_label = [int(row) for row in self._testdata[self._raw_data.columns[self._datadim - 1]]]
        self._traindata = self._traindata.ix[:, range(self._datadim - 1)]
        self._testdata = self._testdata.ix[:, range(self._datadim - 1)] 
Example 60
Project: frankenstein   Author: hunterowens   File: test.py    Apache License 2.0 5 votes vote down vote up
def train(classifier, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)
    classifier.fit(X_train, y_train)
    print "Accuracy: %s" % classifier.score(X_test, y_test)
    return classifier 
Example 62
Project: Recipes   Author: Lasagne   File: utils.py    MIT License 4 votes vote down vote up
def load_pickle_data_cv():
    fo_1 = open('data/cifar-10-batches-py/data_batch_1', 'rb')
    fo_2 = open('data/cifar-10-batches-py/data_batch_2', 'rb')
    fo_3 = open('data/cifar-10-batches-py/data_batch_3', 'rb')
    fo_4 = open('data/cifar-10-batches-py/data_batch_4', 'rb')
    fo_5 = open('data/cifar-10-batches-py/data_batch_5', 'rb')
    dict_1 = cPickle.load(fo_1)
    fo_1.close()
    dict_2 = cPickle.load(fo_2)
    fo_2.close()
    dict_3 = cPickle.load(fo_3)
    fo_3.close()
    dict_4 = cPickle.load(fo_4)
    fo_4.close()
    dict_5 = cPickle.load(fo_5)
    fo_5.close()
    data_1 = dict_1['data']
    data_2 = dict_2['data']
    data_3 = dict_3['data']
    data_4 = dict_4['data']
    data_5 = dict_5['data']
    labels_1 = dict_1['labels']
    labels_2 = dict_2['labels']
    labels_3 = dict_3['labels']
    labels_4 = dict_4['labels']
    labels_5 = dict_5['labels']

    X_train = np.vstack((data_1, data_2, data_3, data_4, data_5))
    y_train = np.hstack((labels_1, labels_2, labels_3, labels_4, labels_5)).astype('int32')

    X_train, y_train = shuffle(X_train, y_train)

    X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.1)

    X_train = X_train.reshape(X_train.shape[0], 3, PIXELS, PIXELS).astype('float32')
    X_test = X_test.reshape(X_test.shape[0], 3, PIXELS, PIXELS).astype('float32')

    # subtract per-pixel mean
    pixel_mean = np.mean(X_train, axis=0)
    print pixel_mean
    np.save('data/pixel_mean.npy', pixel_mean)
    X_train -= pixel_mean
    X_test -= pixel_mean

    return X_train, X_test, y_train, y_test 
Example 63
Project: pohmm-keystroke   Author: vmonaco   File: plotting.py    BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
def gen_roc():
    from sklearn import svm, datasets
    from sklearn.metrics import roc_curve, auc
    from sklearn.cross_validation import train_test_split
    from sklearn.preprocessing import label_binarize
    from sklearn.multiclass import OneVsRestClassifier

    # Import some data to play with
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target

    # Binarize the output
    y = label_binarize(y, classes=[0, 1, 2])
    n_classes = y.shape[1]

    # Add noisy features to make the problem harder
    random_state = np.random.RandomState(0)
    n_samples, n_features = X.shape
    X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]

    # shuffle and split training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5,
                                                        random_state=0)

    # Learn to predict each class against the other
    classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True,
                                             random_state=random_state))
    y_score = classifier.fit(X_train, y_train).decision_function(X_test)

    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    thresh = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], thresh[i] = roc_curve(y_test[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    far = fpr[0]
    frr = 1 - tpr[0]
    roc = pd.DataFrame({'threshold': thresh[0], 'far': far, 'frr': frr})
    roc['threshold'] = (roc['threshold'] - roc['threshold'].min()) / (roc['threshold'].max() - roc['threshold'].min())
    return roc 
Example 64
Project: robot-navigation   Author: ronaldahmed   File: utils.py    MIT License 4 votes vote down vote up
def get_folds_vDev(dir='data/',  val=0.1, force=False):
	pickle_file = 'folds_vDev.pickle'
	filename = os.path.join(dir,pickle_file)
	folds = []
	if force or not os.path.exists(filename):
		# Make pickle object
		dataByMap = get_data()
		map_names = dataByMap.keys()
		n_names = len(map_names)
		# Iteration over folds
		for i in range(n_names):
			# reset arrays
			train_set = []
			valid_set = []
			complete_set = []	# for universal vocab
			#
			test_single_set = dataByMap[map_names[i]].samples
			test_multi_set  = dataByMap[map_names[i]].get_multi_sentence_samples()
			for j in range(n_names):
				if j != i:
					# shuffle data before splitting
					data = np.array(dataByMap[map_names[j]].samples)	# shuffle in separate array, preserve order for multi_sentence building
					np.random.shuffle(data)
					# split into training and validation sets
					train_samples,valid_samples = train_test_split(	data,
																	test_size=val,
																	random_state = SEED)
					train_set.extend(train_samples)
					valid_set.extend(valid_samples)
					complete_set.extend(data)
			# Reformat to word index
			#vocabulary = getVocabulary(train_set)
			vocabulary = getVocabulary(complete_set) # universal vocabulary
			train_set 			= reformat_wordid(train_set		,vocabulary)
			valid_set 			= reformat_wordid(valid_set		,vocabulary)
			test_single_set 	= reformat_wordid(test_single_set,vocabulary)
			#   for multi sentences
			temp = []
			for parag in test_multi_set:
				temp.append(reformat_wordid(parag,vocabulary))
			test_multi_set = temp
			# shuffle between maps
			np.random.shuffle(train_set)
			np.random.shuffle(valid_set)
			np.random.shuffle(test_single_set)
			np.random.shuffle(test_multi_set)
			#END-FOR-TRAIN-VAL-SPLIT
			folds.append( Fold(train_set,valid_set,test_single_set,test_multi_set,vocabulary) )
		#END-FOR-FOLDS
		print('Pickling %s.' % filename)
		try:
			with open(filename, 'wb') as f:
				pickle.dump(folds, f, pickle.HIGHEST_PROTOCOL)
		except Exception as e:
			print('Unable to save data to', filename, ':', e)
	else:
		with open(filename, 'rb') as f:
			folds = pickle.load(f)
			print('%s read from pickle...' % filename)
	return folds 
Example 65
Project: Botnets   Author: sabersf   File: botnet_tf.py    MIT License 4 votes vote down vote up
def create_inp_two_sectors(first, second):
    #first sector and the second sector
    first_ent = sector_columns(first)
    second_ent = sector_columns(second)
    #we have to keep track of the items we picked so we don't pick them again
    first_flag = [False for i in range(len(first_ent))]
    second_flag = [False for i in range(len(second_ent))]
    #Min number of items we have for both entities
    #We want to pick the same number from both sectors
    min_num = min(len(first_ent), len(second_ent))
    #keep count on the number of items we picked from each sector
    count_first = 0
    count_second = 0
    #Input and output for the DNN algorithms we want to use later
    Y = []
    X = []
    #Continue this loop until we have the same number from both sectors
    while (count_first + count_second)  < (min_num*2):
        #pick a random sector: first or second
        priority = random.randint(0,1)
        if priority == 0:
            #The first sector it is! we have to set Y to 0
            if count_first >= min_num:
                continue
            else:
                found = False
                while(found == False):
                    i = random.randint(0,len(first_ent) - 1)
                    if first_flag[i] == False:
                        found = True
                        first_flag[i] = True
                        X.append(first_ent[i])
                Y.append([0])
                count_first += 1
        else:
            #The second sector is picked! We have to set Y to 1
            if count_second >= min_num:
                continue
            else:
                found = False
                while(found == False):
                    i = random.randint(0,len(second_ent) - 1)
                    if second_flag[i] == False:
                        found = True
                        second_flag[i] = True
                        X.append(second_ent[i])
                Y.append([1])
                count_second += 1
    X = np.array(X)
    Y = np.array(Y)
    return train_test_split(X, Y, test_size=0.1, random_state=42)


# In[180]:

#Define a dnn model using TFlearn 
Example 66
Project: HDLTex   Author: kk7nc   File: Data_helper.py    MIT License 4 votes vote down vote up
def loadData():
    WOS.download_and_extract()
    fname = os.path.join(path_WOS,"WebOfScience/WOS5736/X.txt")
    fnamek = os.path.join(path_WOS,"WebOfScience/WOS5736/YL1.txt")
    fnameL2 = os.path.join(path_WOS,"WebOfScience/WOS5736/YL2.txt")
    with open(fname) as f:
        content = f.readlines()
        content = [text_cleaner(x) for x in content]
    with open(fnamek) as fk:
        contentk = fk.readlines()
    contentk = [x.strip() for x in contentk]
    with open(fnameL2) as fk:
        contentL2 = fk.readlines()
        contentL2 = [x.strip() for x in contentL2]
    Label = np.matrix(contentk, dtype=int)
    Label = np.transpose(Label)
    number_of_classes_L1 = np.max(Label)+1  # number of classes in Level 1

    Label_L2 = np.matrix(contentL2, dtype=int)
    Label_L2 = np.transpose(Label_L2)
    np.random.seed(7)
    print(Label.shape)
    print(Label_L2.shape)
    Label = np.column_stack((Label, Label_L2))

    number_of_classes_L2 = np.zeros(number_of_classes_L1,dtype=int)

    X_train, X_test, y_train, y_test  = train_test_split(content, Label, test_size=0.2,random_state= 0)

    vectorizer_x = CountVectorizer()
    X_train = vectorizer_x.fit_transform(X_train).toarray()
    X_test = vectorizer_x.transform(X_test).toarray()

    L2_Train = []
    L2_Test = []
    content_L2_Train = []
    content_L2_Test = []

    for i in range(0, number_of_classes_L1):
        L2_Train.append([])
        L2_Test.append([])
        content_L2_Train.append([])
        content_L2_Test.append([])


    for i in range(0, X_train.shape[0]):
        L2_Train[y_train[i, 0]].append(y_train[i, 1])
        number_of_classes_L2[y_train[i, 0]] = max(number_of_classes_L2[y_train[i, 0]],(y_train[i, 1]+1))
        content_L2_Train[y_train[i, 0]].append(X_train[i])

    for i in range(0, X_test.shape[0]):
        L2_Test[y_test[i, 0]].append(y_test[i, 1])
        content_L2_Test[y_test[i, 0]].append(X_test[i])

    for i in range(0, number_of_classes_L1):
        L2_Train[i] = np.array(L2_Train[i])
        L2_Test[i] = np.array(L2_Test[i])
        content_L2_Train[i] = np.array(content_L2_Train[i])
        content_L2_Test[i] = np.array(content_L2_Test[i])
    return (X_train,y_train,X_test,y_test,content_L2_Train,L2_Train,content_L2_Test,L2_Test,number_of_classes_L2) 
Example 67
Project: fairtest   Author: columbia   File: holdout.py    Apache License 2.0 4 votes vote down vote up
def __init__(self, data, budget=1, conf=0.95, train_size=0.5,
                 random_state=0):
        """
        Prepares a dataset for FairTest investigations. Encodes categorical
        features as numbers and separates the data into a training set and a
        holdout set.

        Parameters
        ----------
        data :
            the dataset to use
        budget :
            the maximal number of adaptive investigations that will be performed
        conf :
            overall family-wide confidence
        train_size :
            the number (or fraction) of data samples to use as a training set
        random_state :
            a random seed to be used for the random train-test split
        """
        if data is not None:
            if not isinstance(data, pd.DataFrame):
                raise ValueError('data should be a Pandas DataFrame')

            data = data.copy()

            if budget < 1:
                raise ValueError("budget parameter should be a positive "
                                 "integer")

            if not 0 < conf < 1:
                raise ValueError('conf should be in (0,1), Got %s' % conf)

            # encode categorical features
            encoders = {}
            for column in data.columns:
                if data.dtypes[column] == np.object:
                    encoders[column] = LabelEncoder()
                    data[column] = encoders[column].fit_transform(data[column])
                    logging.info('Encoding Feature %s' % column)

            train_data, test_data = cv_split(data, train_size=train_size,
                                             random_state=random_state)

            logging.info('Training Size %d' % len(train_data))

            holdout = Holdout(test_data, budget, conf)

            self.train_data = train_data
            self.holdout = holdout
            self.encoders = encoders 
Example 68
Project: Flu-Prediction   Author: RK900   File: Flu-Tree.py    GNU General Public License v3.0 4 votes vote down vote up
def predictFluSeq(seqs): # Seqs is the file path of your FASTA files
    #returns cross-val scores and MSE
    X0 = []

    # adding to X and y

    for i in range(0, len(seqs) - 1):
        X0.append(seqs[i].seq)

    y0 = []
    for j in range(1, len(seqs)):
        y0.append(seqs[j].seq)

    from Encoding_v2 import encoding

    # Encoding letters into numbers

    X = []
    for k in range(len(X0)):
        encoded_X = encoding(X0[k])
        X.append(encoded_X)

    y = []
    for l in range(len(y0)):
        encoded_y = encoding(y0[l])
        y.append(encoded_y)

    from sklearn import ensemble, cross_validation, metrics

    # Cross-Validation
    rfr = ensemble.RandomForestRegressor()
    rfrscores = cross_validation.cross_val_score(rfr, X, y, cv=2)

    cv_score = ("Random Forests cross-validation score", rfrscores)
    avg_cv_score = ("Average Cross-Val Accuracy: %0.2f (+/- %0.2f)" % (rfrscores.mean()*100, rfrscores.std() *100))

    # Mean Squared Error
    X_train,X_test,y_train,y_test = cross_validation.train_test_split(X,y,test_size=0.5,random_state=50)

    rfr.fit(X_train,y_train)
    y_predicted = rfr.predict(X_test)
    mse_score = ('Random Forests MSE:', metrics.mean_squared_error(y_test,y_predicted))

    return cv_score, avg_cv_score, mse_score 
Example 69
Project: message-author-classifier   Author: IvayloAtanasov   File: vectorize.py    MIT License 4 votes vote down vote up
def vectorize_and_get_classifier(trainset_limit=0):
    authors = pickle.load(open(os.path.join(BASE_PATH, 'authors.pkl'), 'rb'))
    messages = pickle.load(open(os.path.join(BASE_PATH, 'messages.pkl'), 'rb'))

    # print messages
    #pp = pprint.PrettyPrinter(indent=4)
    #pp.pprint(messages)

    # split into testing and training sets
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(messages, authors, test_size=0.1, random_state=42)

    bulgarian_stopwords = stopwords.words('bulgarian')

    # build tf-idf vectorizer
    #   ignore bulgarian stopwords
    #   ignore words with document frequency > 0.5
    # TODO: almost no words that are frequent through our dataset. max_df=0.01 barely has effect :) is it a problem?
    vectorizer = TfidfVectorizer(stop_words=bulgarian_stopwords, max_df=0.5)
    # build tf-idf matrix
    # ref: https://en.wikipedia.org/wiki/Tf%E2%80%93idf
    # learn vocabulary on training set, and return training matrix
    features_train = vectorizer.fit_transform(features_train)
    # return testing matrix with the vocabulary learned from the training set
    features_test = vectorizer.transform(features_test).toarray()

    # limit data volume, useful for development
    if trainset_limit != 0:
        features_train = features_train[:trainset_limit].toarray()
        labels_train = labels_train[:trainset_limit]

    # print tf-idf matrix length
    print('tf-idf matrix length: ' + str(len(vectorizer.get_feature_names())))

    # use DT classifier
    clf = DecisionTreeClassifier()
    clf.fit(features_train, labels_train)

    # print classifier accuracy
    print('dt accuracy: ' + str(clf.score(features_test, labels_test)))
    # find most important word index in list
    most_important_feature_index = numpy.argmax(clf.feature_importances_)
    # print the most important feature and its importance coefficient
    feature = vectorizer.get_feature_names()[most_important_feature_index]
    feature_importance = clf.feature_importances_[most_important_feature_index]
    print('most important feature: ' + feature + ' with importance index of ' + str(feature_importance))

    return clf, vectorizer 
Example 70
Project: Identificador-Fraude-Enron   Author: luisneto98   File: email_preprocess.py    MIT License 4 votes vote down vote up
def preprocess(words_file = "../tools/word_data.pkl", authors_file="../tools/email_authors.pkl"):
    """ 
        this function takes a pre-made list of email texts (by default word_data.pkl)
        and the corresponding authors (by default email_authors.pkl) and performs
        a number of preprocessing steps:
            -- splits into training/testing sets (10% testing)
            -- vectorizes into tfidf matrix
            -- selects/keeps most helpful features

        after this, the features and labels are put into numpy arrays, which play nicely with sklearn functions

        4 objects are returned:
            -- training/testing features
            -- training/testing labels

    """

    ### the words (features) and authors (labels), already largely preprocessed
    ### this preprocessing will be repeated in the text learning mini-project
    authors_file_handler = open(authors_file, "r")
    authors = pickle.load(authors_file_handler)
    authors_file_handler.close()

    words_file_handler = open(words_file, "r")
    word_data = cPickle.load(words_file_handler)
    words_file_handler.close()

    ### test_size is the percentage of events assigned to the test set
    ### (remainder go into training)
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(word_data, authors, test_size=0.1, random_state=42)



    ### text vectorization--go from strings to lists of numbers
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed  = vectorizer.transform(features_test)



    ### feature selection, because text is super high dimensional and 
    ### can be really computationally chewy as a result
    selector = SelectPercentile(f_classif, percentile=1)
    selector.fit(features_train_transformed, labels_train)
    features_train_transformed = selector.transform(features_train_transformed).toarray()
    features_test_transformed  = selector.transform(features_test_transformed).toarray()

    ### info on the data
    print ("no. of Chris training emails:", sum(labels_train))
    print ("no. of Sara training emails:", len(labels_train)-sum(labels_train))
    
    return features_train_transformed, features_test_transformed, labels_train, labels_test 
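
A usage sketch for the function above; the default pickle paths are the project's, and the classifier here is only illustrative:

from sklearn.naive_bayes import GaussianNB

features_train, features_test, labels_train, labels_test = preprocess()
clf = GaussianNB()
clf.fit(features_train, labels_train)
print("test accuracy:", clf.score(features_test, labels_test))
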
Example 71
Project: kaggle_dsb   Author: syagev   File: data.py    Apache License 2.0 4 votes vote down vote up
def load():

    tps = glob.glob(dataset_dir+"/*true.jpg")
    fps_2 = glob.glob(dataset_dir+"/*false.jpg")
    fps = np.random.choice(fps_2,10000)
    images_tps = [[imread(x)] for x in tps]
    images_fps = [[imread(x)] for x in fps]
    labels = np.concatenate((np.ones((len(images_tps))),np.zeros((len(images_fps))))).astype("ubyte")
    images = np.concatenate((images_tps,images_fps)).astype("float32")
    train_X, test_X, train_y, test_y = train_test_split(images,labels, test_size=0.4, random_state=1337)
    half = int(0.5*len(test_X))
    val_X = test_X[:half]
    val_y = test_y[:half]
    test_X = test_X[half:]
    test_y = test_y[half:]
    label_to_names = {0:"false",1:"true"}

    # training set, batches 1-4
    # train_X = np.zeros((40000, 3, 32, 32), dtype="float32")
    # train_y = np.zeros((40000, 1), dtype="ubyte").flatten()
    # n_samples = 10000 # number of samples per batch
    # for i in range(0,4):
    #     f = open(os.path.join(dataset_dir, "data_batch_"+str(i+1)+""), "rb")
    #     cifar_batch = pickle.load(f)
    #     f.close()
    #     train_X[i*n_samples:(i+1)*n_samples] = (cifar_batch['data'].reshape(-1, 3, 32, 32) / 255.).astype("float32")
    #     train_y[i*n_samples:(i+1)*n_samples] = np.array(cifar_batch['labels'], dtype='ubyte')
    #
    # # validation set, batch 5
    # f = open(os.path.join(dataset_dir, "data_batch_5"), "rb")
    # cifar_batch_5 = pickle.load(f)
    # f.close()
    # val_X = (cifar_batch_5['data'].reshape(-1, 3, 32, 32) / 255.).astype("float32")
    # val_y = np.array(cifar_batch_5['labels'], dtype='ubyte')
    #
    # # labels
    # f = open(os.path.join(dataset_dir, "batches.meta"), "rb")
    # cifar_dict = pickle.load(f)
    # label_to_names = {k:v for k, v in zip(range(10), cifar_dict['label_names'])}
    # f.close()
    #
    # # test set
    # f = open(os.path.join(dataset_dir, "test_batch"), "rb")
    # cifar_test = pickle.load(f)
    # f.close()
    # test_X = (cifar_test['data'].reshape(-1, 3, 32, 32) / 255.).astype("float32")
    # test_y = np.array(cifar_test['labels'], dtype='ubyte')
    #
    #
    # print("training set size: data = {}, labels = {}".format(train_X.shape, train_y.shape))
    # print("validation set size: data = {}, labels = {}".format(val_X.shape, val_y.shape))
    # print("test set size: data = {}, labels = {}".format(test_X.shape, test_y.shape))
    #
    return train_X, train_y, val_X, val_y, test_X, test_y, label_to_names 
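
The 60/20/20 train/validation/test partition built above can also be produced with two chained train_test_split calls, which avoids slicing with a computed midpoint; a sketch using the same variable names:

train_X, rest_X, train_y, rest_y = train_test_split(images, labels,
                                                    test_size=0.4, random_state=1337)
val_X, test_X, val_y, test_y = train_test_split(rest_X, rest_y,
                                                test_size=0.5, random_state=1337)
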
Example 72
Project: patient-viz   Author: nyuvis   File: train.py    MIT License 4 votes vote down vote up
def buildmodel(cohort, model, validPercentage, seed, modeloutput, overwrite):
    trainsety, trainsetx, testsety, testsetx, header = parsedata(cohort)

    if model == 'reg':
        # c_list can come from a config file eventually.
        c_list = [0.01, 0.1, 1, 10, 100]
        total = int(np.floor(100.0/validPercentage))
        score_array = np.zeros((total, len(c_list)), dtype='float')
        for run_ix in range(0,total):
            X_train, X_valid, y_train, y_valid = cross_validation.train_test_split(trainsetx, trainsety, test_size=(validPercentage/100.0), random_state=seed+run_ix)
            for (c_ix, c) in enumerate(c_list):
                #more parametrization of model can come from some config file eventually.
                model_c  = linear_model.LogisticRegression(penalty='l1', C=c, fit_intercept=True, class_weight='auto')
                model_c.fit(X_train,y_train)
                Ypred_valid = model_c.predict_proba(X_valid)
                # evaluation metric could come from a config file eventually. AUC is commonly used, so we use it here
                fprs, tprs, thresholdss = roc_curve(y_valid, Ypred_valid[:,1])
                score_c = auc(fprs,tprs)
                score_array [run_ix, c_ix] = score_c

        mean_scores = score_array.mean(axis=0)
        mean_scores_ix = np.argmax(mean_scores)
        best_c = c_list[mean_scores_ix]
        #now train on the entire train set, using best c:
        model_best_c  = linear_model.LogisticRegression(penalty='l1', C=best_c, fit_intercept=True, class_weight='auto')
        model_best_c.fit(trainsetx,trainsety)
        #----
        Ypred_test = model_best_c.predict_proba(testsetx)
        fprs, tprs, thresholdss = roc_curve(testsety, Ypred_test[:,1])
        Ypred_train = model_best_c.predict_proba(trainsetx)
        fprt, tprt, thresholdst = roc_curve(trainsety, Ypred_train[:,1])
        print('score on unseen test set is: ', auc(fprs,tprs), file=sys.stderr)
        print('training score on this set was: ', auc(fprt,tprt), file=sys.stderr)
        print("best average score during cross validation was:", mean_scores[mean_scores_ix], "with c =", best_c, file=sys.stderr)
        #----
        print('saving the model in directory: ', modeloutput, file=sys.stderr)
        if not os.path.exists(modeloutput):
            os.makedirs(modeloutput)
        save_name = getsavefile(modeloutput + "/reg_model_scklearn", ".pkl", overwrite)
        cPickle.dump(model_best_c, open(save_name, 'wb'), -1)
        save_name = getsavefile(modeloutput + "/reg_model_weights", ".txt", overwrite)
        np.savetxt(save_name, model_best_c.coef_, delimiter=',', header=','.join(header), comments='')
        save_name = getsavefile(modeloutput + "/reg_model_bias", ".txt", overwrite)
        np.savetxt(save_name, model_best_c.intercept_)
    elif model == 'SVM' or model == 'randForest':
        print('{0} model not implemented yet'.format(model), file=sys.stderr)
        exit(1)
    else:
        print('unknown model {0}'.format(model), file=sys.stderr)
        exit(1) 
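
The nested loops above amount to a repeated-random-split search over c_list scored by AUC. With the same legacy API this could also be expressed with GridSearchCV and ShuffleSplit; a sketch, assuming trainsetx, trainsety, c_list, total, validPercentage and seed as defined in the function:

from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import ShuffleSplit
from sklearn import linear_model

cv = ShuffleSplit(len(trainsety), n_iter=total,
                  test_size=validPercentage / 100.0, random_state=seed)
search = GridSearchCV(linear_model.LogisticRegression(penalty='l1', class_weight='auto'),
                      param_grid={'C': c_list}, scoring='roc_auc', cv=cv)
search.fit(trainsetx, trainsety)
best_c = search.best_params_['C']
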
Example 73
Project: convenience_py   Author: ronrest   File: csv2train_test.py    Apache License 2.0 4 votes vote down vote up
def csv2train_test(file, y_col=None, test=0.3,
           sep=",",
           skip_header=0, skip_footer=0,
           missing_values={"NA", "NAN", "N/A"},
           filling_values=np.nan,
           seed=None):
    """
    Takes a csv file and creates a tuple of arrays containing the data.
        X_train, Y_train, X_test, Y_test
    or if no y_col is specified, then:
        X_train, X_test

    NOTE: The rows are automatically shuffled.

    :param file: {string}
        file path to the csv file
    :param y_col: {int}(default=None)
        The column in the data containing the output labels. If this file
        doesn't contain any output labels, then use None.
    :param test: {float greater than 0.0 and less than 1.0}(default=0.3)
        proportion of the data to assign to the test set.
    :param sep: {str}(default=",")
        delimiter used to separate columns.
    :param skip_header: {int}(default=0)
        Skip this many rows from the top
    :param skip_footer: {int}(default=0)
        Skip this many rows from the end.
    :param missing_values: {set of strings} (default={"NA", "NAN", "N/A"})
        The set of characters to recognise as missing values
    :param filling_values: (default = np.nan)
        what to replace missing values with.
    :param seed: {int or None}(default = None)
        Set the random seed if you want reproducible results
    :return: {tuple of numpy arrays}
        If a y_col is specified, then it returns
            X_train, Y_train, X_test, Y_test
        Otherwise it returns:
            X_train, X_test
    """
    # ==========================================================================
    data = csv2arrays(file=file, y_col=y_col, shuffle=False,
               sep=sep,
               skip_header=skip_header, skip_footer=skip_footer,
               missing_values=missing_values,
               filling_values=filling_values,
               seed=seed)

    if y_col is None:
        return train_test_split(data, test_size=test, random_state=seed)
    else:
        X_train, X_test, \
        Y_train, Y_test = train_test_split(data[0], data[1], test_size=test,
                                           random_state=seed)
        return X_train, Y_train, X_test, Y_test 
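
A usage sketch; the file name and label column below are hypothetical:

# assume a CSV whose last column (index 4) holds the labels
X_train, Y_train, X_test, Y_test = csv2train_test("iris.csv", y_col=4,
                                                  test=0.25, seed=42)
print(X_train.shape, X_test.shape)
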
Example 74
Project: luna16   Author: gzuidhof   File: data.py    BSD 2-Clause "Simplified" License 4 votes vote down vote up
def load():

    tps = glob.glob(dataset_dir+"/*true.jpg")
    fps_2 = glob.glob(dataset_dir+"/*false.jpg")
    fps = np.random.choice(fps_2,10000)
    images_tps = [[imread(x)] for x in tps]
    images_fps = [[imread(x)] for x in fps]
    labels = np.concatenate((np.ones((len(images_tps))),np.zeros((len(images_fps))))).astype("ubyte")
    images = np.concatenate((images_tps,images_fps)).astype("float32")
    train_X, test_X, train_y, test_y = train_test_split(images,labels, test_size=0.4, random_state=1337)
    half = int(0.5*len(test_X))
    val_X = test_X[:half]
    val_y = test_y[:half]
    test_X = test_X[half:]
    test_y = test_y[half:]
    label_to_names = {0:"false",1:"true"}

    # training set, batches 1-4
    # train_X = np.zeros((40000, 3, 32, 32), dtype="float32")
    # train_y = np.zeros((40000, 1), dtype="ubyte").flatten()
    # n_samples = 10000 # number of samples per batch
    # for i in range(0,4):
    #     f = open(os.path.join(dataset_dir, "data_batch_"+str(i+1)+""), "rb")
    #     cifar_batch = pickle.load(f)
    #     f.close()
    #     train_X[i*n_samples:(i+1)*n_samples] = (cifar_batch['data'].reshape(-1, 3, 32, 32) / 255.).astype("float32")
    #     train_y[i*n_samples:(i+1)*n_samples] = np.array(cifar_batch['labels'], dtype='ubyte')
    #
    # # validation set, batch 5
    # f = open(os.path.join(dataset_dir, "data_batch_5"), "rb")
    # cifar_batch_5 = pickle.load(f)
    # f.close()
    # val_X = (cifar_batch_5['data'].reshape(-1, 3, 32, 32) / 255.).astype("float32")
    # val_y = np.array(cifar_batch_5['labels'], dtype='ubyte')
    #
    # # labels
    # f = open(os.path.join(dataset_dir, "batches.meta"), "rb")
    # cifar_dict = pickle.load(f)
    # label_to_names = {k:v for k, v in zip(range(10), cifar_dict['label_names'])}
    # f.close()
    #
    # # test set
    # f = open(os.path.join(dataset_dir, "test_batch"), "rb")
    # cifar_test = pickle.load(f)
    # f.close()
    # test_X = (cifar_test['data'].reshape(-1, 3, 32, 32) / 255.).astype("float32")
    # test_y = np.array(cifar_test['labels'], dtype='ubyte')
    #
    #
    # print("training set size: data = {}, labels = {}".format(train_X.shape, train_y.shape))
    # print("validation set size: data = {}, labels = {}".format(val_X.shape, val_y.shape))
    # print("test set size: data = {}, labels = {}".format(test_X.shape, test_y.shape))
    #
    return train_X, train_y, val_X, val_y, test_X, test_y, label_to_names 
Example 75
Project: keras-cnn-text-classify   Author: GINK03   File: model.py    MIT License 4 votes vote down vote up
def init_train():
  print('Loading data')
  Xs = []
  Ys = []
  voc = {}
  maxlen = 0
  maxwords = 0
  buff = set()
  TARGET_DIR = 'data/*'
  idx_name = {}
  for i, filename in enumerate(glob(TARGET_DIR)):
    idx_name[i] = filename
    for line in open(filename).read().split('\n'):
       a = list(line)
       maxlen = max(maxlen, len(a))
       [buff.add(w) for w in a]
  maxwords = len(buff)
  voc[maxwords] = '___MAX___'
  voc['___META_MAXWORD___'] = maxwords
  voc['___META_MAXLEN___'] = maxlen
  print("maxwords %d"%maxwords)
  print("maxlen %d"%maxlen)
  print("idx name len %d"%len(idx_name))
  for i, filename in enumerate(glob(TARGET_DIR)):
    for line in set(filter(lambda x:x!='', open(filename).read().split('\n'))):
      X = [maxwords]*maxlen
      line = line.strip()
      for idx, ch in enumerate(list(line)):
        if voc.get(ch) == None:
          voc[ch] = len(voc)
        convert = voc[ch]
        X[idx] = convert
      Xs.append(X)
      y = [0.]*len(idx_name)
      y[i] = 1.
      Ys.append(y)
  X_train, X_test, y_train, y_test = train_test_split( Xs, Ys, test_size=0.1, random_state=42)
  open('vod.pkl', 'wb').write(pickle.dumps(voc))
  open('idx_name.pkl', 'wb').write(pickle.dumps(idx_name))
  sequence_length = maxlen
  vocabulary_size = maxwords
  embedding_dim   = 256*1
  filter_sizes    = [3,4,5,1,2]
  num_filters     = 512*1
  drop            = 0.5

  nb_epoch   = 10
  batch_size = 30
  return sequence_length, embedding_dim, filter_sizes, vocabulary_size, num_filters, drop, idx_name, \
  	X_train, X_test, y_train, y_test, batch_size, nb_epoch, Xs, Ys 
Example 76
Project: RSV   Author: Healthcast   File: skeleton.py    GNU General Public License v2.0 4 votes vote down vote up
def test_performance():
    iris = datasets.load_iris()
    iris_X = iris.data
    iris_y = iris.target
    iris_y = iris_y/2

    #simple test
    X_train, X_test, y_train, y_test = train_test_split(iris_X, iris_y, test_size=.5, \
                                                        random_state=0)
    svc = svm.SVC(kernel='linear')
    scores = svc.fit(X_train, y_train).decision_function(X_test)
    r = svc.predict(X_test)

    #test performance methods: accuracy, confusion matrix, precision, recall
    #ROC, AUC, classification report
    
    #accuracy: number of correct predictions / number of all cases
    print "\ntest accuracy:"
    print metrics.accuracy_score(y_test, r)

    #precision: tp/(tp+fp)
    print "\ntest precision:"
    print metrics.precision_score(y_test, r)

    #recall: tp/(tp+fn)
    print "\ntest recall:"
    print metrics.recall_score(y_test, r)

    #confusion matrix:
    print "\nconfusion matrix:"
    print metrics.confusion_matrix(y_test, r)

    #test roc curve and auc
    fpr, tpr, thresholds = metrics.roc_curve(y_test, scores, pos_label=1)
    roc_auc = metrics.auc(fpr, tpr)
    print "\ntest auc : " + str(roc_auc)
    plt.figure()
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.show()

    #classification report
    print "\ntest callsification report:"
    print metrics.classification_report(y_test, r) 
Example 78
Project: SDLib   Author: Coder-Yu   File: SemiSAD.py    GNU General Public License v3.0 4 votes vote down vote up
def predict(self):
            ClassifierN = 0
            classifier = GaussianNB()
            X_train,X_test,y_train,y_test = train_test_split(self.training,self.trainingLabels,test_size=0.75,random_state=33)
            classifier.fit(X_train, y_train)
            # predict UnLabledData
            #pred_labelsForTrainingUn = classifier.predict(X_test)
            print 'Enhanced classifier...'
            while 1:
                if len(X_test)<=5: # min
                    break         #min
                proba_labelsForTrainingUn = classifier.predict_proba(X_test)
                X_test_labels = np.hstack((X_test, proba_labelsForTrainingUn))
                X_test_labels0_sort = sorted(X_test_labels,key=lambda x:x[5],reverse=True)
                if X_test_labels0_sort[4][5]>X_test_labels0_sort[4][6]:
                    a = map(lambda x: x[:5], X_test_labels0_sort)
                    b = a[0:5]
                    classifier.partial_fit(b, ['0','0','0','0','0'], classes=['0', '1'],sample_weight=np.ones(len(b), dtype=np.float) * self.Lambda)
                    X_test_labels = X_test_labels0_sort[5:]
                    X_test = a[5:]
                if len(X_test)<6: # min
                    break         #min

                X_test_labels0_sort = sorted(X_test_labels, key=lambda x: x[5], reverse=True)
                if X_test_labels0_sort[4][5]<=X_test_labels0_sort[4][6]: #min
                    a = map(lambda x: x[:5], X_test_labels0_sort)
                    b = a[0:5]
                    classifier.partial_fit(b, ['1', '1', '1', '1', '1'], classes=['0', '1'],sample_weight=np.ones(len(b), dtype=np.float) * 1)
                    X_test_labels = X_test_labels0_sort[5:]  # min
                    X_test = a[5:]
                if len(X_test)<6:
                    break
            # while 1 :
            #     p1 = pred_labelsForTrainingUn
            #     # fit the unlabeled data into the classifier with sample weight λ
            #     classifier.partial_fit(X_test, pred_labelsForTrainingUn,classes=['0','1'], sample_weight=np.ones(len(X_test),dtype=np.float)*self.Lambda)
            #     pred_labelsForTrainingUn = classifier.predict(X_test)
            #     p2 = pred_labelsForTrainingUn
            #     # check whether the classifier has stabilized
            #     if list(p1)==list(p2) :
            #         ClassifierN += 1
            #     elif ClassifierN > 0:
            #         ClassifierN = 0
            #     if ClassifierN == 20:
            #         break
            pred_labels = classifier.predict(self.test)
            print 'naive_bayes with EM algorithm:'
            return pred_labels 
Example 79
Project: Face_Recognition   Author: AkiraXD0712   File: training.py    Apache License 2.0 4 votes vote down vote up
def read(self, input_dir):
        images, labels, nb_classes = extract_data(input_dir)

        # shuffle and split data between train and test sets
        x_train, x_test, y_train, y_test = train_test_split(
            images,
            labels,
            test_size=0.3,
            random_state=random.randint(0, 100)
        )
        x_valid, x_test, y_valid, y_test = train_test_split(
            images,
            labels,
            test_size=0.5,
            random_state=random.randint(0, 100)
        )

        print('x_train shape:', x_train.shape)
        print(x_train.shape[0], 'train samples')
        print(x_valid.shape[0], 'valid samples')
        print(x_test.shape[0], 'test samples')

        # # convert class vectors to binary class matrices
        # y_train = np_utils.to_categorical(y_train, nb_classes)
        # y_valid = np_utils.to_categorical(y_valid, nb_classes)
        # y_test = np_utils.to_categorical(y_test, nb_classes)

        x_train = x_train.astype('float32')
        x_valid = x_valid.astype('float32')
        x_test = x_test.astype('float32')
        x_train /= 255
        x_valid /= 255
        x_test /= 255

        self.x_train = x_train
        self.x_valid = x_valid
        self.x_test = x_test
        self.y_train = y_train
        self.y_valid = y_valid
        self.y_test = y_test

        return nb_classes 
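
Both calls above resample from the full images/labels arrays, so the validation and test sets can overlap with the training set. A leakage-free alternative (a sketch, not the project's code) is to split the 30% held out by the first call:

        x_train, x_rest, y_train, y_rest = train_test_split(
            images, labels, test_size=0.3, random_state=random.randint(0, 100))
        x_valid, x_test, y_valid, y_test = train_test_split(
            x_rest, y_rest, test_size=0.5, random_state=random.randint(0, 100))
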
Example 80
Project: memory-networks   Author: suriyadeepan   File: data.py    GNU General Public License v3.0 4 votes vote down vote up
def fetch(task_id=1, batch_size=32):

    # task data
    train, test = load_task(datadir, task_id)
    data = train + test

    # metadata
    vocab = sorted(reduce(lambda x, y: x | y, (set(list(chain.from_iterable(s)) + q + a) for s, q, a in data)))
    word_idx = dict((c, i + 1) for i, c in enumerate(vocab))

    # sizes
    max_story_size = max(map(len, (s for s, _, _ in data)))
    mean_story_size = int(np.mean([ len(s) for s, _, _ in data ]))
    sentence_size = max(map(len, chain.from_iterable(s for s, _, _ in data)))
    query_size = max(map(len, (q for _, q, _ in data)))
    memory_size = min(50, max_story_size)
    vocab_size = len(word_idx) + 1 # +1 for nil word
    sentence_size = max(query_size, sentence_size) # for the position

    # train/validation/test sets
    S, Q, A = vectorize_data(train, word_idx, sentence_size, memory_size)
    trainS, valS, trainQ, valQ, trainA, valA = cross_validation.train_test_split(S, Q, A, test_size=.1, random_state=None)
    testS, testQ, testA = vectorize_data(test, word_idx, sentence_size, memory_size)

    # params
    n_train = trainS.shape[0]
    n_test = testS.shape[0]
    n_val = valS.shape[0]

    batches = zip(range(0, n_train-batch_size, batch_size), range(batch_size, n_train, batch_size))
    batches = [(start, end) for start, end in batches]

    data = {
        'trS' : trainS,
        'trQ' : trainQ,
        'trA' : trainA,
        'teS' : testS,
        'teQ' : testQ,
        'teA' : testA,
        'vaS' : valS,
        'vaQ' : valQ,
        'vaA' : valA,
        'batches' : batches
        }


    metadata = {
            'vocab_size' : vocab_size,
            'vocab' : vocab,
            'word_idx' : word_idx,
            'sentence_size' : sentence_size,
            'memory_size' : memory_size
            }

    return data, metadata
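
train_test_split accepts any number of equally sized indexables and splits them with a single shared permutation, which is why the call above can carve S, Q and A into aligned train/validation sets in one step. A minimal sketch:

import numpy as np
from sklearn.cross_validation import train_test_split

S = np.arange(10).reshape(10, 1)
Q = np.arange(10, 20).reshape(10, 1)
A = np.arange(20, 30).reshape(10, 1)
trS, vaS, trQ, vaQ, trA, vaA = train_test_split(S, Q, A, test_size=.1)
# rows stay aligned: trS[i], trQ[i] and trA[i] come from the same original index
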