Python pyspark.mllib.regression.LabeledPoint() Examples

The following are code examples showing how to use pyspark.mllib.regression.LabeledPoint(). They are taken from open source Python projects.
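
Before the examples, a minimal sketch of what a LabeledPoint is: a float label paired with a feature vector, which can be dense or sparse. The values below are purely illustrative.

from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint

# Dense features: label 1.0 with three feature values.
dense_lp = LabeledPoint(1.0, Vectors.dense([0.5, 10.0, 3.2]))

# Sparse features: label 0.0, vector of size 3 with non-zeros at indices 0 and 2.
sparse_lp = LabeledPoint(0.0, Vectors.sparse(3, [0, 2], [1.23, 4.56]))

print(dense_lp.label)       # 1.0
print(sparse_lp.features)   # (3,[0,2],[1.23,4.56])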

Example 1
Project: tools   Author: dongjoon-hyun   File: spark.py    Apache License 2.0
def naivebayes_predict(model, inpath, outpath):
    """
    fab spark.naivebayes_predict:/tmp/nb.model,/sample/naive_bayes_test.txt,/tmp/nb.result
    """
    run('mkdir %s' % env.dir)
    with cd(env.dir):
        run('''cat <<EOF > spark.naivebayes_test.py
# -*- coding: utf-8 -*-
from pyspark import SparkContext
from pyspark.mllib.classification import NaiveBayesModel
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint

def parseLine(line):
    features = Vectors.dense([float(x) for x in line.split(' ')])
    return features

sc = SparkContext(appName='Naive Bayes Predict')
model = NaiveBayesModel.load(sc, '%(model)s')
sc.textFile('%(inpath)s').map(parseLine).map(model.predict).saveAsTextFile('%(outpath)s')
EOF''' % locals())
        cmd = '/opt/spark/bin/spark-submit spark.naivebayes_test.py 2> /dev/null'
        run(cmd) 
Example 2
Project: kaggle-spark-ml   Author: imgoodman   File: ml.py    MIT License
def predict_SVMWithSGD(numIterations,step,regParam,regType):
    """
    SVMWithSGD.train(data,iterations=100, step=1.0, regParam=0.01, miniBatchFraction=1.0, initialWeights=None, regType='l2',intercept=False, validateData=True,convergenceTol=0.001)
    data: the training data, an RDD of LabeledPoint
    iterations: the number of iterations, default 100
    step: the step parameter used in SGD, default 1.0
    regParam: the regularizer parameter, default 0.01
    miniBatchFraction: fraction of data to be used for each SGD iteration, default 1.0
    initialWeights: the initial weights, default None
    regType: the type of regularizer used for training the model; allowed values: 'l1' (L1 regularization), 'l2' (L2 regularization, the default), None (no regularization)
    intercept: boolean parameter indicating whether to use the augmented representation for training data (i.e. whether the bias feature is activated), default False
    validateData: boolean parameter which indicates if the algorithm should validate data before training, default True
    convergenceTol: a condition which decides iteration termination, default 0.001
    """
    svmModel=SVMWithSGD.train(scaledData, iterations=numIterations,step=step, regParam=regParam, regType=regType)
    svmMetrics=scaledData.map(lambda p:(svmModel.predict(p.features),p.label))
    svmAccuracy=svmMetrics.filter(lambda (actual, pred) : actual==pred).count()*1.0/data.count()
    metrics=BinaryClassificationMetrics(svmMetrics)
    #print "SVMWithSGD model accuracy is: %f in %d iterations,step:%f;regParam:%f;regType:%s" % (svmAccuracy, numIterations,step,regParam,regType)
    return svmAccuracy 
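This function references scaledData (an RDD of LabeledPoint with standardized features) and data, both defined elsewhere in ml.py. A hedged sketch of how such an RDD is typically built with MLlib's StandardScaler (variable names follow the snippet above):

from pyspark.mllib.feature import StandardScaler
from pyspark.mllib.regression import LabeledPoint

# 'data' is assumed to be an RDD of LabeledPoint parsed earlier in the script.
labels = data.map(lambda lp: lp.label)
features = data.map(lambda lp: lp.features)
# withMean=True requires dense feature vectors.
scaler = StandardScaler(withMean=True, withStd=True).fit(features)
scaledData = labels.zip(scaler.transform(features)) \
                   .map(lambda pair: LabeledPoint(pair[0], pair[1]))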
Example 3
Project: elephas   Author: maxpumperla   File: rdd_utils.py    MIT License
def to_labeled_point(sc, features, labels, categorical=False):
    """Convert numpy arrays of features and labels into
    a LabeledPoint RDD for MLlib and ML integration.

    :param sc: Spark context
    :param features: numpy array with features
    :param labels: numpy array with labels
    :param categorical: boolean, whether labels are already one-hot encoded or not
    :return: LabeledPoint RDD with features and labels
    """
    labeled_points = []
    for x, y in zip(features, labels):
        if categorical:
            lp = LabeledPoint(np.argmax(y), to_vector(x))
        else:
            lp = LabeledPoint(y, to_vector(x))
        labeled_points.append(lp)
    return sc.parallelize(labeled_points) 
Example 4
Project: elephas   Author: maxpumperla   File: rdd_utils.py    MIT License
def from_labeled_point(rdd, categorical=False, nb_classes=None):
    """Convert a LabeledPoint RDD back to a pair of numpy arrays

    :param rdd: LabeledPoint RDD
    :param categorical: boolean, if labels should be one-hot encode when returned
    :param nb_classes: optional int, indicating the number of class labels
    :return: pair of numpy arrays, features and labels
    """
    features = np.asarray(
        rdd.map(lambda lp: from_vector(lp.features)).collect())
    labels = np.asarray(rdd.map(lambda lp: lp.label).collect(), dtype='int32')
    if categorical:
        if not nb_classes:
            nb_classes = np.max(labels) + 1
        temp = np.zeros((len(labels), nb_classes))
        for i, label in enumerate(labels):
            temp[i, label] = 1.
        labels = temp
    return features, labels 
Example 5
Project: elephas   Author: maxpumperla   File: rdd_utils.py    MIT License
def lp_to_simple_rdd(lp_rdd, categorical=False, nb_classes=None):
    """Convert a LabeledPoint RDD into an RDD of feature-label pairs

    :param lp_rdd: LabeledPoint RDD of features and labels
    :param categorical: boolean, if labels should be one-hot encode when returned
    :param nb_classes: int, number of total classes
    :return: Spark RDD with feature-label pairs
    """
    if categorical:
        if not nb_classes:
            labels = np.asarray(lp_rdd.map(
                lambda lp: lp.label).collect(), dtype='int32')
            nb_classes = np.max(labels) + 1
        rdd = lp_rdd.map(lambda lp: (from_vector(lp.features),
                                     encode_label(lp.label, nb_classes)))
    else:
        rdd = lp_rdd.map(lambda lp: (from_vector(lp.features), lp.label))
    return rdd 
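A hedged usage sketch of the three elephas helpers above, assuming they are importable, a SparkContext sc exists, and labels are integer class ids; the array shapes are illustrative:

import numpy as np

features = np.random.rand(100, 10)           # 100 samples, 10 features
labels = np.random.randint(0, 3, size=100)   # 3 classes

lp_rdd = to_labeled_point(sc, features, labels, categorical=False)
X, y_onehot = from_labeled_point(lp_rdd, categorical=True, nb_classes=3)
pair_rdd = lp_to_simple_rdd(lp_rdd, categorical=True, nb_classes=3)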
Example 6
Project: LearningApacheSpark   Author: runawayhorse001   File: tests.py    MIT License
def test_chi_sq_pearson(self):
        data = [
            LabeledPoint(0.0, Vectors.dense([0.5, 10.0])),
            LabeledPoint(0.0, Vectors.dense([1.5, 20.0])),
            LabeledPoint(1.0, Vectors.dense([1.5, 30.0])),
            LabeledPoint(0.0, Vectors.dense([3.5, 30.0])),
            LabeledPoint(0.0, Vectors.dense([3.5, 40.0])),
            LabeledPoint(1.0, Vectors.dense([3.5, 40.0]))
        ]

        for numParts in [2, 4, 6, 8]:
            chi = Statistics.chiSqTest(self.sc.parallelize(data, numParts))
            feature1 = chi[0]
            self.assertEqual(feature1.statistic, 0.75)
            self.assertEqual(feature1.degreesOfFreedom, 2)
            self.assertAlmostEqual(feature1.pValue, 0.6873, 4)

            feature2 = chi[1]
            self.assertEqual(feature2.statistic, 1.5)
            self.assertEqual(feature2.degreesOfFreedom, 3)
            self.assertAlmostEqual(feature2.pValue, 0.6823, 4) 
Example 7
Project: LearningApacheSpark   Author: runawayhorse001   File: util.py    MIT License
def saveAsLibSVMFile(data, dir):
        """
        Save labeled data in LIBSVM format.

        :param data: an RDD of LabeledPoint to be saved
        :param dir: directory to save the data

        >>> from tempfile import NamedTemporaryFile
        >>> from fileinput import input
        >>> from pyspark.mllib.regression import LabeledPoint
        >>> from glob import glob
        >>> from pyspark.mllib.util import MLUtils
        >>> examples = [LabeledPoint(1.1, Vectors.sparse(3, [(0, 1.23), (2, 4.56)])),
        ...             LabeledPoint(0.0, Vectors.dense([1.01, 2.02, 3.03]))]
        >>> tempFile = NamedTemporaryFile(delete=True)
        >>> tempFile.close()
        >>> MLUtils.saveAsLibSVMFile(sc.parallelize(examples), tempFile.name)
        >>> ''.join(sorted(input(glob(tempFile.name + "/part-0000*"))))
        '0.0 1:1.01 2:2.02 3:3.03\\n1.1 1:1.23 3:4.56\\n'
        """
        lines = data.map(lambda p: MLUtils._convert_labeled_point_to_libsvm(p))
        lines.saveAsTextFile(dir) 
Example 8
Project: LearningApacheSpark   Author: runawayhorse001   File: util.py    MIT License
def loadLabeledPoints(sc, path, minPartitions=None):
        """
        Load labeled points saved using RDD.saveAsTextFile.

        :param sc: Spark context
        :param path: file or directory path in any Hadoop-supported file
                     system URI
        :param minPartitions: min number of partitions
        :return: labeled data stored as an RDD of LabeledPoint

        >>> from tempfile import NamedTemporaryFile
        >>> from pyspark.mllib.util import MLUtils
        >>> from pyspark.mllib.regression import LabeledPoint
        >>> examples = [LabeledPoint(1.1, Vectors.sparse(3, [(0, -1.23), (2, 4.56e-7)])),
        ...             LabeledPoint(0.0, Vectors.dense([1.01, 2.02, 3.03]))]
        >>> tempFile = NamedTemporaryFile(delete=True)
        >>> tempFile.close()
        >>> sc.parallelize(examples, 1).saveAsTextFile(tempFile.name)
        >>> MLUtils.loadLabeledPoints(sc, tempFile.name).collect()
        [LabeledPoint(1.1, (3,[0,2],[-1.23,4.56e-07])), LabeledPoint(0.0, [1.01,2.02,3.03])]
        """
        minPartitions = minPartitions or min(sc.defaultParallelism, 2)
        return callMLlibFunc("loadLabeledPoints", sc, path, minPartitions) 
Example 9
Project: nyc-taxi-spark-ml   Author: notthatbreezy   File: rdd_utils.py    Apache License 2.0
def labeled_point_to_row_col_period(labeled_point):
    """Helper function to reconstruct period, row, and column of labeled point

    Used in .map call for predictions

    Args:
      labeled_point (LabeledPoint): cell with label and features

    Returns:
      tuple (int, int, str): row, col, period
    """
    features = labeled_point.features
    row, col = features[0], features[1]
    month, day, hour = features[2], features[3], features[4]
    period = '2013{:02d}{:02d}{:02d}'.format(int(month), int(day), int(hour))
    return row, col, period 
Example 10
Project: GEQE   Author: Sotera   File: aggregatedComparison.py    The Unlicense
def mapForPrecomp(record, bUseDate, fBinSize):
    if bUseDate:
        catch = record[1][0].find("_",9)
        shiftLat = shiftedPoint(float(record[1][0][9:catch]), fBinSize)
        shiftLon = shiftedPoint(float(record[1][0][catch+1:]), fBinSize)
        return Row(key=record[1][0],
                   dt=datetime.datetime.strptime(record[1][0][:8],"%Y%m%d"),
                   lat=shiftLat,
                   lon=shiftLon,
                   vector=record[0].features,
                   size=len(record[1][1]),
                   binSize=fBinSize)
    else:
        catch = record[1][0].find("_",2)
        shiftLat = shiftedPoint(float(record[1][0][:catch]), fBinSize)
        shiftLon = shiftedPoint(float(record[1][0][catch+1:]), fBinSize)
        return Row(key=record[1][0],
                   lat=shiftLat,
                   lon=shiftLon,
                   vector=record[0].features,
                   size=len(record[1][1]),
                   binSize=fBinSize)

#(x.key, [LabeledPoint(-1.0, x.vector), x.lat, x.lon, x.size, x.binSize]) 
Example 11
Project: GEQE   Author: Sotera   File: aggregatedComparison.py    The Unlicense
def removeStopWords(record, lStop):
    fLP  = record[1][0]
    fVec = fLP.features
    bMod = False
    for w in lStop:
        if w in fVec.indices:
            bMod=True

    if bMod is True:
        retInd = []
        retVal = []
        for i in range(len(fVec.indices)):
            if fVec.indices[i] not in lStop:
                retInd.append(fVec.indices[i])
                retVal.append(fVec.values[i])
        return (record[0], [LabeledPoint(fLP.label, SparseVector(fVec.size, retInd, retVal)), record[1][1], record[1][2], record[1][3], record[1][4]])
    else:
        return record 
Example 12
Project: GEQE   Author: Sotera   File: fspLib.py    The Unlicense
def combineGroups(record):
    lPlace = []
    label = record[0]
    bFirst = True
    for key in record[1]:
        if bFirst==True:
            bFirst=False
            lPlace = key.features.toArray()
        else:
            lPlace = lPlace + key.features.toArray()
    ind = 0
    cVec = []
    for t in lPlace:
        if t != 0:
            cVec.append((ind,t))
        ind = ind +1
    return LabeledPoint(label, SparseVector(len(lPlace), cVec)) 
Example 13
Project: catraca   Author: tinchoa   File: new-offline.py    GNU General Public License v2.0
def pass2libsvm(vectors2,classes):

	newVector=classes.zip(vectors2)
	grouped=newVector.groupByKey().mapValues(list)
	final=newVector.map(lambda x : LabeledPoint(x[0],x[1]))


	
	print 'returning libsvm format'
	# final=sc.parallelize(e) #return in libsvm format

	return final

#to save file in disk

#tempFile = NamedTemporaryFile(delete=True)
#tempFile.close()
#MLUtils.saveAsLibSVMFile(sc.parallelize(final), 'hdfs://master:9000/user/app/dataset_GTA.csv')


#prepare the data for the libsvm


#### 
Example 14
Project: catraca   Author: tinchoa   File: working.py    GNU General Public License v2.0
def pass2libsvm(vectors2,classes):
	vectorRDD = sc.parallelize(vectors2) #alterei aqui
	newVector=classes.zip(vectorRDD)
	grouped=newVector.groupByKey().mapValues(list)
	final=newVector.map(lambda x : LabeledPoint(x[0],x[1]))

	print 'returning libsvm format'

	return final
#to save file in disk

#tempFile = NamedTemporaryFile(delete=True)
#tempFile.close()
#MLUtils.saveAsLibSVMFile(sc.parallelize(final), 'hdfs://master:9000/user/app/dataset_GTA.csv')


#prepare the data for the libsvm


#### 
Example 15
Project: Data_Analytics_with_Hadoop   Author: oreillymedia   File: wines.py    MIT License
def parsePoint(line):
    values = csv.reader(StringIO(line), delimiter=";").next() # CSV parsing of line
    values = [float(x) for x in values]                       # Cast to all floats
    return LabeledPoint(values[-1], values[:-1])              # y = quality, X = row[:-1] 
Example 16
Project: learning-spark   Author: hbwzhsh   File: logistic_regression.py    GNU General Public License v2.0
def parsePoint(line):
    """
    Parse a line of text into an MLlib LabeledPoint object.
    """
    values = [float(s) for s in line.split(' ')]
    if values[0] == -1:   # Convert -1 labels to 0 for MLlib
        values[0] = 0
    return LabeledPoint(values[0], values[1:]) 
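A sketch of how a parser like this typically feeds MLlib training; the file path and iteration count are illustrative, not taken from the original project:

from pyspark import SparkContext
from pyspark.mllib.classification import LogisticRegressionWithSGD

sc = SparkContext(appName='LogisticRegressionExample')
points = sc.textFile('data/sample_svm_data.txt').map(parsePoint)  # illustrative path
model = LogisticRegressionWithSGD.train(points, iterations=100)
print(model.weights)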
Example 17
Project: tools   Author: dongjoon-hyun   File: spark.py    Apache License 2.0
def naivebayes_train(inpath, lambda_, outpath):
    """
    fab spark.naivebayes_train:/sample/sample_naive_bayes_data.txt,1.0,/tmp/nb.model
    """
    run('mkdir %s' % env.dir)
    with cd(env.dir):
        run('''cat <<EOF > spark.naivebayes_train.py
# -*- coding: utf-8 -*-
from pyspark import SparkContext
from pyspark.mllib.classification import NaiveBayes
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint

def parseLine(line):
    parts = line.split(',')
    label = float(parts[0])
    features = Vectors.dense([float(x) for x in parts[1].split(' ')])
    return LabeledPoint(label, features)

sc = SparkContext(appName='Naive Bayes Train')
data = sc.textFile('%(inpath)s').map(parseLine)
model = NaiveBayes.train(data, %(lambda_)s)
model.save(sc, '%(outpath)s')
EOF''' % locals())
        cmd = '/opt/spark/bin/spark-submit spark.naivebayes_train.py 2> /dev/null'
        run(cmd) 
Example 18
Project: tools   Author: dongjoon-hyun   File: spark.py    Apache License 2.0
def lm(inpath, outpath, step, maxiter):
    """
    fab spark.lm:/sample/sample_regression,/user/hadoop/lm_result,0.1,1000
    """
    if not (outpath.startswith('/tmp/') or outpath.startswith('/user/hadoop/')):
        print 'Unauthorized path: %(outpath)s' % locals()
        return
    run('mkdir %s' % env.dir)
    with cd(env.dir):
        run('''cat <<'EOF' > spark.lm.py
# -*- coding: utf-8 -*-
from pyspark import SparkContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.feature import StandardScaler
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD
sc = SparkContext(appName='Linear Regression')

data = sc.textFile('%(inpath)s').filter(lambda x: not x.startswith('#')).map(lambda x: x.split())
label = data.map(lambda x: x[-1])
feature = data.map(lambda x: x[0:-1])
scaler = StandardScaler().fit(feature)
feature = scaler.transform(feature)
model = LinearRegressionWithSGD.train(label.zip(feature).map(lambda (x,y): LabeledPoint(x,y)), intercept=True, \
iterations=%(maxiter)s, step=%(step)s)
print model
EOF''' % locals())
        cmd = '/opt/spark/bin/spark-submit spark.lm.py 2> /dev/null'
        run(cmd) 
Example 19
Project: kaggle-spark-ml   Author: imgoodman   File: ml.py    MIT License
def predict_NaiveBayes(lamb):
    """
    NaiveBayes.train(data, lambda=1.0)
    data: the training data of RDD of LabeledPoint
    lambda: the smoothing parameter, default 1.0
    """
    naiveBayesModel=NaiveBayes.train(scaledData, lamb)
    naiveBayesMetrics=scaledData.map(lambda p: (p.label, naiveBayesModel.predict(p.features)))
    naiveBayesAccuracy=naiveBayesMetrics.filter(lambda (actual,pred):actual==pred).count()*1.0/data.count()
    return naiveBayesAccuracy 
Example 20
Project: kaggle-spark-ml   Author: imgoodman   File: ml.py    MIT License
def predict_LogisticRegressionWithSGD(iterations,step,regParam,regType):
    """
    LogisticRegressionWithLBFGS.train(data, iterations=100, initialWeights=None, regParam=0.0, regType='l2', intercept=False, corrections=10, tolerance=1e-06, validateData=True, numClasses=2)
    data: the training data, an RDD of LabeledPoint
    iterations: the number of iterations
    corrections: the number of corrections used in the LBFGS update. if a known updater is used for binary classification, it calls the ml implementation and this parameter will have no effect. default 10
    tolerance: the convergence tolerance of iterations for L-BFGS
    numClasses: the number of classes (i.e., outcomes) a label can take in Multinomial logistic regression, default 2
    """
    lrModel=LogisticRegressionWithSGD.train(scaledData, iterations=iterations,step=step,regParam=regParam, regType=regType)
    lrMetrics=scaledData.map(lambda p: (p.label, lrModel.predict(p.features)))
    lrAccuracy=lrMetrics.filter(lambda (actual,pred):actual==pred).count()*1.0/data.count()
    return lrAccuracy 
Example 21
Project: Twitter-Hashtag-Tracking   Author: xuwenyihust   File: analysis.py    MIT License
def sentiment_analysis(lines, model, hashingTF, iDF):
	analysis = lines.map(lambda line: line.split()) \
					.map(lambda x: hashingTF.transform(x)) \
					.transform(classify_tweet) \
					.map(lambda x: LabeledPoint(1, x)) \
					.map(lambda x: model.predict(x.features)) \
					.reduce(lambda x,y: x+y)

	analysis.foreachRDD(lambda x: pos_cnt_li.extend(x.collect())) 
Example 22
Project: Identifying-Patterns-in-Stock-Price-Movements-and-Predicting-Future-Price   Author: Sapphirine   File: random_forest_regression.py    Apache License 2.0
def parsePoint(line):
    values = [float(x) for x in line.split(',')]
    return LabeledPoint(values[0], values[1:])

# Get ticker 
Example 23
Project: elephas   Author: maxpumperla   File: adapter.py    MIT License
def from_data_frame(df, categorical=False, nb_classes=None):
    """Convert DataFrame back to pair of numpy arrays
    """
    lp_rdd = df.rdd.map(lambda row: LabeledPoint(row.label, row.features))
    features, labels = from_labeled_point(lp_rdd, categorical, nb_classes)
    return features, labels 
Example 24
Project: elephas   Author: maxpumperla   File: adapter.py    MIT License
def df_to_simple_rdd(df, categorical=False, nb_classes=None, features_col='features', label_col='label'):
    """Convert DataFrame into RDD of pairs
    """
    sql_context = df.sql_ctx
    sql_context.registerDataFrameAsTable(df, "temp_table")
    selected_df = sql_context.sql(
        "SELECT {0} AS features, {1} as label from temp_table".format(features_col, label_col))
    if isinstance(selected_df.first().features, MLLibVector):
        lp_rdd = selected_df.rdd.map(
            lambda row: LabeledPoint(row.label, row.features))
    else:
        lp_rdd = selected_df.rdd.map(lambda row: LabeledPoint(
            row.label, MLLibVectors.fromML(row.features)))
    rdd = lp_to_simple_rdd(lp_rdd, categorical, nb_classes)
    return rdd 
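A hedged usage sketch of df_to_simple_rdd, assuming the elephas adapter helpers above are importable and a Spark 2.x session is running; the toy DataFrame uses pyspark.ml vectors to show that they get converted via MLLibVectors.fromML:

from pyspark.ml.linalg import Vectors as MLVectors
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(0.0, MLVectors.dense([0.0, 1.1])),
     (1.0, MLVectors.dense([2.0, 1.0]))],
    ["label", "features"])
pairs = df_to_simple_rdd(df, categorical=True, nb_classes=2)
print(pairs.first())   # (feature array, one-hot label array)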
Example 25
Project: LearningApacheSpark   Author: runawayhorse001   File: tests.py    MIT License
def test_infer_schema(self):
        rdd = self.sc.parallelize([LabeledPoint(1.0, self.dv1), LabeledPoint(0.0, self.sv1)])
        df = rdd.toDF()
        schema = df.schema
        field = [f for f in schema.fields if f.name == "features"][0]
        self.assertEqual(field.dataType, self.udt)
        vectors = df.rdd.map(lambda p: p.features).collect()
        self.assertEqual(len(vectors), 2)
        for v in vectors:
            if isinstance(v, SparseVector):
                self.assertEqual(v, self.sv1)
            elif isinstance(v, DenseVector):
                self.assertEqual(v, self.dv1)
            else:
                raise TypeError("expecting a vector but got %r of type %r" % (v, type(v))) 
Example 26
Project: LearningApacheSpark   Author: runawayhorse001   File: tests.py    MIT License
def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree
        data = [
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 1.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 2.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features for p in data]

        lr_model = LogisticRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(rdd, numClasses=2,
                                                categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0) 
Example 27
Project: LearningApacheSpark   Author: runawayhorse001   File: tests.py    MIT License
def test_regression(self):
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree
        data = [
            LabeledPoint(-1.0, self.scipy_matrix(2, {1: -1.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
            LabeledPoint(-1.0, self.scipy_matrix(2, {1: -2.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        lasso_model = LassoWithSGD.train(rdd)
        self.assertTrue(lasso_model.predict(features[0]) <= 0)
        self.assertTrue(lasso_model.predict(features[1]) > 0)
        self.assertTrue(lasso_model.predict(features[2]) <= 0)
        self.assertTrue(lasso_model.predict(features[3]) > 0)

        rr_model = RidgeRegressionWithSGD.train(rdd)
        self.assertTrue(rr_model.predict(features[0]) <= 0)
        self.assertTrue(rr_model.predict(features[1]) > 0)
        self.assertTrue(rr_model.predict(features[2]) <= 0)
        self.assertTrue(rr_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        dt_model = DecisionTree.trainRegressor(rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0) 
Example 28
Project: LearningApacheSpark   Author: runawayhorse001   File: tests.py    MIT License
def test_right_number_of_results(self):
        num_cols = 1001
        sparse_data = [
            LabeledPoint(0.0, Vectors.sparse(num_cols, [(100, 2.0)])),
            LabeledPoint(0.1, Vectors.sparse(num_cols, [(200, 1.0)]))
        ]
        chi = Statistics.chiSqTest(self.sc.parallelize(sparse_data))
        self.assertEqual(len(chi), num_cols)
        self.assertIsNotNone(chi[1000]) 
Example 29
Project: LearningApacheSpark   Author: runawayhorse001   File: util.py    MIT License
def _convert_labeled_point_to_libsvm(p):
        """Converts a LabeledPoint to a string in LIBSVM format."""
        from pyspark.mllib.regression import LabeledPoint
        assert isinstance(p, LabeledPoint)
        items = [str(p.label)]
        v = _convert_to_vector(p.features)
        if isinstance(v, SparseVector):
            nnz = len(v.indices)
            for i in xrange(nnz):
                items.append(str(v.indices[i] + 1) + ":" + str(v.values[i]))
        else:
            for i in xrange(len(v)):
                items.append(str(i + 1) + ":" + str(v[i]))
        return " ".join(items) 
Example 30
Project: LearningApacheSpark   Author: runawayhorse001   File: feature.py    MIT License
def fit(self, data):
        """
        Returns a ChiSquared feature selector.

        :param data: an `RDD[LabeledPoint]` containing the labeled dataset
                     with categorical features. Real-valued features will be
                     treated as categorical for each distinct value.
                     Apply feature discretizer before using this function.
        """
        jmodel = callMLlibFunc("fitChiSqSelector", self.selectorType, self.numTopFeatures,
                               self.percentile, self.fpr, self.fdr, self.fwe, data)
        return ChiSqSelectorModel(jmodel) 
Example 31
Project: LearningApacheSpark   Author: runawayhorse001   File: tree.py    MIT License
def _train(cls, data, algo, numClasses, categoricalFeaturesInfo, numTrees,
               featureSubsetStrategy, impurity, maxDepth, maxBins, seed):
        first = data.first()
        assert isinstance(first, LabeledPoint), "the data should be RDD of LabeledPoint"
        if featureSubsetStrategy not in cls.supportedFeatureSubsetStrategies:
            raise ValueError("unsupported featureSubsetStrategy: %s" % featureSubsetStrategy)
        if seed is None:
            seed = random.randint(0, 1 << 30)
        model = callMLlibFunc("trainRandomForestModel", data, algo, numClasses,
                              categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity,
                              maxDepth, maxBins, seed)
        return RandomForestModel(model) 
Example 32
Project: LearningApacheSpark   Author: runawayhorse001   File: tree.py    MIT License
def _train(cls, data, algo, categoricalFeaturesInfo,
               loss, numIterations, learningRate, maxDepth, maxBins):
        first = data.first()
        assert isinstance(first, LabeledPoint), "the data should be RDD of LabeledPoint"
        model = callMLlibFunc("trainGradientBoostedTreesModel", data, algo, categoricalFeaturesInfo,
                              loss, numIterations, learningRate, maxDepth, maxBins)
        return GradientBoostedTreesModel(model) 
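Both _train helpers above back the public entry points in pyspark.mllib.tree. A minimal sketch of those public APIs on an RDD of LabeledPoint, assuming an existing SparkContext sc; the data values are illustrative:

from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import RandomForest, GradientBoostedTrees

data = sc.parallelize([
    LabeledPoint(0.0, [0.0, 1.0]),
    LabeledPoint(1.0, [1.0, 0.0]),
    LabeledPoint(0.0, [0.2, 0.8]),
    LabeledPoint(1.0, [0.9, 0.1]),
])
rf_model = RandomForest.trainClassifier(data, numClasses=2, categoricalFeaturesInfo={},
                                        numTrees=3, seed=42)
gbt_model = GradientBoostedTrees.trainClassifier(data, categoricalFeaturesInfo={},
                                                 numIterations=5)
print(rf_model.predict([1.0, 0.0]))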
Example 33
Project: nyc-taxi-spark-ml   Author: notthatbreezy   File: rdd_utils.py    Apache License 2.0
def get_labeled_points(period, data_dict, data_ordering, mask, weekday, geovars, weather_data):
    """Produce a set of labeled points for a period

    Args:
      period (str): period the data represents
      data_dict (dict): dictionary type_of_array => array
      mask (list): list of tuples (row, col)
      weekday (broadcast dict): broadcast var where period => list of 1/0 for day of week
      geovars (broadcast list): list of static geovars used for modeling, each is an array
      weather_data (dict): period => list of weather observations

    Returns:
      list of LabeledPoints with labels and features
    """
    ## Verify this period has observed values and all the other
    ## Necessary Variables
    observed_arr = data_dict.pop('observed', None)
    if observed_arr is None:
        return []

    if set(data_dict.keys()) != set(data_ordering):
        return []

    month = int(period[4:6])
    day = int(period[6:8])
    hour = int(period[-2:])
    weekday_list = weekday.value[period[:-2]]
    weather = weather_data.value[period]

    def get_labeled_point(row, col):
        observed = observed_arr[(row, col)]
        measurements = [row, col, month, day, hour] + weekday_list + weather
        for k in data_ordering:
            value = data_dict[k]
            measurements.append(value[(row, col)])
            for geovar in geovars:
                measurements.append(geovar.value[(row, col)])

        return LabeledPoint(observed, measurements)

    return [get_labeled_point(row, col) for row, col in mask.value] 
Example 34
Project: GEQE   Author: Sotera   File: aggregatedComparison.py    The Unlicense
def createAggregatedLabledPoint(rddIn, bUseDate, binSize, bc_dIDF, bUserStopFilter, bc_lStopWords, nTot, lpVal, nMin):
    grouped = rddIn.map(lambda x: (groupString(x,bUseDate,binSize), x))\
        .groupByKey()\
        .filter(lambda x: hasMinUsers(x[1],nMin))
    return grouped.map(lambda x: (LabeledPoint(lpVal, megaVector(x[1], bc_dIDF, bUserStopFilter, bc_lStopWords, nTot)),x)) 
Example 35
Project: GEQE   Author: Sotera   File: fspLib.py    The Unlicense
def placeToLP(record, bInRegion, bc_dArrPos):
    caption = record.text
    sTPos = set()
    dArrPos = bc_dArrPos.value
    for term in uniqueWords(caption, False, []):
        if term in dArrPos:
            sTPos.add(dArrPos[term])
    featureVector = SparseVector(len(dArrPos), sorted(list(sTPos)), [1.]*len(sTPos))
    return (LabeledPoint(bInRegion, featureVector),record) 
Example 36
Project: GEQE   Author: Sotera   File: createROC.py    The Unlicense
def trainSVMModel(data):
    """
    Train an SVM model and return it
    :param data: RDD[LabeledPoint]
    :return: svm classification model
    """
    from pyspark.mllib.classification import SVMWithSGD, SVMModel
    model = SVMWithSGD.train(data, iterations=100)
    return model 
Example 37
Project: catraca   Author: tinchoa   File: kafka-spark-ml-with-http.py    GNU General Public License v2.0
def pass2libsvm(vectors2,classes):

	newVector=classes.zip(vectors2)
	grouped=newVector.groupByKey().mapValues(list)
	final=newVector.map(lambda x : LabeledPoint(x[0],x[1]))


	# ###to make the reduced matrix with vectors
	# dif1=[]
	# #dif1 = [0]*len(vectors)
	# z={}
	# z[1]=[]
	# dif2=[]
	# #dif2 = [0]*len(vectors)
	# z[2]=[]

	# dif3=[]
	# z[3]=[]
	# #dif3 = [0]*len(vectors)
	# e=[]
	# for i in range(len(vectors2)):
	# 		if int(classes[i]) == 0:
	# 			dif1.append(vectors2[i])
	# 			e.append(LabeledPoint(0,np.array(dif1)))
	# 			dif1=[]
	# 		if int(classes[i]) == 1:
	# 			dif2.append(vectors2[i])
	# 			e.append(LabeledPoint(1,np.array(dif2)))
	# 			dif2=[]
	# 		if int(classes[i]) == 2:
	# 			dif3.append(vectors2[i])
	# 			e.append(LabeledPoint(2,np.array(dif3)))
	# 			dif3=[]
		
	# 	#ver como hacer el tema de la libsvm list
	# 	#deveria ser algo del tipo 1, () ,2 (), 1 (), 3 (), 2()

	#print 'returning libsvm format'
	# final=sc.parallelize(e) #return in libsvm format

	return final 
Example 38
Project: catraca   Author: tinchoa   File: detection-with-elastic.py    GNU General Public License v2.0
def pass2libsvm(vectors2,classes):

	newVector=classes.zip(vectors2)
	grouped=newVector.groupByKey().mapValues(list)
	final=newVector.map(lambda x : LabeledPoint(x[0],x[1]))


	return final 
Example 39
Project: pyspark_mllib   Author: animenon   File: lrwithSGD.py    Apache License 2.0
def parsePoint(line):
    # Accept comma- or space-separated values.
    values = [float(x) for x in line.replace(',', ' ').split()]
    return LabeledPoint(values[0], values[1:]) 
Example 40
Project: pyspark_mllib   Author: animenon   File: lrModel.py    Apache License 2.0
def parsePoint(line):
    values = [float(x) for x in line.split(' ')]
    return LabeledPoint(values[0], values[1:]) 
Example 41
Project: K_Means_with_MLlib   Author: IBMPredictiveAnalytics   File: kmeans_score.py    Apache License 2.0
def encode(self,df,target,predictors,setToFlag):
        if not self.dm:
            self.computeDataModel(df)
        schema = df.dtypes[:]
        lookup = {}
        for i in range(0,len(schema)):
            lookup[unicode(schema[i][0],"utf-8")] = i
            lookup[schema[i][0]] = i

        target_index = -1
        if target:
            target_index = lookup[target]
        dm = self.dm

        def mapFn(row):
            pvals = []
            for predictor in predictors:
                predictor_index = lookup[predictor]
                if isinstance(dm[predictor],list):
                    try:
                        encoded_val = dm[predictor].index(row[predictor_index])
                        if setToFlag == None:
                            pvals.append(encoded_val)
                        else:
                            flags = [0.0]*len(dm[predictor])
                            flags[encoded_val]=setToFlag
                            pvals += flags
                    except ValueError:
                        if setToFlag == None:
                            pvals.append(None)
                        else:
                            pvals += [0.0]*len(dm[predictor])
                else:
                    pval = row[predictor_index]
                    # if pval == None:
                    #    pval_min = dm[predictor]["min"]
                    #    pval_max = dm[predictor]["max"]
                    #    pval=pval_min+(pval_max - pval_min)*0.5
                    pvals.append(pval)
            dv = DenseVector(pvals)
            if target_index == -1:
                return (row,dv)
            tval = row[target_index]
            if isinstance(dm[target],list): # target is categorical
                try:
                    tval = dm[target].index(tval)
                except ValueError:
                    tval = None
            return (row,LabeledPoint(tval,dv))

        return df.map(mapFn) 
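This encode pattern recurs nearly verbatim in Examples 42-44 and 47-50: categorical predictors are either index-encoded or expanded into 0.0/setToFlag flag vectors, numeric predictors pass through, and each row is paired with a DenseVector (no target) or a LabeledPoint (target given). A hedged usage sketch; the object name encoder, the column names, and setToFlag=1.0 are hypothetical:

# encoder is assumed to be an instance of the class that defines encode() above.
encoded = encoder.encode(df, target="species", predictors=["height", "color"], setToFlag=1.0)
row, lp = encoded.first()   # (original Row, LabeledPoint(label, DenseVector))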
Example 42
Project: K_Means_with_MLlib   Author: IBMPredictiveAnalytics   File: kmeans.py    Apache License 2.0
def encode(self,df,target,predictors,setToFlag):
        if not self.dm:
            self.computeDataModel(df)
        schema = df.dtypes[:]
        lookup = {}
        for i in range(0,len(schema)):
            lookup[unicode(schema[i][0],"utf-8")] = i
            lookup[schema[i][0]] = i

        target_index = -1
        if target:
            target_index = lookup[target]
        dm = self.dm

        def mapFn(row):
            pvals = []
            for predictor in predictors:
                predictor_index = lookup[predictor]
                if isinstance(dm[predictor],list):
                    try:
                        encoded_val = dm[predictor].index(row[predictor_index])
                        if setToFlag == None:
                            pvals.append(encoded_val)
                        else:
                            flags = [0.0]*len(dm[predictor])
                            flags[encoded_val]=setToFlag
                            pvals += flags
                    except ValueError:
                        if setToFlag == None:
                            pvals.append(None)
                        else:
                            pvals += [0.0]*len(dm[predictor])
                else:
                    pval = row[predictor_index]
                    # if pval == None:
                    #    pval_min = dm[predictor]["min"]
                    #    pval_max = dm[predictor]["max"]
                    #    pval=pval_min+(pval_max - pval_min)*0.5
                    pvals.append(pval)
            dv = DenseVector(pvals)
            if target_index == -1:
                return (row,dv)
            tval = row[target_index]
            if isinstance(dm[target],list): # target is categorical
                try:
                    tval = dm[target].index(tval)
                except ValueError:
                    tval = None
            return (row,LabeledPoint(tval,dv))

        return df.map(mapFn) 
Example 43
Project: BinaryLogReg_with_MLlib   Author: IBMPredictiveAnalytics   File: logistic_regression.py    Apache License 2.0
def encode(self,df,target,predictors,setToFlag):
        if not self.dm:
            self.computeDataModel(df)
        schema = df.dtypes[:]
        lookup = {}
        for i in range(0,len(schema)):
            lookup[schema[i][0]] = i

        target_index = -1
        if target:
            target_index = lookup[target]
        dm = self.dm

        def mapFn(row):
            pvals = []
            for predictor in predictors:
                predictor_index = lookup[predictor]
                if isinstance(dm[predictor],list):
                    try:
                        encoded_val = dm[predictor].index(row[predictor_index])
                        if setToFlag == None:
                            pvals.append(encoded_val)
                        else:
                            flags = [0.0]*len(dm[predictor])
                            flags[encoded_val]=setToFlag
                            pvals += flags
                    except ValueError:
                        if setToFlag == None:
                            pvals.append(None)
                        else:
                            pvals += [0.0]*len(dm[predictor])
                else:
                    pval = row[predictor_index]
                    # if pval == None:
                    #    pval_min = dm[predictor]["min"]
                    #    pval_max = dm[predictor]["max"]
                    #    pval=pval_min+(pval_max - pval_min)*0.5
                    pvals.append(pval)
            dv = DenseVector(pvals)
            if target_index == -1:
                return (row,dv)
            tval = row[target_index]
            if isinstance(dm[target],list): # target is categorical
                try:
                    tval = dm[target].index(tval)
                except ValueError:
                    tval = None
            return (row,LabeledPoint(tval,dv))

        return df.map(mapFn) 
Example 44
Project: BinaryLogReg_with_MLlib   Author: IBMPredictiveAnalytics   File: logistic_regression_score.py    Apache License 2.0
def encode(self,df,target,predictors,setToFlag):
        if not self.dm:
            self.computeDataModel(df)
        schema = df.dtypes[:]
        lookup = {}
        for i in range(0,len(schema)):
            lookup[schema[i][0]] = i

        target_index = -1
        if target:
            target_index = lookup[target]
        dm = self.dm

        def mapFn(row):
            pvals = []
            for predictor in predictors:
                predictor_index = lookup[predictor]
                if isinstance(dm[predictor],list):
                    try:
                        encoded_val = dm[predictor].index(row[predictor_index])
                        if setToFlag == None:
                            pvals.append(encoded_val)
                        else:
                            flags = [0.0]*len(dm[predictor])
                            flags[encoded_val]=setToFlag
                            pvals += flags
                    except ValueError:
                        if setToFlag == None:
                            pvals.append(None)
                        else:
                            pvals += [0.0]*len(dm[predictor])
                else:
                    pval = row[predictor_index]
                    # if pval == None:
                    #    pval_min = dm[predictor]["min"]
                    #    pval_max = dm[predictor]["max"]
                    #    pval=pval_min+(pval_max - pval_min)*0.5
                    pvals.append(pval)
            dv = DenseVector(pvals)
            if target_index == -1:
                return (row,dv)
            tval = row[target_index]
            if isinstance(dm[target],list): # target is categorical
                try:
                    tval = dm[target].index(tval)
                except ValueError:
                    tval = None
            return (row,LabeledPoint(tval,dv))

        return df.map(mapFn) 
Example 45
Project: LearningApacheSpark   Author: runawayhorse001   File: util.py    MIT License
def loadLibSVMFile(sc, path, numFeatures=-1, minPartitions=None, multiclass=None):
        """
        Loads labeled data in the LIBSVM format into an RDD of
        LabeledPoint. The LIBSVM format is a text-based format used by
        LIBSVM and LIBLINEAR. Each line represents a labeled sparse
        feature vector using the following format:

        label index1:value1 index2:value2 ...

        where the indices are one-based and in ascending order. This
        method parses each line into a LabeledPoint, where the feature
        indices are converted to zero-based.

        :param sc: Spark context
        :param path: file or directory path in any Hadoop-supported file
                     system URI
        :param numFeatures: number of features, which will be determined
                            from the input data if a nonpositive value
                            is given. This is useful when the dataset is
                            already split into multiple files and you
                            want to load them separately, because some
                            features may not present in certain files,
                            which leads to inconsistent feature
                            dimensions.
        :param minPartitions: min number of partitions
        :return: labeled data stored as an RDD of LabeledPoint

        >>> from tempfile import NamedTemporaryFile
        >>> from pyspark.mllib.util import MLUtils
        >>> from pyspark.mllib.regression import LabeledPoint
        >>> tempFile = NamedTemporaryFile(delete=True)
        >>> _ = tempFile.write(b"+1 1:1.0 3:2.0 5:3.0\\n-1\\n-1 2:4.0 4:5.0 6:6.0")
        >>> tempFile.flush()
        >>> examples = MLUtils.loadLibSVMFile(sc, tempFile.name).collect()
        >>> tempFile.close()
        >>> examples[0]
        LabeledPoint(1.0, (6,[0,2,4],[1.0,2.0,3.0]))
        >>> examples[1]
        LabeledPoint(-1.0, (6,[],[]))
        >>> examples[2]
        LabeledPoint(-1.0, (6,[1,3,5],[4.0,5.0,6.0]))
        """
        from pyspark.mllib.regression import LabeledPoint
        if multiclass is not None:
            warnings.warn("deprecated", DeprecationWarning)

        lines = sc.textFile(path, minPartitions)
        parsed = lines.map(lambda l: MLUtils._parse_libsvm_line(l))
        if numFeatures <= 0:
            parsed.cache()
            numFeatures = parsed.map(lambda x: -1 if x[1].size == 0 else x[1][-1]).reduce(max) + 1
        return parsed.map(lambda x: LabeledPoint(x[0], Vectors.sparse(numFeatures, x[1], x[2]))) 
Example 46
Project: LearningApacheSpark   Author: runawayhorse001   File: tree.py    MIT License
def trainRegressor(cls, data, categoricalFeaturesInfo,
                       impurity="variance", maxDepth=5, maxBins=32, minInstancesPerNode=1,
                       minInfoGain=0.0):
        """
        Train a decision tree model for regression.

        :param data:
          Training data: RDD of LabeledPoint. Labels are real numbers.
        :param categoricalFeaturesInfo:
          Map storing arity of categorical features. An entry (n -> k)
          indicates that feature n is categorical with k categories
          indexed from 0: {0, 1, ..., k-1}.
        :param impurity:
          Criterion used for information gain calculation.
          The only supported value for regression is "variance".
          (default: "variance")
        :param maxDepth:
          Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1
          means 1 internal node + 2 leaf nodes).
          (default: 5)
        :param maxBins:
          Number of bins used for finding splits at each node.
          (default: 32)
        :param minInstancesPerNode:
          Minimum number of instances required at child nodes to create
          the parent split.
          (default: 1)
        :param minInfoGain:
          Minimum info gain required to create a split.
          (default: 0.0)
        :return:
          DecisionTreeModel.

        Example usage:

        >>> from pyspark.mllib.regression import LabeledPoint
        >>> from pyspark.mllib.tree import DecisionTree
        >>> from pyspark.mllib.linalg import SparseVector
        >>>
        >>> sparse_data = [
        ...     LabeledPoint(0.0, SparseVector(2, {0: 0.0})),
        ...     LabeledPoint(1.0, SparseVector(2, {1: 1.0})),
        ...     LabeledPoint(0.0, SparseVector(2, {0: 0.0})),
        ...     LabeledPoint(1.0, SparseVector(2, {1: 2.0}))
        ... ]
        >>>
        >>> model = DecisionTree.trainRegressor(sc.parallelize(sparse_data), {})
        >>> model.predict(SparseVector(2, {1: 1.0}))
        1.0
        >>> model.predict(SparseVector(2, {1: 0.0}))
        0.0
        >>> rdd = sc.parallelize([[0.0, 1.0], [0.0, 0.0]])
        >>> model.predict(rdd).collect()
        [1.0, 0.0]
        """
        return cls._train(data, "regression", 0, categoricalFeaturesInfo,
                          impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain) 
Example 47
Project: PCA_with_MLlib   Author: IBMPredictiveAnalytics   File: pca.py    Apache License 2.0
def encode(self,df,target,predictors,setToFlag):
        if not self.dm:
            self.computeDataModel(df)
        schema = df.dtypes[:]
        lookup = {}
        for i in range(0,len(schema)):
            lookup[schema[i][0]] = i
        target_index = -1
        if target:
            target_index = lookup[target]
        dm = self.dm

        def mapFn(row):
            pvals = []
            for predictor in predictors:
                predictor_index = lookup[predictor]
                if isinstance(dm[predictor],list):
                    try:
                        encoded_val = dm[predictor].index(row[predictor_index])
                        if setToFlag == None:
                            pvals.append(encoded_val)
                        else:
                            flags = [0.0]*len(dm[predictor])
                            flags[encoded_val]=setToFlag
                            pvals += flags
                    except ValueError:
                        if setToFlag == None:
                            pvals.append(None)
                        else:
                            pvals += [0.0]*len(dm[predictor])
                else:
                    pval = row[predictor_index]
                    # if pval == None:
                    #    pval_min = dm[predictor]["min"]
                    #    pval_max = dm[predictor]["max"]
                    #    pval=pval_min+(pval_max - pval_min)*0.5
                    pvals.append(pval)
            dv = DenseVector(pvals)
            if target_index == -1:
                return (row,dv)
            tval = row[target_index]
            if isinstance(dm[target],list): # target is categorical
                try:
                    tval = dm[target].index(tval)
                except ValueError:
                    tval = None
            return (row,LabeledPoint(tval,dv))
        return df.map(mapFn) 
Example 48
Project: PCA_with_MLlib   Author: IBMPredictiveAnalytics   File: pca_score.py    Apache License 2.0
def encode(self,df,target,predictors,setToFlag):
        if not self.dm:
            self.computeDataModel(df)
        schema = df.dtypes[:]
        lookup = {}
        for i in range(0,len(schema)):
            lookup[schema[i][0]] = i
        target_index = -1
        if target:
            target_index = lookup[target]
        dm = self.dm

        def mapFn(row):
            pvals = []
            for predictor in predictors:
                predictor_index = lookup[predictor]
                if isinstance(dm[predictor],list):
                    try:
                        encoded_val = dm[predictor].index(row[predictor_index])
                        if setToFlag == None:
                            pvals.append(encoded_val)
                        else:
                            flags = [0.0]*len(dm[predictor])
                            flags[encoded_val]=setToFlag
                            pvals += flags
                    except ValueError:
                        if setToFlag == None:
                            pvals.append(None)
                        else:
                            pvals += [0.0]*len(dm[predictor])
                else:
                    pval = row[predictor_index]
                    # if pval == None:
                    #    pval_min = dm[predictor]["min"]
                    #    pval_max = dm[predictor]["max"]
                    #    pval=pval_min+(pval_max - pval_min)*0.5
                    pvals.append(pval)
            dv = DenseVector(pvals)
            if target_index == -1:
                return (row,dv)
            tval = row[target_index]
            if isinstance(dm[target],list): # target is categorical
                try:
                    tval = dm[target].index(tval)
                except ValueError:
                    tval = None
            return (row,LabeledPoint(tval,dv))
        return df.map(mapFn) 
Example 49
Project: Multinomial_Naive_Bayes_with_MLlib   Author: IBMPredictiveAnalytics   File: naivebayes_score.py    Apache License 2.0
def encode(self,df,target,predictors,setToFlag):
        if not self.dm:
            self.computeDataModel(df)
        schema = df.dtypes[:]
        lookup = {}
        for i in range(0,len(schema)):
            lookup[schema[i][0]] = i

        target_index = -1
        if target:
            target_index = lookup[target]
        dm = self.dm

        def mapFn(row):
            pvals = []
            for predictor in predictors:
                predictor_index = lookup[predictor]
                if isinstance(dm[predictor],list):
                    try:
                        encoded_val = dm[predictor].index(row[predictor_index])
                        if setToFlag == None:
                            pvals.append(encoded_val)
                        else:
                            flags = [0.0]*len(dm[predictor])
                            flags[encoded_val]=setToFlag
                            pvals += flags
                    except ValueError:
                        if setToFlag == None:
                            pvals.append(None)
                        else:
                            pvals += [0.0]*len(dm[predictor])
                else:
                    pval = row[predictor_index]
                    # if pval == None:
                    #    pval_min = dm[predictor]["min"]
                    #    pval_max = dm[predictor]["max"]
                    #    pval=pval_min+(pval_max - pval_min)*0.5
                    pvals.append(pval)
            dv = DenseVector(pvals)
            if target_index == -1:
                return (row,dv)
            tval = row[target_index]
            if isinstance(dm[target],list): # target is categorical
                try:
                    tval = dm[target].index(tval)
                except ValueError:
                    tval = None
            return (row,LabeledPoint(tval,dv))

        return df.map(mapFn) 
Example 50
Project: Multinomial_Naive_Bayes_with_MLlib   Author: IBMPredictiveAnalytics   File: naivebayes.py    Apache License 2.0
def encode(self,df,target,predictors,setToFlag):
        if not self.dm:
            self.computeDataModel(df)
        schema = df.dtypes[:]
        lookup = {}
        for i in range(0,len(schema)):
            lookup[schema[i][0]] = i

        target_index = -1
        if target:
            target_index = lookup[target]
        dm = self.dm

        def mapFn(row):
            pvals = []
            for predictor in predictors:
                predictor_index = lookup[predictor]
                if isinstance(dm[predictor],list):
                    try:
                        encoded_val = dm[predictor].index(row[predictor_index])
                        if setToFlag == None:
                            pvals.append(encoded_val)
                        else:
                            flags = [0.0]*len(dm[predictor])
                            flags[encoded_val]=setToFlag
                            pvals += flags
                    except ValueError:
                        if setToFlag == None:
                            pvals.append(None)
                        else:
                            pvals += [0.0]*len(dm[predictor])
                else:
                    pval = row[predictor_index]
                    # if pval == None:
                    #    pval_min = dm[predictor]["min"]
                    #    pval_max = dm[predictor]["max"]
                    #    pval=pval_min+(pval_max - pval_min)*0.5
                    pvals.append(pval)
            dv = DenseVector(pvals)
            if target_index == -1:
                return (row,dv)
            tval = row[target_index]
            if isinstance(dm[target],list): # target is categorical
                try:
                    tval = dm[target].index(tval)
                except ValueError:
                    tval = None
            return (row,LabeledPoint(tval,dv))

        return df.map(mapFn)