Python pyspark.mllib.regression.LabeledPoint() Examples

The following are 30 code examples of pyspark.mllib.regression.LabeledPoint(). You may also want to check out all available functions/classes of the module pyspark.mllib.regression, or try the search function.
Example #1
Source File:    From LearningApacheSpark with MIT License 6 votes vote down vote up
def loadLabeledPoints(sc, path, minPartitions=None):
        """
        Load labeled points saved using RDD.saveAsTextFile.

        :param sc: Spark context
        :param path: file or directory path in any Hadoop-supported file
                     system URI
        :param minPartitions: min number of partitions; when falsy, the
                              smaller of ``sc.defaultParallelism`` and 2 is used
        :return: labeled data stored as an RDD of LabeledPoint

        >>> from tempfile import NamedTemporaryFile
        >>> from pyspark.mllib.util import MLUtils
        >>> from pyspark.mllib.regression import LabeledPoint
        >>> examples = [LabeledPoint(1.1, Vectors.sparse(3, [(0, -1.23), (2, 4.56e-7)])),
        ...             LabeledPoint(0.0, Vectors.dense([1.01, 2.02, 3.03]))]
        >>> tempFile = NamedTemporaryFile(delete=True)
        >>> tempFile.close()
        >>> sc.parallelize(examples, 1).saveAsTextFile(tempFile.name)
        >>> MLUtils.loadLabeledPoints(sc, tempFile.name).collect()
        [LabeledPoint(1.1, (3,[0,2],[-1.23,4.56e-07])), LabeledPoint(0.0, [1.01,2.02,3.03])]
        """
        # `or` also replaces an explicit 0, not only None.
        minPartitions = minPartitions or min(sc.defaultParallelism, 2)
        return callMLlibFunc("loadLabeledPoints", sc, path, minPartitions)
Example #2
Source File:    From LearningApacheSpark with MIT License 6 votes vote down vote up
def saveAsLibSVMFile(data, dir):
        """
        Save labeled data in LIBSVM format.

        :param data: an RDD of LabeledPoint to be saved
        :param dir: directory to save the data

        >>> from tempfile import NamedTemporaryFile
        >>> from fileinput import input
        >>> from pyspark.mllib.regression import LabeledPoint
        >>> from glob import glob
        >>> from pyspark.mllib.util import MLUtils
        >>> examples = [LabeledPoint(1.1, Vectors.sparse(3, [(0, 1.23), (2, 4.56)])),
        ...             LabeledPoint(0.0, Vectors.dense([1.01, 2.02, 3.03]))]
        >>> tempFile = NamedTemporaryFile(delete=True)
        >>> tempFile.close()
        >>> MLUtils.saveAsLibSVMFile(sc.parallelize(examples), tempFile.name)
        >>> ''.join(sorted(input(glob(tempFile.name + "/part-0000*"))))
        '0.0 1:1.01 2:2.02 3:3.03\\n1.1 1:1.23 3:4.56\\n'
        """
        # Render every point as one LIBSVM text line, then write the lines out.
        lines = data.map(lambda p: MLUtils._convert_labeled_point_to_libsvm(p))
        lines.saveAsTextFile(dir)
Example #3
Source File:    From elephas with MIT License 6 votes vote down vote up
def to_labeled_point(sc, features, labels, categorical=False):
    """Convert numpy arrays of features and labels into
    a LabeledPoint RDD for MLlib and ML integration.

    :param sc: Spark context
    :param features: numpy array with features
    :param labels: numpy array with labels
    :param categorical: boolean, whether labels are already one-hot encoded or not
    :return: LabeledPoint RDD with features and labels
    """
    labeled_points = []
    for x, y in zip(features, labels):
        if categorical:
            # One-hot row -> class index.
            lp = LabeledPoint(np.argmax(y), to_vector(x))
        else:
            lp = LabeledPoint(y, to_vector(x))
        labeled_points.append(lp)
    return sc.parallelize(labeled_points)
Example #4
Source File:    From elephas with MIT License 6 votes vote down vote up
def from_labeled_point(rdd, categorical=False, nb_classes=None):
    """Convert a LabeledPoint RDD back to a pair of numpy arrays

    :param rdd: LabeledPoint RDD
    :param categorical: boolean, if labels should be one-hot encode when returned
    :param nb_classes: optional int, indicating the number of class labels;
        when falsy it is taken as ``max(label) + 1``
    :return: pair of numpy arrays, features and labels
    """
    features = np.asarray(
        rdd.map(lambda lp: from_vector(lp.features)).collect())
    labels = np.asarray(rdd.map(lambda lp: lp.label).collect(), dtype='int32')
    if categorical:
        if not nb_classes:
            nb_classes = np.max(labels) + 1
        # Build the one-hot matrix: one row per sample, a 1 in the label's column.
        temp = np.zeros((len(labels), nb_classes))
        for i, label in enumerate(labels):
            temp[i, label] = 1.
        labels = temp
    return features, labels
Example #5
Source File:    From elephas with MIT License 6 votes vote down vote up
def lp_to_simple_rdd(lp_rdd, categorical=False, nb_classes=None):
    """Convert a LabeledPoint RDD into an RDD of feature-label pairs

    :param lp_rdd: LabeledPoint RDD of features and labels
    :param categorical: boolean, if labels should be one-hot encode when returned
    :param nb_classes: int, number of total classes; when falsy and
        `categorical` is set, it is taken as ``max(label) + 1``
    :return: Spark RDD with feature-label pairs
    """
    if categorical:
        if not nb_classes:
            # Collect all labels to find the class count.
            labels = np.asarray(lp_rdd.map(
                lambda lp: lp.label).collect(), dtype='int32')
            nb_classes = np.max(labels) + 1
        rdd = lp_rdd.map(lambda lp: (from_vector(lp.features),
                                     encode_label(lp.label, nb_classes)))
    else:
        rdd = lp_rdd.map(lambda lp: (from_vector(lp.features), lp.label))
    return rdd
Example #6
Source File:    From Hanhan-Spark-Python with MIT License 6 votes vote down vote up
def parse_point(line):
    """Parse the text form of a LabeledPoint holding a SparseVector.

    The line is expected to contain ``(<label>, SparseVector(<size>, {<idx>: <val>, ...}))``.

    :param line: one line of text
    :return: a LabeledPoint, or None when the line does not match
    """
    import re

    # Raw strings keep the regex escapes from being read as string escapes.
    ptn1 = r"\(([\d\.]*),\sSparseVector\((.*?)\)\)"
    ptn2 = r"(\d+),\s+\{(.*?)\}"
    m = re.search(ptn1, line)
    if m:
        label = float(m.group(1))
        features_str = m.group(2)
        mx = re.search(ptn2, features_str)
        if mx is None:
            # Outer shape matched but the vector body did not.
            return None
        num = int(mx.group(1))
        fs = mx.group(2)
        idx_set = []
        tfidf_scores = []
        if fs != '':
            for f in fs.split(', '):
                parts = f.split(': ')
                idx_set.append(int(parts[0]))
                tfidf_scores.append(float(parts[1]))
        sp = SparseVector(num, idx_set, tfidf_scores)
        LP = LabeledPoint(label, sp)
        return LP
    return None

# Find the best step_size through cross validation, using RMSE as the error measurement 
Example #7
Source File:    From LearningApacheSpark with MIT License 6 votes vote down vote up
def test_chi_sq_pearson(self):
        """Check chiSqTest statistics per feature for several partition counts."""
        data = [
            LabeledPoint(0.0, Vectors.dense([0.5, 10.0])),
            LabeledPoint(0.0, Vectors.dense([1.5, 20.0])),
            LabeledPoint(1.0, Vectors.dense([1.5, 30.0])),
            LabeledPoint(0.0, Vectors.dense([3.5, 30.0])),
            LabeledPoint(0.0, Vectors.dense([3.5, 40.0])),
            LabeledPoint(1.0, Vectors.dense([3.5, 40.0]))
        ]

        # The expected values are the same for every partition count.
        for numParts in [2, 4, 6, 8]:
            chi = Statistics.chiSqTest(self.sc.parallelize(data, numParts))
            feature1 = chi[0]
            self.assertEqual(feature1.statistic, 0.75)
            self.assertEqual(feature1.degreesOfFreedom, 2)
            self.assertAlmostEqual(feature1.pValue, 0.6873, 4)

            feature2 = chi[1]
            self.assertEqual(feature2.statistic, 1.5)
            self.assertEqual(feature2.degreesOfFreedom, 3)
            self.assertAlmostEqual(feature2.pValue, 0.6823, 4)
Example #8
Source File:    From Hanhan-Spark-Python with MIT License 6 votes vote down vote up
def parse_point(line):
    """Parse the text form of a LabeledPoint holding a SparseVector.

    The line is expected to contain ``(<label>, SparseVector(<size>, {<idx>: <val>, ...}))``.

    :param line: one line of text
    :return: a LabeledPoint, or None when the line does not match
    """
    import re

    # Raw strings keep the regex escapes from being read as string escapes.
    ptn1 = r"\(([\d\.]*),\sSparseVector\((.*?)\)\)"
    ptn2 = r"(\d+),\s+\{(.*?)\}"
    m = re.search(ptn1, line)
    if m:
        label = float(m.group(1))
        features_str = m.group(2)
        mx = re.search(ptn2, features_str)
        if mx is None:
            # Outer shape matched but the vector body did not.
            return None
        num = int(mx.group(1))
        fs = mx.group(2)
        idx_set = []
        tfidf_scores = []
        if fs != '':
            for f in fs.split(', '):
                parts = f.split(': ')
                idx_set.append(int(parts[0]))
                tfidf_scores.append(float(parts[1]))
        sp = SparseVector(num, idx_set, tfidf_scores)
        LP = LabeledPoint(label, sp)
        return LP
    return None

# Find the best step_size through cross validation, using RMSE as the error measurement 
Example #9
Source File:    From spark-cluster-deployment with Apache License 2.0 6 votes vote down vote up
def saveAsLibSVMFile(data, dir):
        """
        Save labeled data in LIBSVM format.

        @param data: an RDD of LabeledPoint to be saved
        @param dir: directory to save the data

        >>> from tempfile import NamedTemporaryFile
        >>> from fileinput import input
        >>> from glob import glob
        >>> from pyspark.mllib.util import MLUtils
        >>> examples = [LabeledPoint(1.1, Vectors.sparse(3, [(0, 1.23), (2, 4.56)])),
        ...             LabeledPoint(0.0, Vectors.dense([1.01, 2.02, 3.03]))]
        >>> tempFile = NamedTemporaryFile(delete=True)
        >>> tempFile.close()
        >>> MLUtils.saveAsLibSVMFile(sc.parallelize(examples), tempFile.name)
        >>> ''.join(sorted(input(glob(tempFile.name + "/part-0000*"))))
        '0.0 1:1.01 2:2.02 3:3.03\\n1.1 1:1.23 3:4.56\\n'
        """
        # Render every point as one LIBSVM text line, then write the lines out.
        lines = data.map(lambda p: MLUtils._convert_labeled_point_to_libsvm(p))
        lines.saveAsTextFile(dir)
Example #10
Source File:    From spark-cluster-deployment with Apache License 2.0 5 votes vote down vote up
def _get_unmangled_double_vector_rdd(data):
    """Run `data` through _get_unmangled_rdd using the double-vector serializer."""
    serializer = _serialize_double_vector
    return _get_unmangled_rdd(data, serializer)

# Map a pickled Python RDD of LabeledPoint to a Java RDD of _serialized_labeled_points 
Example #11
Source File:    From LearningApacheSpark with MIT License 5 votes vote down vote up
def _train(cls, data, algo, numClasses, categoricalFeaturesInfo, numTrees,
               featureSubsetStrategy, impurity, maxDepth, maxBins, seed):
        """Validate the inputs, call "trainRandomForestModel" and wrap the result.

        The first element of `data` must be a LabeledPoint, and
        `featureSubsetStrategy` must be one of
        ``cls.supportedFeatureSubsetStrategies`` (ValueError otherwise).
        A random seed is drawn when `seed` is None.
        """
        sample = data.first()
        assert isinstance(sample, LabeledPoint), "the data should be RDD of LabeledPoint"
        strategy_supported = featureSubsetStrategy in cls.supportedFeatureSubsetStrategies
        if not strategy_supported:
            raise ValueError("unsupported featureSubsetStrategy: %s" % featureSubsetStrategy)
        if seed is None:
            seed = random.randint(0, 1 << 30)
        trained = callMLlibFunc(
            "trainRandomForestModel", data, algo, numClasses,
            categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity,
            maxDepth, maxBins, seed)
        return RandomForestModel(trained)
Example #12
Source File:    From LearningApacheSpark with MIT License 5 votes vote down vote up
def _train(cls, data, algo, categoricalFeaturesInfo,
               loss, numIterations, learningRate, maxDepth, maxBins):
        """Call "trainGradientBoostedTreesModel" and wrap the result.

        The first element of `data` must be a LabeledPoint.
        """
        sample = data.first()
        assert isinstance(sample, LabeledPoint), "the data should be RDD of LabeledPoint"
        trained = callMLlibFunc(
            "trainGradientBoostedTreesModel", data, algo, categoricalFeaturesInfo,
            loss, numIterations, learningRate, maxDepth, maxBins)
        return GradientBoostedTreesModel(trained)
Example #13
Source File:    From spark-cluster-deployment with Apache License 2.0 5 votes vote down vote up
def parsePoint(line):
    """Turn a space-separated line of numbers into a LabeledPoint.

    The first number is the label and the rest are the features.
    """
    fields = list(map(float, line.split(' ')))
    # Convert -1 labels to 0 for MLlib
    label = 0 if fields[0] == -1 else fields[0]
    return LabeledPoint(label, fields[1:])
Example #14
Source File:    From spark-cluster-deployment with Apache License 2.0 5 votes vote down vote up
def _convert_labeled_point_to_libsvm(p):
        """Converts a LabeledPoint to a string in LIBSVM format.

        The label comes first, followed by space-separated ``index:value``
        pairs with 1-based indices.

        Raises TypeError when the converted features are neither an ndarray
        nor a SparseVector.
        """
        items = [str(p.label)]
        v = _convert_vector(p.features)
        if isinstance(v, np.ndarray):
            for i, value in enumerate(v):
                items.append(str(i + 1) + ":" + str(value))
        elif isinstance(v, SparseVector):
            # Only the stored entries are written.
            for index, value in zip(v.indices, v.values):
                items.append(str(index + 1) + ":" + str(value))
        else:
            raise TypeError("_convert_labeled_point_to_libsvm needs either ndarray or SparseVector"
                            " but got %s" % type(v))
        return " ".join(items)
Example #15
Source File:    From spark-cluster-deployment with Apache License 2.0 5 votes vote down vote up
def _serialize_labeled_point(p):
    """Serialize a LabeledPoint with a features vector of any type.

    The result is a 9-byte header (LABELED_POINT_MAGIC, then the label as a
    float64) followed by the serialized features.
    """
    from pyspark.mllib.regression import LabeledPoint
    payload = _serialize_double_vector(p.features)
    prefix = bytearray(9)
    prefix[0] = LABELED_POINT_MAGIC
    # View bytes 1..8 of the header as one float64 and write the label through it.
    label_slot = ndarray(shape=[1], buffer=prefix, offset=1, dtype=float64)
    label_slot[0] = p.label
    return prefix + payload
Example #16
Source File:    From LearningApacheSpark with MIT License 5 votes vote down vote up
def _convert_labeled_point_to_libsvm(p):
        """Converts a LabeledPoint to a string in LIBSVM format.

        The label comes first, followed by space-separated ``index:value``
        pairs with 1-based indices. A SparseVector writes only its stored
        entries; any other vector writes every position.
        """
        from pyspark.mllib.regression import LabeledPoint
        assert isinstance(p, LabeledPoint)
        items = [str(p.label)]
        v = _convert_to_vector(p.features)
        if isinstance(v, SparseVector):
            for index, value in zip(v.indices, v.values):
                items.append(str(index + 1) + ":" + str(value))
        else:
            for i in range(len(v)):
                items.append(str(i + 1) + ":" + str(v[i]))
        return " ".join(items)
Example #17
Source File:    From intro_ds with Apache License 2.0 5 votes vote down vote up
def trans2RDD(data, sc):
    """Distribute `data` with `sc` and turn each row into a LabeledPoint.

    The first element of each row is the label and the rest are the features.
    """
    data = sc.parallelize(data)
    data = data.map(lambda line: LabeledPoint(line[0], line[1:]))
    return data
Example #18
Source File:    From intro_ds with Apache License 2.0 5 votes vote down vote up
def trans2RDD(data, sc):
    """Distribute `data` with `sc` and turn each row into a LabeledPoint.

    The first element of each row is the label and the rest are the features.
    """
    data = sc.parallelize(data)
    data = data.map(lambda line: LabeledPoint(line[0], line[1:]))
    return data
Example #19
Source File:    From ferry with Apache License 2.0 5 votes vote down vote up
def parsePoint(line):
    """Turn a comma- or space-separated line of numbers into a LabeledPoint.

    The first number is the label and the rest are the features.
    """
    tokens = line.replace(',', ' ').split(' ')
    numbers = list(map(float, tokens))
    return LabeledPoint(numbers[0], numbers[1:])
Example #20
Source File:    From spark-workshop with MIT License 5 votes vote down vote up
def parsePoint(line):
    """Parse one ';'-delimited CSV line into a LabeledPoint.

    The last field is the label and all earlier fields are the features.
    """
    # The builtin next() works on Python 2.6+ and 3; the .next() method is Python 2 only.
    values = next(csv.reader(StringIO(line), delimiter=";"))  # CSV parsing of line
    values = [float(x) for x in values]                       # Cast to all floats
    return LabeledPoint(values[-1], values[:-1])              # y = quality, X = row[:-1]
Example #21
Source File:    From spark-workshop with MIT License 5 votes vote down vote up
def parsePoint(tup):
    """
    Parse text data into floats.

    `tup[1]` is split on ';' and each field is cast to float.
    Return a LabeledPoint whose label is `tup[0]` and whose features are
    the parsed values after the first one.
    """
    values = [float(x) for x in tup[1].split(';')]
    return LabeledPoint(tup[0], values[1:])
Example #22
Source File:    From LearningApacheSpark with MIT License 5 votes vote down vote up
def fit(self, data):
        """
        Returns a ChiSquared feature selector.

        :param data: an `RDD[LabeledPoint]` containing the labeled dataset
                     with categorical features. Real-valued features will be
                     treated as categorical for each distinct value.
                     Apply feature discretizer before using this function.
        """
        jmodel = callMLlibFunc("fitChiSqSelector", self.selectorType, self.numTopFeatures,
                               self.percentile, self.fpr, self.fdr, self.fwe, data)
        return ChiSqSelectorModel(jmodel)
Example #23
Source File:    From Data_Analytics_with_Hadoop with MIT License 5 votes vote down vote up
def parsePoint(line):
    """Parse one ';'-delimited CSV line into a LabeledPoint.

    The last field is the label and all earlier fields are the features.
    """
    # The builtin next() works on Python 2.6+ and 3; the .next() method is Python 2 only.
    values = next(csv.reader(StringIO(line), delimiter=";"))  # CSV parsing of line
    values = [float(x) for x in values]                       # Cast to all floats
    return LabeledPoint(values[-1], values[:-1])              # y = quality, X = row[:-1]
Example #24
Source File:    From LearningApacheSpark with MIT License 5 votes vote down vote up
def test_right_number_of_results(self):
        """chiSqTest should return one result per feature column."""
        num_cols = 1001
        sparse_data = [
            LabeledPoint(0.0, Vectors.sparse(num_cols, [(100, 2.0)])),
            LabeledPoint(0.1, Vectors.sparse(num_cols, [(200, 1.0)]))
        ]
        chi = Statistics.chiSqTest(self.sc.parallelize(sparse_data))
        self.assertEqual(len(chi), num_cols)
Example #25
Source File:    From LearningApacheSpark with MIT License 5 votes vote down vote up
def test_regression(self):
        """Train several regressors on scipy-matrix features and check prediction signs."""
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree
        data = [
            LabeledPoint(-1.0, self.scipy_matrix(2, {1: -1.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
            LabeledPoint(-1.0, self.scipy_matrix(2, {1: -2.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features for p in data]

        def check_signs(model):
            # Points 0 and 2 have negative labels, points 1 and 3 positive ones.
            self.assertTrue(model.predict(features[0]) <= 0)
            self.assertTrue(model.predict(features[1]) > 0)
            self.assertTrue(model.predict(features[2]) <= 0)
            self.assertTrue(model.predict(features[3]) > 0)

        lr_model = LinearRegressionWithSGD.train(rdd)
        check_signs(lr_model)

        lasso_model = LassoWithSGD.train(rdd)
        check_signs(lasso_model)

        rr_model = RidgeRegressionWithSGD.train(rdd)
        check_signs(rr_model)

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        dt_model = DecisionTree.trainRegressor(rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        check_signs(dt_model)
Example #26
Source File:    From LearningApacheSpark with MIT License 5 votes vote down vote up
def test_classification(self):
        """Train several classifiers on scipy-matrix features and check predictions."""
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree
        data = [
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 1.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 2.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features for p in data]

        def check_predictions(model):
            # Points 0 and 2 are labeled 0.0, points 1 and 3 are labeled 1.0.
            self.assertTrue(model.predict(features[0]) <= 0)
            self.assertTrue(model.predict(features[1]) > 0)
            self.assertTrue(model.predict(features[2]) <= 0)
            self.assertTrue(model.predict(features[3]) > 0)

        lr_model = LogisticRegressionWithSGD.train(rdd)
        check_predictions(lr_model)

        svm_model = SVMWithSGD.train(rdd)
        check_predictions(svm_model)

        nb_model = NaiveBayes.train(rdd)
        check_predictions(nb_model)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(rdd, numClasses=2,
                                                categoricalFeaturesInfo=categoricalFeaturesInfo)
        check_predictions(dt_model)
Example #27
Source File:    From LearningApacheSpark with MIT License 5 votes vote down vote up
def test_infer_schema(self):
        """The "features" column of the inferred schema should use the vector UDT."""
        rdd = self.sc.parallelize([LabeledPoint(1.0, self.dv1), LabeledPoint(0.0, self.sv1)])
        df = rdd.toDF()
        schema = df.schema
        field = [f for f in schema.fields if f.name == "features"][0]
        self.assertEqual(field.dataType, self.udt)
        # Vectors read back from the DataFrame should equal the originals.
        vectors = df.rdd.map(lambda p: p.features).collect()
        self.assertEqual(len(vectors), 2)
        for v in vectors:
            if isinstance(v, SparseVector):
                self.assertEqual(v, self.sv1)
            elif isinstance(v, DenseVector):
                self.assertEqual(v, self.dv1)
            else:
                raise TypeError("expecting a vector but got %r of type %r" % (v, type(v)))
Example #28
Source File:    From Hanhan-Spark-Python with MIT License 5 votes vote down vote up
def get_lp(t):
    """Build a LabeledPoint from `t[1]`: label `t[1][0]`, features `t[1][1]`."""
    payload = t[1]
    return LabeledPoint(payload[0], payload[1])
Example #29
Source File:    From Hanhan-Spark-Python with MIT License 5 votes vote down vote up
def get_lp(t):
    """Build a LabeledPoint with label `t[0]` and features `t[1]`."""
    return LabeledPoint(t[0], t[1])
Example #30
Source File:    From Hanhan-Spark-Python with MIT License 5 votes vote down vote up
def get_lp(t):
    """Build a LabeledPoint from `t[1]`: label `t[1][0]`, features `t[1][1]`."""
    payload = t[1]
    return LabeledPoint(payload[0], payload[1])