Python pyspark.mllib.regression.LabeledPoint() Examples

The following are 30 code examples of pyspark.mllib.regression.LabeledPoint(), drawn from open-source projects. The source file and originating project for each example are noted above it. You may also want to check out all available functions and classes of the module pyspark.mllib.regression.
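A LabeledPoint pairs a float label with a dense or sparse feature vector, and most pyspark.mllib training APIs expect an RDD of them. A minimal construction sketch, assuming a local pyspark installation:

from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint

# Dense features: label 1.0 with three explicit values.
dense_lp = LabeledPoint(1.0, Vectors.dense([0.1, 0.2, 0.3]))

# Sparse features: label 0.0 with non-zeros at indices 0 and 2 out of size 3.
sparse_lp = LabeledPoint(0.0, Vectors.sparse(3, [(0, 1.5), (2, 4.0)]))

print(dense_lp.label, dense_lp.features)  # 1.0 [0.1,0.2,0.3]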
Example #1
Source File: util.py    From LearningApacheSpark with MIT License
def loadLabeledPoints(sc, path, minPartitions=None):
        """
        Load labeled points saved using RDD.saveAsTextFile.

        :param sc: Spark context
        :param path: file or directory path in any Hadoop-supported file
                     system URI
        :param minPartitions: min number of partitions
        :return: labeled data stored as an RDD of LabeledPoint

        >>> from tempfile import NamedTemporaryFile
        >>> from pyspark.mllib.util import MLUtils
        >>> from pyspark.mllib.regression import LabeledPoint
        >>> from pyspark.mllib.linalg import Vectors
        >>> examples = [LabeledPoint(1.1, Vectors.sparse(3, [(0, -1.23), (2, 4.56e-7)])),
        ...             LabeledPoint(0.0, Vectors.dense([1.01, 2.02, 3.03]))]
        >>> tempFile = NamedTemporaryFile(delete=True)
        >>> tempFile.close()
        >>> sc.parallelize(examples, 1).saveAsTextFile(tempFile.name)
        >>> MLUtils.loadLabeledPoints(sc, tempFile.name).collect()
        [LabeledPoint(1.1, (3,[0,2],[-1.23,4.56e-07])), LabeledPoint(0.0, [1.01,2.02,3.03])]
        """
        minPartitions = minPartitions or min(sc.defaultParallelism, 2)
        return callMLlibFunc("loadLabeledPoints", sc, path, minPartitions) 
Example #2
Source File: util.py    From LearningApacheSpark with MIT License
def saveAsLibSVMFile(data, dir):
        """
        Save labeled data in LIBSVM format.

        :param data: an RDD of LabeledPoint to be saved
        :param dir: directory to save the data

        >>> from tempfile import NamedTemporaryFile
        >>> from fileinput import input
        >>> from pyspark.mllib.regression import LabeledPoint
        >>> from glob import glob
        >>> from pyspark.mllib.util import MLUtils
        >>> from pyspark.mllib.linalg import Vectors
        >>> examples = [LabeledPoint(1.1, Vectors.sparse(3, [(0, 1.23), (2, 4.56)])),
        ...             LabeledPoint(0.0, Vectors.dense([1.01, 2.02, 3.03]))]
        >>> tempFile = NamedTemporaryFile(delete=True)
        >>> tempFile.close()
        >>> MLUtils.saveAsLibSVMFile(sc.parallelize(examples), tempFile.name)
        >>> ''.join(sorted(input(glob(tempFile.name + "/part-0000*"))))
        '0.0 1:1.01 2:2.02 3:3.03\\n1.1 1:1.23 3:4.56\\n'
        """
        lines = data.map(lambda p: MLUtils._convert_labeled_point_to_libsvm(p))
        lines.saveAsTextFile(dir) 
Example #3
Source File: rdd_utils.py    From elephas with MIT License
def to_labeled_point(sc, features, labels, categorical=False):
    """Convert numpy arrays of features and labels into
    a LabeledPoint RDD for MLlib and ML integration.

    :param sc: Spark context
    :param features: numpy array with features
    :param labels: numpy array with labels
    :param categorical: boolean, whether labels are already one-hot encoded or not
    :return: LabeledPoint RDD with features and labels
    """
    labeled_points = []
    for x, y in zip(features, labels):
        if categorical:
            lp = LabeledPoint(np.argmax(y), to_vector(x))
        else:
            lp = LabeledPoint(y, to_vector(x))
        labeled_points.append(lp)
    return sc.parallelize(labeled_points) 
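A hedged usage sketch for to_labeled_point, assuming an active SparkContext named sc and the imports of the surrounding elephas rdd_utils module (numpy as np, to_vector, LabeledPoint); the toy data is hypothetical:

import numpy as np

features = np.random.rand(6, 4)            # 6 samples, 4 features each
labels = np.random.randint(0, 2, size=6)   # binary integer labels
lp_rdd = to_labeled_point(sc, features, labels, categorical=False)
print(lp_rdd.first())                      # LabeledPoint(<label>, [f0,f1,f2,f3])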
Example #4
Source File: rdd_utils.py    From elephas with MIT License
def from_labeled_point(rdd, categorical=False, nb_classes=None):
    """Convert a LabeledPoint RDD back to a pair of numpy arrays

    :param rdd: LabeledPoint RDD
    :param categorical: boolean, if labels should be one-hot encode when returned
    :param nb_classes: optional int, indicating the number of class labels
    :return: pair of numpy arrays, features and labels
    """
    features = np.asarray(
        rdd.map(lambda lp: from_vector(lp.features)).collect())
    labels = np.asarray(rdd.map(lambda lp: lp.label).collect(), dtype='int32')
    if categorical:
        if not nb_classes:
            nb_classes = np.max(labels) + 1
        temp = np.zeros((len(labels), nb_classes))
        for i, label in enumerate(labels):
            temp[i, label] = 1.
        labels = temp
    return features, labels 
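The inverse direction, sketched by reusing the hypothetical lp_rdd built after Example #3; with categorical=True the labels come back one-hot encoded:

features, labels = from_labeled_point(lp_rdd, categorical=True, nb_classes=2)
print(features.shape)  # (6, 4)
print(labels.shape)    # (6, 2) -- one-hot rows such as [1., 0.]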
Example #5
Source File: rdd_utils.py    From elephas with MIT License
def lp_to_simple_rdd(lp_rdd, categorical=False, nb_classes=None):
    """Convert a LabeledPoint RDD into an RDD of feature-label pairs

    :param lp_rdd: LabeledPoint RDD of features and labels
    :param categorical: boolean, if labels should be one-hot encode when returned
    :param nb_classes: int, number of total classes
    :return: Spark RDD with feature-label pairs
    """
    if categorical:
        if not nb_classes:
            labels = np.asarray(lp_rdd.map(
                lambda lp: lp.label).collect(), dtype='int32')
            nb_classes = np.max(labels) + 1
        rdd = lp_rdd.map(lambda lp: (from_vector(lp.features),
                                     encode_label(lp.label, nb_classes)))
    else:
        rdd = lp_rdd.map(lambda lp: (from_vector(lp.features), lp.label))
    return rdd 
Example #6
Source File: tfidf_cv_lowestRMSE.py    From Hanhan-Spark-Python with MIT License
def parse_point(line):
    ptn1 = r"\(([\d\.]*),\sSparseVector\((.*?)\)\)"
    ptn2 = r"(\d+),\s+\{(.*?)\}"
    m = re.search(ptn1, line)
    if m:
        label = float(m.group(1))
        features_str = m.group(2)
        mx = re.search(ptn2, features_str)
        num = float(mx.group(1))
        fs = mx.group(2)
        idx_set = []
        tfidf_scores = []
        if fs != '':
            fs_split = fs.split(', ')
            for f in fs_split:
                idx_set.append(f.split(': ')[0])
                tfidf_scores.append(f.split(': ')[1])
        sp = SparseVector(num, idx_set, tfidf_scores)
        LP = LabeledPoint(label, sp)
        return LP
    return None


# Find the best step_size through cross validation, using RMSE as the error measurement 
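A hedged sketch of the stringified (label, SparseVector) lines this parse_point is written for; the sample line is hypothetical but follows the format the two regexes expect (assumes the module's imports: re, SparseVector, LabeledPoint):

line = "(4.0, SparseVector(5, {0: 0.31, 3: 1.27}))"
lp = parse_point(line)
# -> LabeledPoint(4.0, (5,[0,3],[0.31,1.27])) when the pattern matches, None otherwise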
Example #7
Source File: tests.py    From LearningApacheSpark with MIT License
def test_chi_sq_pearson(self):
        data = [
            LabeledPoint(0.0, Vectors.dense([0.5, 10.0])),
            LabeledPoint(0.0, Vectors.dense([1.5, 20.0])),
            LabeledPoint(1.0, Vectors.dense([1.5, 30.0])),
            LabeledPoint(0.0, Vectors.dense([3.5, 30.0])),
            LabeledPoint(0.0, Vectors.dense([3.5, 40.0])),
            LabeledPoint(1.0, Vectors.dense([3.5, 40.0]))
        ]

        for numParts in [2, 4, 6, 8]:
            chi = Statistics.chiSqTest(self.sc.parallelize(data, numParts))
            feature1 = chi[0]
            self.assertEqual(feature1.statistic, 0.75)
            self.assertEqual(feature1.degreesOfFreedom, 2)
            self.assertAlmostEqual(feature1.pValue, 0.6873, 4)

            feature2 = chi[1]
            self.assertEqual(feature2.statistic, 1.5)
            self.assertEqual(feature2.degreesOfFreedom, 3)
            self.assertAlmostEqual(feature2.pValue, 0.6823, 4) 
Example #8
Source File: tfidf_cv_lowestRMSE_normalized.py    From Hanhan-Spark-Python with MIT License
def parse_point(line):
    ptn1 = r"\(([\d\.]*),\sSparseVector\((.*?)\)\)"
    ptn2 = r"(\d+),\s+\{(.*?)\}"
    m = re.search(ptn1, line)
    if m:
        label = float(m.group(1))
        features_str = m.group(2)
        mx = re.search(ptn2, features_str)
        num = float(mx.group(1))
        fs = mx.group(2)
        idx_set = []
        tfidf_scores = []
        if fs != '':
            fs_split = fs.split(', ')
            for f in fs_split:
                idx_set.append(f.split(': ')[0])
                tfidf_scores.append(f.split(': ')[1])
        sp = SparseVector(num, idx_set, tfidf_scores)
        LP = LabeledPoint(label, sp)
        return LP
    return None


# Find the best step_size through cross validation, using RMSE as the error measurement 
Example #9
Source File: util.py    From spark-cluster-deployment with Apache License 2.0
def saveAsLibSVMFile(data, dir):
        """
        Save labeled data in LIBSVM format.

        @param data: an RDD of LabeledPoint to be saved
        @param dir: directory to save the data

        >>> from tempfile import NamedTemporaryFile
        >>> from fileinput import input
        >>> from glob import glob
        >>> from pyspark.mllib.util import MLUtils
        >>> from pyspark.mllib.regression import LabeledPoint
        >>> from pyspark.mllib.linalg import Vectors
        >>> examples = [LabeledPoint(1.1, Vectors.sparse(3, [(0, 1.23), (2, 4.56)])), \
                        LabeledPoint(0.0, Vectors.dense([1.01, 2.02, 3.03]))]
        >>> tempFile = NamedTemporaryFile(delete=True)
        >>> tempFile.close()
        >>> MLUtils.saveAsLibSVMFile(sc.parallelize(examples), tempFile.name)
        >>> ''.join(sorted(input(glob(tempFile.name + "/part-0000*"))))
        '0.0 1:1.01 2:2.02 3:3.03\\n1.1 1:1.23 3:4.56\\n'
        """
        lines = data.map(lambda p: MLUtils._convert_labeled_point_to_libsvm(p))
        lines.saveAsTextFile(dir) 
Example #10
Source File: _common.py    From spark-cluster-deployment with Apache License 2.0
def _get_unmangled_double_vector_rdd(data):
    return _get_unmangled_rdd(data, _serialize_double_vector)


# Map a pickled Python RDD of LabeledPoint to a Java RDD of _serialized_labeled_points 
Example #11
Source File: tree.py    From LearningApacheSpark with MIT License
def _train(cls, data, algo, numClasses, categoricalFeaturesInfo, numTrees,
               featureSubsetStrategy, impurity, maxDepth, maxBins, seed):
        first = data.first()
        assert isinstance(first, LabeledPoint), "the data should be RDD of LabeledPoint"
        if featureSubsetStrategy not in cls.supportedFeatureSubsetStrategies:
            raise ValueError("unsupported featureSubsetStrategy: %s" % featureSubsetStrategy)
        if seed is None:
            seed = random.randint(0, 1 << 30)
        model = callMLlibFunc("trainRandomForestModel", data, algo, numClasses,
                              categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity,
                              maxDepth, maxBins, seed)
        return RandomForestModel(model) 
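This internal _train helper is reached through the public RandomForest.trainClassifier / trainRegressor wrappers; a hedged call sketch, assuming an RDD of LabeledPoint named training_data:

from pyspark.mllib.tree import RandomForest

model = RandomForest.trainClassifier(training_data, numClasses=2,
                                     categoricalFeaturesInfo={}, numTrees=3,
                                     featureSubsetStrategy="auto", impurity="gini",
                                     maxDepth=4, maxBins=32, seed=42)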
Example #12
Source File: tree.py    From LearningApacheSpark with MIT License
def _train(cls, data, algo, categoricalFeaturesInfo,
               loss, numIterations, learningRate, maxDepth, maxBins):
        first = data.first()
        assert isinstance(first, LabeledPoint), "the data should be RDD of LabeledPoint"
        model = callMLlibFunc("trainGradientBoostedTreesModel", data, algo, categoricalFeaturesInfo,
                              loss, numIterations, learningRate, maxDepth, maxBins)
        return GradientBoostedTreesModel(model) 
Example #13
Source File: logistic_regression.py    From spark-cluster-deployment with Apache License 2.0
def parsePoint(line):
    values = [float(s) for s in line.split(' ')]
    if values[0] == -1:   # Convert -1 labels to 0 for MLlib
        values[0] = 0
    return LabeledPoint(values[0], values[1:]) 
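A hedged end-to-end sketch pairing this parser with LogisticRegressionWithSGD (assumes an active SparkContext sc and a hypothetical whitespace-delimited file whose first field is a -1/1 label):

from pyspark.mllib.classification import LogisticRegressionWithSGD

points = sc.textFile("data/sample_svm_data.txt").map(parsePoint)  # hypothetical path
model = LogisticRegressionWithSGD.train(points, iterations=100)
print(model.weights)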
Example #14
Source File: util.py    From spark-cluster-deployment with Apache License 2.0
def _convert_labeled_point_to_libsvm(p):
        """Converts a LabeledPoint to a string in LIBSVM format."""
        items = [str(p.label)]
        v = _convert_vector(p.features)
        if type(v) == np.ndarray:
            for i in xrange(len(v)):
                items.append(str(i + 1) + ":" + str(v[i]))
        elif type(v) == SparseVector:
            nnz = len(v.indices)
            for i in xrange(nnz):
                items.append(str(v.indices[i] + 1) + ":" + str(v.values[i]))
        else:
            raise TypeError("_convert_labeled_point_to_libsvm needs either ndarray or SparseVector"
                            " but got " % type(v))
        return " ".join(items) 
Example #15
Source File: _common.py    From spark-cluster-deployment with Apache License 2.0
def _serialize_labeled_point(p):
    """Serialize a LabeledPoint with a features vector of any type."""
    from pyspark.mllib.regression import LabeledPoint
    serialized_features = _serialize_double_vector(p.features)
    header = bytearray(9)
    header[0] = LABELED_POINT_MAGIC
    header_float = ndarray(shape=[1], buffer=header, offset=1, dtype=float64)
    header_float[0] = p.label
    return header + serialized_features 
Example #16
Source File: util.py    From LearningApacheSpark with MIT License
def _convert_labeled_point_to_libsvm(p):
        """Converts a LabeledPoint to a string in LIBSVM format."""
        from pyspark.mllib.regression import LabeledPoint
        assert isinstance(p, LabeledPoint)
        items = [str(p.label)]
        v = _convert_to_vector(p.features)
        if isinstance(v, SparseVector):
            nnz = len(v.indices)
            for i in xrange(nnz):
                items.append(str(v.indices[i] + 1) + ":" + str(v.values[i]))
        else:
            for i in xrange(len(v)):
                items.append(str(i + 1) + ":" + str(v[i]))
        return " ".join(items) 
Example #17
Source File: sparkml_vs_sklearn_solution.py    From intro_ds with Apache License 2.0
def trans2RDD(data, sc):
    """
    Convert the Python-side data into an RDD
    """
    data = sc.parallelize(data)
    data = data.map(lambda line: LabeledPoint(line[0], line[1:]))
    return data 
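A hedged usage sketch with a small numpy array whose first column is the label (assumes an active SparkContext sc):

import numpy as np

data = np.array([[1.0, 0.2, 0.5],
                 [0.0, 1.1, 0.3]])
rdd = trans2RDD(data, sc)
print(rdd.collect())  # [LabeledPoint(1.0, [0.2,0.5]), LabeledPoint(0.0, [1.1,0.3])]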
Example #18
Source File: sparkml_vs_sklearn.py    From intro_ds with Apache License 2.0
def trans2RDD(data, sc):
    """
    Convert the Python-side data into an RDD
    """
    data = sc.parallelize(data)
    data = data.map(lambda line: LabeledPoint(line[0], line[1:]))
    return data 
Example #19
Source File: regression.py    From ferry with Apache License 2.0
def parsePoint(line):
    values = [float(x) for x in line.replace(',', ' ').split(' ')]
    return LabeledPoint(values[0], values[1:]) 
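A hedged regression sketch using this parser with LinearRegressionWithSGD (assumes an active SparkContext sc and a hypothetical file where the first value on each line is the target):

from pyspark.mllib.regression import LinearRegressionWithSGD

parsed = sc.textFile("data/lpsa.data").map(parsePoint)  # hypothetical path
model = LinearRegressionWithSGD.train(parsed, iterations=100, step=0.1)
print(model.intercept, model.weights)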
Example #20
Source File: wines.py    From spark-workshop with MIT License
def parsePoint(line):
    values = next(csv.reader(StringIO(line), delimiter=";"))  # CSV parsing of line
    values = [float(x) for x in values]                       # Cast to all floats
    return LabeledPoint(values[-1], values[:-1])              # y = quality, X = row[:-1] 
Example #21
Source File: logreg_wines.py    From spark-workshop with MIT License
def parsePoint(tup):
    """
    Parse text data into floats.
    Return a LabeledPoint of (label, features).
    """
    values = [float(x) for x in tup[1].split(';')]
    return LabeledPoint(tup[0], values[1:]) 
Example #22
Source File: feature.py    From LearningApacheSpark with MIT License
def fit(self, data):
        """
        Returns a ChiSquared feature selector model fit to the given data.

        :param data: an `RDD[LabeledPoint]` containing the labeled dataset
                     with categorical features. Real-valued features will be
                     treated as categorical for each distinct value.
                     Apply feature discretizer before using this function.
        """
        jmodel = callMLlibFunc("fitChiSqSelector", self.selectorType, self.numTopFeatures,
                               self.percentile, self.fpr, self.fdr, self.fwe, data)
        return ChiSqSelectorModel(jmodel) 
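A hedged usage sketch for this fit method on a tiny labeled dataset with categorical-valued features (assumes an active SparkContext sc):

from pyspark.mllib.feature import ChiSqSelector
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint

data = sc.parallelize([
    LabeledPoint(0.0, Vectors.dense([0.0, 1.0, 3.0])),
    LabeledPoint(1.0, Vectors.dense([1.0, 0.0, 8.0])),
    LabeledPoint(1.0, Vectors.dense([2.0, 9.0, 6.0])),
])
model = ChiSqSelector(numTopFeatures=1).fit(data)
filtered = model.transform(data.map(lambda lp: lp.features))
print(filtered.collect())  # one selected feature per point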
Example #23
Source File: wines.py    From Data_Analytics_with_Hadoop with MIT License
def parsePoint(line):
    values = next(csv.reader(StringIO(line), delimiter=";"))  # CSV parsing of line
    values = [float(x) for x in values]                       # Cast to all floats
    return LabeledPoint(values[-1], values[:-1])              # y = quality, X = row[:-1] 
Example #24
Source File: tests.py    From LearningApacheSpark with MIT License
def test_right_number_of_results(self):
        num_cols = 1001
        sparse_data = [
            LabeledPoint(0.0, Vectors.sparse(num_cols, [(100, 2.0)])),
            LabeledPoint(0.1, Vectors.sparse(num_cols, [(200, 1.0)]))
        ]
        chi = Statistics.chiSqTest(self.sc.parallelize(sparse_data))
        self.assertEqual(len(chi), num_cols)
        self.assertIsNotNone(chi[1000]) 
Example #25
Source File: tests.py    From LearningApacheSpark with MIT License
def test_regression(self):
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree
        data = [
            LabeledPoint(-1.0, self.scipy_matrix(2, {1: -1.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
            LabeledPoint(-1.0, self.scipy_matrix(2, {1: -2.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        lasso_model = LassoWithSGD.train(rdd)
        self.assertTrue(lasso_model.predict(features[0]) <= 0)
        self.assertTrue(lasso_model.predict(features[1]) > 0)
        self.assertTrue(lasso_model.predict(features[2]) <= 0)
        self.assertTrue(lasso_model.predict(features[3]) > 0)

        rr_model = RidgeRegressionWithSGD.train(rdd)
        self.assertTrue(rr_model.predict(features[0]) <= 0)
        self.assertTrue(rr_model.predict(features[1]) > 0)
        self.assertTrue(rr_model.predict(features[2]) <= 0)
        self.assertTrue(rr_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        dt_model = DecisionTree.trainRegressor(rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0) 
Example #26
Source File: tests.py    From LearningApacheSpark with MIT License
def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree
        data = [
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 1.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 2.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features for p in data]

        lr_model = LogisticRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(rdd, numClasses=2,
                                                categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0) 
Example #27
Source File: tests.py    From LearningApacheSpark with MIT License
def test_infer_schema(self):
        rdd = self.sc.parallelize([LabeledPoint(1.0, self.dv1), LabeledPoint(0.0, self.sv1)])
        df = rdd.toDF()
        schema = df.schema
        field = [f for f in schema.fields if f.name == "features"][0]
        self.assertEqual(field.dataType, self.udt)
        vectors = df.rdd.map(lambda p: p.features).collect()
        self.assertEqual(len(vectors), 2)
        for v in vectors:
            if isinstance(v, SparseVector):
                self.assertEqual(v, self.sv1)
            elif isinstance(v, DenseVector):
                self.assertEqual(v, self.dv1)
            else:
                raise TypeError("expecting a vector but got %r of type %r" % (v, type(v))) 
Example #28
Source File: GradientBoostedTrees.py    From Hanhan-Spark-Python with MIT License
def get_lp(t):
    rating = t[1][0]
    avg_features = t[1][1]
    return LabeledPoint(rating, avg_features) 
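A hedged sketch of the pair-RDD shape this helper expects, e.g. the result of joining ratings with averaged feature vectors (assumes an active SparkContext sc; the records are hypothetical):

from pyspark.mllib.linalg import Vectors

pairs = sc.parallelize([
    ("review_1", (4.0, Vectors.dense([0.1, 0.7]))),
    ("review_2", (2.0, Vectors.dense([0.9, 0.3]))),
])
lp_rdd = pairs.map(get_lp)  # RDD of LabeledPoint(rating, features)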
Example #29
Source File: word2vec_histogram_best_RMSE.py    From Hanhan-Spark-Python with MIT License
def get_lp(t):
    rating = t[0]
    sp = t[1]
    return LabeledPoint(rating, sp) 
Example #30
Source File: word2vec_best_RMSE.py    From Hanhan-Spark-Python with MIT License
def get_lp(t):
    rating = t[1][0]
    avg_features = t[1][1]
    return LabeledPoint(rating, avg_features)