Python pyspark.mllib.regression.LabeledPoint() Examples

The following are 30 code examples of pyspark.mllib.regression.LabeledPoint(), drawn from open-source projects. The source file and originating project for each example are noted above it. You may also want to check out all available functions and classes of the module pyspark.mllib.regression.
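A LabeledPoint pairs a float label with a dense or sparse feature vector, and most pyspark.mllib training APIs expect an RDD of them. A minimal construction sketch, assuming a local pyspark installation:

from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint

# Dense features: label 1.0 with three explicit values.
dense_lp = LabeledPoint(1.0, Vectors.dense([0.1, 0.2, 0.3]))

# Sparse features: label 0.0 with non-zeros at indices 0 and 2 out of size 3.
sparse_lp = LabeledPoint(0.0, Vectors.sparse(3, [(0, 1.5), (2, 4.0)]))

print(dense_lp.label, dense_lp.features)  # 1.0 [0.1,0.2,0.3]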
Example #1
Source File: util.py    From LearningApacheSpark with MIT License
def loadLabeledPoints(sc, path, minPartitions=None):
        """
        Load labeled points saved using RDD.saveAsTextFile.

        :param sc: Spark context
        :param path: file or directory path in any Hadoop-supported file
                     system URI
        :param minPartitions: min number of partitions
        :return: labeled data stored as an RDD of LabeledPoint

        >>> from tempfile import NamedTemporaryFile
        >>> from pyspark.mllib.util import MLUtils
        >>> from pyspark.mllib.regression import LabeledPoint
        >>> from pyspark.mllib.linalg import Vectors
        >>> examples = [LabeledPoint(1.1, Vectors.sparse(3, [(0, -1.23), (2, 4.56e-7)])),
        ...             LabeledPoint(0.0, Vectors.dense([1.01, 2.02, 3.03]))]
        >>> tempFile = NamedTemporaryFile(delete=True)
        >>> tempFile.close()
        >>> sc.parallelize(examples, 1).saveAsTextFile(tempFile.name)
        >>> MLUtils.loadLabeledPoints(sc, tempFile.name).collect()
        [LabeledPoint(1.1, (3,[0,2],[-1.23,4.56e-07])), LabeledPoint(0.0, [1.01,2.02,3.03])]
        """
        minPartitions = minPartitions or min(sc.defaultParallelism, 2)
        return callMLlibFunc("loadLabeledPoints", sc, path, minPartitions) 
Example #2
Source File: util.py    From LearningApacheSpark with MIT License
def saveAsLibSVMFile(data, dir):
        """
        Save labeled data in LIBSVM format.

        :param data: an RDD of LabeledPoint to be saved
        :param dir: directory to save the data

        >>> from tempfile import NamedTemporaryFile
        >>> from fileinput import input
        >>> from pyspark.mllib.regression import LabeledPoint
        >>> from glob import glob
        >>> from pyspark.mllib.util import MLUtils
        >>> from pyspark.mllib.linalg import Vectors
        >>> examples = [LabeledPoint(1.1, Vectors.sparse(3, [(0, 1.23), (2, 4.56)])),
        ...             LabeledPoint(0.0, Vectors.dense([1.01, 2.02, 3.03]))]
        >>> tempFile = NamedTemporaryFile(delete=True)
        >>> tempFile.close()
        >>> MLUtils.saveAsLibSVMFile(sc.parallelize(examples), tempFile.name)
        >>> ''.join(sorted(input(glob(tempFile.name + "/part-0000*"))))
        '0.0 1:1.01 2:2.02 3:3.03\\n1.1 1:1.23 3:4.56\\n'
        """
        lines = data.map(lambda p: MLUtils._convert_labeled_point_to_libsvm(p))
        lines.saveAsTextFile(dir) 
Example #3
Source File: rdd_utils.py    From elephas with MIT License
def to_labeled_point(sc, features, labels, categorical=False):
    """Convert numpy arrays of features and labels into
    a LabeledPoint RDD for MLlib and ML integration.

    :param sc: Spark context
    :param features: numpy array with features
    :param labels: numpy array with labels
    :param categorical: boolean, whether labels are already one-hot encoded or not
    :return: LabeledPoint RDD with features and labels
    """
    labeled_points = []
    for x, y in zip(features, labels):
        if categorical:
            lp = LabeledPoint(np.argmax(y), to_vector(x))
        else:
            lp = LabeledPoint(y, to_vector(x))
        labeled_points.append(lp)
    return sc.parallelize(labeled_points) 
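A hedged usage sketch for to_labeled_point, assuming an active SparkContext named sc and the imports of the surrounding elephas rdd_utils module (numpy as np, to_vector, LabeledPoint); the toy data is hypothetical:

import numpy as np

features = np.random.rand(6, 4)            # 6 samples, 4 features each
labels = np.random.randint(0, 2, size=6)   # binary integer labels
lp_rdd = to_labeled_point(sc, features, labels, categorical=False)
print(lp_rdd.first())                      # LabeledPoint(<label>, [f0,f1,f2,f3])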
Example #4
Source File: rdd_utils.py    From elephas with MIT License
def from_labeled_point(rdd, categorical=False, nb_classes=None):
    """Convert a LabeledPoint RDD back to a pair of numpy arrays

    :param rdd: LabeledPoint RDD
    :param categorical: boolean, if labels should be one-hot encode when returned
    :param nb_classes: optional int, indicating the number of class labels
    :return: pair of numpy arrays, features and labels
    """
    features = np.asarray(
        rdd.map(lambda lp: from_vector(lp.features)).collect())
    labels = np.asarray(rdd.map(lambda lp: lp.label).collect(), dtype='int32')
    if categorical:
        if not nb_classes:
            nb_classes = np.max(labels) + 1
        temp = np.zeros((len(labels), nb_classes))
        for i, label in enumerate(labels):
            temp[i, label] = 1.
        labels = temp
    return features, labels 
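The inverse direction, sketched by reusing the hypothetical lp_rdd built after Example #3; with categorical=True the labels come back one-hot encoded:

features, labels = from_labeled_point(lp_rdd, categorical=True, nb_classes=2)
print(features.shape)  # (6, 4)
print(labels.shape)    # (6, 2) -- one-hot rows such as [1., 0.]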
Example #5
Source File: rdd_utils.py    From elephas with MIT License
def lp_to_simple_rdd(lp_rdd, categorical=False, nb_classes=None):
    """Convert a LabeledPoint RDD into an RDD of feature-label pairs

    :param lp_rdd: LabeledPoint RDD of features and labels
    :param categorical: boolean, if labels should be one-hot encode when returned
    :param nb_classes: int, number of total classes
    :return: Spark RDD with feature-label pairs
    """
    if categorical:
        if not nb_classes:
            labels = np.asarray(lp_rdd.map(
                lambda lp: lp.label).collect(), dtype='int32')
            nb_classes = np.max(labels) + 1
        rdd = lp_rdd.map(lambda lp: (from_vector(lp.features),
                                     encode_label(lp.label, nb_classes)))
    else:
        rdd = lp_rdd.map(lambda lp: (from_vector(lp.features), lp.label))
    return rdd 
Example #6
Source File: tfidf_cv_lowestRMSE.py    From Hanhan-Spark-Python with MIT License
def parse_point(line):
    ptn1 = r"\(([\d\.]*),\sSparseVector\((.*?)\)\)"
    ptn2 = r"(\d+),\s+\{(.*?)\}"
    m = re.search(ptn1, line)
    if m:
        label = float(m.group(1))
        features_str = m.group(2)
        mx = re.search(ptn2, features_str)
        num = float(mx.group(1))
        fs = mx.group(2)
        idx_set = []
        tfidf_scores = []
        if fs != '':
            fs_split = fs.split(', ')
            for f in fs_split:
                idx_set.append(f.split(': ')[0])
                tfidf_scores.append(f.split(': ')[1])
        sp = SparseVector(num, idx_set, tfidf_scores)
        LP = LabeledPoint(label, sp)
        return LP
    return None


# Find the best step_size through cross validation, using RMSE as the error measurement 
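A hedged sketch of the stringified (label, SparseVector) lines this parse_point is written for; the sample line is hypothetical but follows the format the two regexes expect (assumes the module's imports: re, SparseVector, LabeledPoint):

line = "(4.0, SparseVector(5, {0: 0.31, 3: 1.27}))"
lp = parse_point(line)
# -> LabeledPoint(4.0, (5,[0,3],[0.31,1.27])) when the pattern matches, None otherwise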
Example #7
Source File: tests.py    From LearningApacheSpark with MIT License
def test_chi_sq_pearson(self):
        data = [
            LabeledPoint(0.0, Vectors.dense([0.5, 10.0])),
            LabeledPoint(0.0, Vectors.dense([1.5, 20.0])),
            LabeledPoint(1.0, Vectors.dense([1.5, 30.0])),
            LabeledPoint(0.0, Vectors.dense([3.5, 30.0])),
            LabeledPoint(0.0, Vectors.dense([3.5, 40.0])),
            LabeledPoint(1.0, Vectors.dense([3.5, 40.0]))
        ]

        for numParts in [2, 4, 6, 8]:
            chi = Statistics.chiSqTest(self.sc.parallelize(data, numParts))
            feature1 = chi[0]
            self.assertEqual(feature1.statistic, 0.75)
            self.assertEqual(feature1.degreesOfFreedom, 2)
            self.assertAlmostEqual(feature1.pValue, 0.6873, 4)

            feature2 = chi[1]
            self.assertEqual(feature2.statistic, 1.5)
            self.assertEqual(feature2.degreesOfFreedom, 3)
            self.assertAlmostEqual(feature2.pValue, 0.6823, 4) 
Example #8
Source File: tfidf_cv_lowestRMSE_normalized.py    From Hanhan-Spark-Python with MIT License
def parse_point(line):
    ptn1 = r"\(([\d\.]*),\sSparseVector\((.*?)\)\)"
    ptn2 = r"(\d+),\s+\{(.*?)\}"
    m = re.search(ptn1, line)
    if m:
        label = float(m.group(1))
        features_str = m.group(2)
        mx = re.search(ptn2, features_str)
        num = float(mx.group(1))
        fs = mx.group(2)
        idx_set = []
        tfidf_scores = []
        if fs != '':
            fs_split = fs.split(', ')
            for f in fs_split:
                idx_set.append(f.split(': ')[0])
                tfidf_scores.append(f.split(': ')[1])
        sp = SparseVector(num, idx_set, tfidf_scores)
        LP = LabeledPoint(label, sp)
        return LP
    return None


# Find the best step_size through cross validation, using RMSE as the error measurement 
Example #9
Source File: util.py    From spark-cluster-deployment with Apache License 2.0
def saveAsLibSVMFile(data, dir):
        """
        Save labeled data in LIBSVM format.

        @param data: an RDD of LabeledPoint to be saved
        @param dir: directory to save the data

        >>> from tempfile import NamedTemporaryFile
        >>> from fileinput import input
        >>> from glob import glob
        >>> from pyspark.mllib.util import MLUtils
        >>> from pyspark.mllib.regression import LabeledPoint
        >>> from pyspark.mllib.linalg import Vectors
        >>> examples = [LabeledPoint(1.1, Vectors.sparse(3, [(0, 1.23), (2, 4.56)])), \
                        LabeledPoint(0.0, Vectors.dense([1.01, 2.02, 3.03]))]
        >>> tempFile = NamedTemporaryFile(delete=True)
        >>> tempFile.close()
        >>> MLUtils.saveAsLibSVMFile(sc.parallelize(examples), tempFile.name)
        >>> ''.join(sorted(input(glob(tempFile.name + "/part-0000*"))))
        '0.0 1:1.01 2:2.02 3:3.03\\n1.1 1:1.23 3:4.56\\n'
        """
        lines = data.map(lambda p: MLUtils._convert_labeled_point_to_libsvm(p))
        lines.saveAsTextFile(dir) 
Example #10
Source File: _common.py    From spark-cluster-deployment with Apache License 2.0
def _get_unmangled_double_vector_rdd(data):
    return _get_unmangled_rdd(data, _serialize_double_vector)


# Map a pickled Python RDD of LabeledPoint to a Java RDD of _serialized_labeled_points 
Example #11
Source File: tree.py    From LearningApacheSpark with MIT License
def _train(cls, data, algo, numClasses, categoricalFeaturesInfo, numTrees,
               featureSubsetStrategy, impurity, maxDepth, maxBins, seed):
        first = data.first()
        assert isinstance(first, LabeledPoint), "the data should be RDD of LabeledPoint"
        if featureSubsetStrategy not in cls.supportedFeatureSubsetStrategies:
            raise ValueError("unsupported featureSubsetStrategy: %s" % featureSubsetStrategy)
        if seed is None:
            seed = random.randint(0, 1 << 30)
        model = callMLlibFunc("trainRandomForestModel", data, algo, numClasses,
                              categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity,
                              maxDepth, maxBins, seed)
        return RandomForestModel(model) 
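This internal _train helper is reached through the public RandomForest.trainClassifier / trainRegressor wrappers; a hedged call sketch, assuming an RDD of LabeledPoint named training_data:

from pyspark.mllib.tree import RandomForest

model = RandomForest.trainClassifier(training_data, numClasses=2,
                                     categoricalFeaturesInfo={}, numTrees=3,
                                     featureSubsetStrategy="auto", impurity="gini",
                                     maxDepth=4, maxBins=32, seed=42)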
Example #12
Source File: tree.py    From LearningApacheSpark with MIT License
def _train(cls, data, algo, categoricalFeaturesInfo,
               loss, numIterations, learningRate, maxDepth, maxBins):
        first = data.first()
        assert isinstance(first, LabeledPoint), "the data should be RDD of LabeledPoint"
        model = callMLlibFunc("trainGradientBoostedTreesModel", data, algo, categoricalFeaturesInfo,
                              loss, numIterations, learningRate, maxDepth, maxBins)
        return GradientBoostedTreesModel(model) 
Example #13
Source File: logistic_regression.py    From spark-cluster-deployment with Apache License 2.0
def parsePoint(line):
    values = [float(s) for s in line.split(' ')]
    if values[0] == -1:   # Convert -1 labels to 0 for MLlib
        values[0] = 0
    return LabeledPoint(values[0], values[1:]) 
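A hedged end-to-end sketch pairing this parser with LogisticRegressionWithSGD (assumes an active SparkContext sc and a hypothetical whitespace-delimited file whose first field is a -1/1 label):

from pyspark.mllib.classification import LogisticRegressionWithSGD

points = sc.textFile("data/sample_svm_data.txt").map(parsePoint)  # hypothetical path
model = LogisticRegressionWithSGD.train(points, iterations=100)
print(model.weights)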
Example #14
Source File: util.py    From spark-cluster-deployment with Apache License 2.0
def _convert_labeled_point_to_libsvm(p):
        """Converts a LabeledPoint to a string in LIBSVM format."""
        items = [str(p.label)]
        v = _convert_vector(p.features)
        if type(v) == np.ndarray:
            for i in xrange(len(v)):
                items.append(str(i + 1) + ":" + str(v[i]))
        elif type(v) == SparseVector:
            nnz = len(v.indices)
            for i in xrange(nnz):
                items.append(str(v.indices[i] + 1) + ":" + str(v.values[i]))
        else:
            raise TypeError("_convert_labeled_point_to_libsvm needs either ndarray or SparseVector"
                            " but got " % type(v))
        return " ".join(items) 
Example #15
Source File: _common.py    From spark-cluster-deployment with Apache License 2.0
def _serialize_labeled_point(p):
    """Serialize a LabeledPoint with a features vector of any type."""
    from pyspark.mllib.regression import LabeledPoint
    serialized_features = _serialize_double_vector(p.features)
    header = bytearray(9)
    header[0] = LABELED_POINT_MAGIC
    header_float = ndarray(shape=[1], buffer=header, offset=1, dtype=float64)
    header_float[0] = p.label
    return header + serialized_features 
Example #16
Source File: util.py    From LearningApacheSpark with MIT License
def _convert_labeled_point_to_libsvm(p):
        """Converts a LabeledPoint to a string in LIBSVM format."""
        from pyspark.mllib.regression import LabeledPoint
        assert isinstance(p, LabeledPoint)
        items = [str(p.label)]
        v = _convert_to_vector(p.features)
        if isinstance(v, SparseVector):
            nnz = len(v.indices)
            for i in xrange(nnz):
                items.append(str(v.indices[i] + 1) + ":" + str(v.values[i]))
        else:
            for i in xrange(len(v)):
                items.append(str(i + 1) + ":" + str(v[i]))
        return " ".join(items) 
Example #17
Source File: sparkml_vs_sklearn_solution.py    From intro_ds with Apache License 2.0
def trans2RDD(data, sc):
    """
    Convert the Python-side data into an RDD
    """
    data = sc.parallelize(data)
    data = data.map(lambda line: LabeledPoint(line[0], line[1:]))
    return data 
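A hedged usage sketch with a small numpy array whose first column is the label (assumes an active SparkContext sc):

import numpy as np

data = np.array([[1.0, 0.2, 0.5],
                 [0.0, 1.1, 0.3]])
rdd = trans2RDD(data, sc)
print(rdd.collect())  # [LabeledPoint(1.0, [0.2,0.5]), LabeledPoint(0.0, [1.1,0.3])]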
Example #18
Source File: sparkml_vs_sklearn.py    From intro_ds with Apache License 2.0
def trans2RDD(data, sc):
    """
    Convert the Python-side data into an RDD
    """
    data = sc.parallelize(data)
    data = data.map(lambda line: LabeledPoint(line[0], line[1:]))
    return data 
Example #19
Source File: regression.py    From ferry with Apache License 2.0
def parsePoint(line):
    values = [float(x) for x in line.replace(',', ' ').split(' ')]
    return LabeledPoint(values[0], values[1:]) 
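A hedged regression sketch using this parser with LinearRegressionWithSGD (assumes an active SparkContext sc and a hypothetical file where the first value on each line is the target):

from pyspark.mllib.regression import LinearRegressionWithSGD

parsed = sc.textFile("data/lpsa.data").map(parsePoint)  # hypothetical path
model = LinearRegressionWithSGD.train(parsed, iterations=100, step=0.1)
print(model.intercept, model.weights)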
Example #20
Source File: wines.py    From spark-workshop with MIT License
def parsePoint(line):
    values = next(csv.reader(StringIO(line), delimiter=";"))  # CSV parsing of line
    values = [float(x) for x in values]                       # Cast to all floats
    return LabeledPoint(values[-1], values[:-1])              # y = quality, X = row[:-1] 
Example #21
Source File: logreg_wines.py    From spark-workshop with MIT License
def parsePoint(tup):
    """
    Parse text data into floats.
    Return a LabeledPoint of (label, features).
    """
    values = [float(x) for x in tup[1].split(';')]
    return LabeledPoint(tup[0], values[1:]) 
Example #22
Source File: feature.py    From LearningApacheSpark with MIT License
def fit(self, data):
        """
        Returns a ChiSquared feature selector model fit to the given data.

        :param data: an `RDD[LabeledPoint]` containing the labeled dataset
                     with categorical features. Real-valued features will be
                     treated as categorical for each distinct value.
                     Apply feature discretizer before using this function.
        """
        jmodel = callMLlibFunc("fitChiSqSelector", self.selectorType, self.numTopFeatures,
                               self.percentile, self.fpr, self.fdr, self.fwe, data)
        return ChiSqSelectorModel(jmodel) 
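A hedged usage sketch for this fit method on a tiny labeled dataset with categorical-valued features (assumes an active SparkContext sc):

from pyspark.mllib.feature import ChiSqSelector
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint

data = sc.parallelize([
    LabeledPoint(0.0, Vectors.dense([0.0, 1.0, 3.0])),
    LabeledPoint(1.0, Vectors.dense([1.0, 0.0, 8.0])),
    LabeledPoint(1.0, Vectors.dense([2.0, 9.0, 6.0])),
])
model = ChiSqSelector(numTopFeatures=1).fit(data)
filtered = model.transform(data.map(lambda lp: lp.features))
print(filtered.collect())  # one selected feature per point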
Example #23
Source File: wines.py    From Data_Analytics_with_Hadoop with MIT License
def parsePoint(line):
    values = next(csv.reader(StringIO(line), delimiter=";"))  # CSV parsing of line
    values = [float(x) for x in values]                       # Cast to all floats
    return LabeledPoint(values[-1], values[:-1])              # y = quality, X = row[:-1] 
Example #24
Source File: tests.py    From LearningApacheSpark with MIT License
def test_right_number_of_results(self):
        num_cols = 1001
        sparse_data = [
            LabeledPoint(0.0, Vectors.sparse(num_cols, [(100, 2.0)])),
            LabeledPoint(0.1, Vectors.sparse(num_cols, [(200, 1.0)]))
        ]
        chi = Statistics.chiSqTest(self.sc.parallelize(sparse_data))
        self.assertEqual(len(chi), num_cols)
        self.assertIsNotNone(chi[1000]) 
Example #25
Source File: tests.py    From LearningApacheSpark with MIT License
def test_regression(self):
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree
        data = [
            LabeledPoint(-1.0, self.scipy_matrix(2, {1: -1.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
            LabeledPoint(-1.0, self.scipy_matrix(2, {1: -2.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        lasso_model = LassoWithSGD.train(rdd)
        self.assertTrue(lasso_model.predict(features[0]) <= 0)
        self.assertTrue(lasso_model.predict(features[1]) > 0)
        self.assertTrue(lasso_model.predict(features[2]) <= 0)
        self.assertTrue(lasso_model.predict(features[3]) > 0)

        rr_model = RidgeRegressionWithSGD.train(rdd)
        self.assertTrue(rr_model.predict(features[0]) <= 0)
        self.assertTrue(rr_model.predict(features[1]) > 0)
        self.assertTrue(rr_model.predict(features[2]) <= 0)
        self.assertTrue(rr_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        dt_model = DecisionTree.trainRegressor(rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0) 
Example #26
Source File: tests.py    From LearningApacheSpark with MIT License
def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree
        data = [
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 1.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 2.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features for p in data]

        lr_model = LogisticRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(rdd, numClasses=2,
                                                categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0) 
Example #27
Source File: tests.py    From LearningApacheSpark with MIT License
def test_infer_schema(self):
        rdd = self.sc.parallelize([LabeledPoint(1.0, self.dv1), LabeledPoint(0.0, self.sv1)])
        df = rdd.toDF()
        schema = df.schema
        field = [f for f in schema.fields if f.name == "features"][0]
        self.assertEqual(field.dataType, self.udt)
        vectors = df.rdd.map(lambda p: p.features).collect()
        self.assertEqual(len(vectors), 2)
        for v in vectors:
            if isinstance(v, SparseVector):
                self.assertEqual(v, self.sv1)
            elif isinstance(v, DenseVector):
                self.assertEqual(v, self.dv1)
            else:
                raise TypeError("expecting a vector but got %r of type %r" % (v, type(v))) 
Example #28
Source File: GradientBoostedTrees.py    From Hanhan-Spark-Python with MIT License
def get_lp(t):
    rating = t[1][0]
    avg_features = t[1][1]
    return LabeledPoint(rating, avg_features) 
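A hedged sketch of the pair-RDD shape this helper expects, e.g. the result of joining ratings with averaged feature vectors (assumes an active SparkContext sc; the records are hypothetical):

from pyspark.mllib.linalg import Vectors

pairs = sc.parallelize([
    ("review_1", (4.0, Vectors.dense([0.1, 0.7]))),
    ("review_2", (2.0, Vectors.dense([0.9, 0.3]))),
])
lp_rdd = pairs.map(get_lp)  # RDD of LabeledPoint(rating, features)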
Example #29
Source File: word2vec_histogram_best_RMSE.py    From Hanhan-Spark-Python with MIT License
def get_lp(t):
    rating = t[0]
    sp = t[1]
    return LabeledPoint(rating, sp) 
Example #30
Source File: word2vec_best_RMSE.py    From Hanhan-Spark-Python with MIT License
def get_lp(t):
    rating = t[1][0]
    avg_features = t[1][1]
    return LabeledPoint(rating, avg_features)