Python pyspark.mllib.regression.LabeledPoint() Examples
The following are 30 code examples of pyspark.mllib.regression.LabeledPoint().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module pyspark.mllib.regression, or try the search function.
Example #1
Source File: util.py From LearningApacheSpark with MIT License | 6 votes |
def loadLabeledPoints(sc, path, minPartitions=None):
    """
    Load labeled points that were saved using RDD.saveAsTextFile.

    :param sc: Spark context
    :param path: file or directory path in any Hadoop-supported file
                 system URI
    :param minPartitions: min number of partitions; when it is falsy
                          (e.g. None), ``min(sc.defaultParallelism, 2)``
                          is used instead
    :return: labeled data stored as an RDD of LabeledPoint

    >>> from tempfile import NamedTemporaryFile
    >>> from pyspark.mllib.util import MLUtils
    >>> from pyspark.mllib.regression import LabeledPoint
    >>> examples = [LabeledPoint(1.1, Vectors.sparse(3, [(0, -1.23), (2, 4.56e-7)])),
    ...             LabeledPoint(0.0, Vectors.dense([1.01, 2.02, 3.03]))]
    >>> tempFile = NamedTemporaryFile(delete=True)
    >>> tempFile.close()
    >>> sc.parallelize(examples, 1).saveAsTextFile(tempFile.name)
    >>> MLUtils.loadLabeledPoints(sc, tempFile.name).collect()
    [LabeledPoint(1.1, (3,[0,2],[-1.23,4.56e-07])), LabeledPoint(0.0, [1.01,2.02,3.03])]
    """
    # Only consult the context's default parallelism when the caller did not
    # supply a usable partition count.
    if not minPartitions:
        minPartitions = min(sc.defaultParallelism, 2)
    return callMLlibFunc("loadLabeledPoints", sc, path, minPartitions)
Example #2
Source File: util.py From LearningApacheSpark with MIT License | 6 votes |
def saveAsLibSVMFile(data, dir):
    """
    Save labeled data in LIBSVM format.

    Each element of ``data`` is converted to a text line with
    ``MLUtils._convert_labeled_point_to_libsvm`` and the resulting RDD is
    written with ``saveAsTextFile``.

    :param data: an RDD of LabeledPoint to be saved
    :param dir: directory to save the data

    >>> from tempfile import NamedTemporaryFile
    >>> from fileinput import input
    >>> from pyspark.mllib.regression import LabeledPoint
    >>> from glob import glob
    >>> from pyspark.mllib.util import MLUtils
    >>> examples = [LabeledPoint(1.1, Vectors.sparse(3, [(0, 1.23), (2, 4.56)])),
    ...             LabeledPoint(0.0, Vectors.dense([1.01, 2.02, 3.03]))]
    >>> tempFile = NamedTemporaryFile(delete=True)
    >>> tempFile.close()
    >>> MLUtils.saveAsLibSVMFile(sc.parallelize(examples), tempFile.name)
    >>> ''.join(sorted(input(glob(tempFile.name + "/part-0000*"))))
    '0.0 1:1.01 2:2.02 3:3.03\\n1.1 1:1.23 3:4.56\\n'
    """
    def to_libsvm_line(point):
        return MLUtils._convert_labeled_point_to_libsvm(point)

    data.map(to_libsvm_line).saveAsTextFile(dir)
Example #3
Source File: rdd_utils.py From elephas with MIT License | 6 votes |
def to_labeled_point(sc, features, labels, categorical=False):
    """Build a LabeledPoint RDD from paired feature and label arrays.

    :param sc: Spark context
    :param features: numpy array with features
    :param labels: numpy array with labels
    :param categorical: boolean; when true, each label is reduced to the
        index of its largest entry (``np.argmax``) before being stored
    :return: LabeledPoint RDD with features and labels
    """
    def make_point(feature_row, label):
        target = np.argmax(label) if categorical else label
        return LabeledPoint(target, to_vector(feature_row))

    points = [make_point(x, y) for x, y in zip(features, labels)]
    return sc.parallelize(points)
Example #4
Source File: rdd_utils.py From elephas with MIT License | 6 votes |
def from_labeled_point(rdd, categorical=False, nb_classes=None):
    """Collect a LabeledPoint RDD into a pair of numpy arrays.

    :param rdd: LabeledPoint RDD
    :param categorical: boolean, if labels should be one-hot encoded when
        returned
    :param nb_classes: optional int, number of class labels; when falsy it
        is taken as ``max(label) + 1`` over the collected labels
    :return: pair of numpy arrays, features and labels
    """
    feature_rows = rdd.map(lambda lp: from_vector(lp.features)).collect()
    features = np.asarray(feature_rows)
    labels = np.asarray(rdd.map(lambda lp: lp.label).collect(), dtype='int32')

    if not categorical:
        return features, labels

    if not nb_classes:
        nb_classes = np.max(labels) + 1

    # One row per sample with a single 1.0 in the column given by its label.
    one_hot = np.zeros((len(labels), nb_classes))
    one_hot[np.arange(len(labels)), labels] = 1.
    return features, one_hot
Example #5
Source File: rdd_utils.py From elephas with MIT License | 6 votes |
def lp_to_simple_rdd(lp_rdd, categorical=False, nb_classes=None):
    """Map a LabeledPoint RDD to an RDD of (features, label) pairs.

    :param lp_rdd: LabeledPoint RDD of features and labels
    :param categorical: boolean, if labels should be passed through
        ``encode_label`` when returned
    :param nb_classes: int, number of total classes; when falsy and
        ``categorical`` is set, it is computed as ``max(label) + 1`` by
        collecting the labels
    :return: Spark RDD with feature-label pairs
    """
    if not categorical:
        return lp_rdd.map(lambda lp: (from_vector(lp.features), lp.label))

    if not nb_classes:
        collected = lp_rdd.map(lambda lp: lp.label).collect()
        nb_classes = np.max(np.asarray(collected, dtype='int32')) + 1

    return lp_rdd.map(
        lambda lp: (from_vector(lp.features), encode_label(lp.label, nb_classes)))
Example #6
Source File: tfidf_cv_lowestRMSE.py From Hanhan-Spark-Python with MIT License | 6 votes |
def parse_point(line):
    """Parse a textual ``(label, SparseVector(size, {idx: val, ...}))`` record.

    :param line: string expected to contain a fragment of the form
        ``(<label>, SparseVector(<size>, {<idx>: <val>, ...}))``
    :return: a LabeledPoint built from the parsed label and sparse vector,
        or None when the line does not contain such a fragment or the
        vector body cannot be parsed
    """
    # Raw strings keep the regex escapes (\(, \d, \s) from being treated as
    # invalid string escape sequences; the pattern text itself is unchanged.
    record_pattern = r"\(([\d\.]*),\sSparseVector\((.*?)\)\)"
    vector_pattern = r"(\d+),\s+\{(.*?)\}"

    record = re.search(record_pattern, line)
    if record is None:
        return None

    label = float(record.group(1))
    vector = re.search(vector_pattern, record.group(2))
    if vector is None:
        # The outer record matched but the vector body is not
        # "<size>, {...}"; treat it like any other unparsable line.
        return None

    size = int(vector.group(1))
    entries = vector.group(2)

    indices = []
    tfidf_scores = []
    if entries != '':
        for entry in entries.split(', '):
            parts = entry.split(': ')
            indices.append(int(parts[0]))
            tfidf_scores.append(float(parts[1]))

    return LabeledPoint(label, SparseVector(size, indices, tfidf_scores))


# Find the best step_size through cross validation, using RMSE as the error measurement
Example #7
Source File: tests.py From LearningApacheSpark with MIT License | 6 votes |
def test_chi_sq_pearson(self):
    """Check chiSqTest results per feature for several partition counts."""
    rows = [
        (0.0, [0.5, 10.0]),
        (0.0, [1.5, 20.0]),
        (1.0, [1.5, 30.0]),
        (0.0, [3.5, 30.0]),
        (0.0, [3.5, 40.0]),
        (1.0, [3.5, 40.0]),
    ]
    data = [LabeledPoint(label, Vectors.dense(values)) for label, values in rows]

    # Expected (statistic, degrees of freedom, p-value) for feature 0 and 1.
    expected = [(0.75, 2, 0.6873), (1.5, 3, 0.6823)]

    for numParts in [2, 4, 6, 8]:
        chi = Statistics.chiSqTest(self.sc.parallelize(data, numParts))
        for index, (statistic, dof, p_value) in enumerate(expected):
            result = chi[index]
            self.assertEqual(result.statistic, statistic)
            self.assertEqual(result.degreesOfFreedom, dof)
            self.assertAlmostEqual(result.pValue, p_value, 4)
Example #8
Source File: tfidf_cv_lowestRMSE_normalized.py From Hanhan-Spark-Python with MIT License | 6 votes |
def parse_point(line):
    """Parse a textual ``(label, SparseVector(size, {idx: val, ...}))`` record.

    :param line: string expected to contain a fragment of the form
        ``(<label>, SparseVector(<size>, {<idx>: <val>, ...}))``
    :return: a LabeledPoint built from the parsed label and sparse vector,
        or None when the line does not contain such a fragment or the
        vector body cannot be parsed
    """
    # Raw strings keep the regex escapes (\(, \d, \s) from being treated as
    # invalid string escape sequences; the pattern text itself is unchanged.
    record_pattern = r"\(([\d\.]*),\sSparseVector\((.*?)\)\)"
    vector_pattern = r"(\d+),\s+\{(.*?)\}"

    record = re.search(record_pattern, line)
    if record is None:
        return None

    label = float(record.group(1))
    vector = re.search(vector_pattern, record.group(2))
    if vector is None:
        # The outer record matched but the vector body is not
        # "<size>, {...}"; treat it like any other unparsable line.
        return None

    size = int(vector.group(1))
    entries = vector.group(2)

    indices = []
    tfidf_scores = []
    if entries != '':
        for entry in entries.split(', '):
            parts = entry.split(': ')
            indices.append(int(parts[0]))
            tfidf_scores.append(float(parts[1]))

    return LabeledPoint(label, SparseVector(size, indices, tfidf_scores))


# Find the best step_size through cross validation, using RMSE as the error measurement
Example #9
Source File: util.py From spark-cluster-deployment with Apache License 2.0 | 6 votes |
def saveAsLibSVMFile(data, dir):
    """
    Save labeled data in LIBSVM format.

    Every element of ``data`` is turned into one text line by
    ``MLUtils._convert_labeled_point_to_libsvm`` before the RDD is written
    out with ``saveAsTextFile``.

    @param data: an RDD of LabeledPoint to be saved
    @param dir: directory to save the data

    >>> from tempfile import NamedTemporaryFile
    >>> from fileinput import input
    >>> from glob import glob
    >>> from pyspark.mllib.util import MLUtils
    >>> examples = [LabeledPoint(1.1, Vectors.sparse(3, [(0, 1.23), (2, 4.56)])), \
                    LabeledPoint(0.0, Vectors.dense([1.01, 2.02, 3.03]))]
    >>> tempFile = NamedTemporaryFile(delete=True)
    >>> tempFile.close()
    >>> MLUtils.saveAsLibSVMFile(sc.parallelize(examples), tempFile.name)
    >>> ''.join(sorted(input(glob(tempFile.name + "/part-0000*"))))
    '0.0 1:1.01 2:2.02 3:3.03\\n1.1 1:1.23 3:4.56\\n'
    """
    def as_libsvm(labeled_point):
        return MLUtils._convert_labeled_point_to_libsvm(labeled_point)

    libsvm_lines = data.map(as_libsvm)
    libsvm_lines.saveAsTextFile(dir)
Example #10
Source File: _common.py From spark-cluster-deployment with Apache License 2.0 | 5 votes |
def _get_unmangled_double_vector_rdd(data):
    """Call ``_get_unmangled_rdd`` on ``data`` with ``_serialize_double_vector``.

    :param data: value forwarded unchanged as the first argument
    :return: whatever ``_get_unmangled_rdd`` returns
    """
    serializer = _serialize_double_vector
    return _get_unmangled_rdd(data, serializer)


# Map a pickled Python RDD of LabeledPoint to a Java RDD of _serialized_labeled_points
Example #11
Source File: tree.py From LearningApacheSpark with MIT License | 5 votes |
def _train(cls, data, algo, numClasses, categoricalFeaturesInfo, numTrees,
           featureSubsetStrategy, impurity, maxDepth, maxBins, seed):
    """Validate the inputs and call "trainRandomForestModel" via callMLlibFunc.

    :param data: RDD whose first element must be a LabeledPoint
    :param featureSubsetStrategy: must be one of
        ``cls.supportedFeatureSubsetStrategies``
    :param seed: when None, a random integer in [0, 2**30] is drawn
    :return: a RandomForestModel wrapping the result of the call
    :raises ValueError: if ``featureSubsetStrategy`` is not supported

    The remaining arguments are forwarded unchanged.
    """
    sample = data.first()
    assert isinstance(sample, LabeledPoint), "the data should be RDD of LabeledPoint"

    strategy_supported = featureSubsetStrategy in cls.supportedFeatureSubsetStrategies
    if not strategy_supported:
        raise ValueError("unsupported featureSubsetStrategy: %s" % featureSubsetStrategy)

    seed = random.randint(0, 1 << 30) if seed is None else seed

    trained = callMLlibFunc(
        "trainRandomForestModel", data, algo, numClasses,
        categoricalFeaturesInfo, numTrees, featureSubsetStrategy,
        impurity, maxDepth, maxBins, seed)
    return RandomForestModel(trained)
Example #12
Source File: tree.py From LearningApacheSpark with MIT License | 5 votes |
def _train(cls, data, algo, categoricalFeaturesInfo, loss, numIterations,
           learningRate, maxDepth, maxBins):
    """Check the input and call "trainGradientBoostedTreesModel" via callMLlibFunc.

    :param data: RDD whose first element must be a LabeledPoint
    :return: a GradientBoostedTreesModel wrapping the result of the call

    The remaining arguments are forwarded unchanged.
    """
    sample = data.first()
    assert isinstance(sample, LabeledPoint), "the data should be RDD of LabeledPoint"

    trained = callMLlibFunc(
        "trainGradientBoostedTreesModel", data, algo,
        categoricalFeaturesInfo, loss, numIterations, learningRate,
        maxDepth, maxBins)
    return GradientBoostedTreesModel(trained)
Example #13
Source File: logistic_regression.py From spark-cluster-deployment with Apache License 2.0 | 5 votes |
def parsePoint(line):
    """Parse a space-separated line of numbers into a LabeledPoint.

    The first number is the label and the rest are the features. A label of
    -1 is replaced by 0 before the point is built.
    """
    numbers = list(map(float, line.split(' ')))
    label = numbers[0]
    if label == -1:
        # Convert -1 labels to 0 for MLlib
        label = 0
    return LabeledPoint(label, numbers[1:])
Example #14
Source File: util.py From spark-cluster-deployment with Apache License 2.0 | 5 votes |
def _convert_labeled_point_to_libsvm(p):
    """Converts a LabeledPoint to a string in LIBSVM format.

    The label comes first, followed by ``index:value`` items separated by
    single spaces, with indices written one-based.

    :param p: object with ``label`` and ``features`` attributes; the
        features are passed through ``_convert_vector`` first
    :return: the LIBSVM-formatted line as a string
    :raises TypeError: if the converted features are neither an ndarray
        nor a SparseVector
    """
    items = [str(p.label)]
    v = _convert_vector(p.features)
    if isinstance(v, np.ndarray):
        # Dense case: every position is written out.
        items.extend(str(i + 1) + ":" + str(value) for i, value in enumerate(v))
    elif isinstance(v, SparseVector):
        # Sparse case: only the stored (index, value) pairs are written.
        items.extend(str(index + 1) + ":" + str(value)
                     for index, value in zip(v.indices, v.values))
    else:
        # The original message had no %s placeholder, so formatting it
        # failed before the intended text could be produced.
        raise TypeError("_convert_labeled_point_to_libsvm needs either ndarray or SparseVector"
                        " but got %s" % (type(v),))
    return " ".join(items)
Example #15
Source File: _common.py From spark-cluster-deployment with Apache License 2.0 | 5 votes |
def _serialize_labeled_point(p):
    """Serialize a LabeledPoint with a features vector of any type.

    The result is a 9-byte header followed by the output of
    ``_serialize_double_vector(p.features)``. Header byte 0 holds
    ``LABELED_POINT_MAGIC``; the remaining 8 bytes hold ``p.label`` written
    as a float64.
    """
    from pyspark.mllib.regression import LabeledPoint
    features_bytes = _serialize_double_vector(p.features)

    prefix = bytearray(9)
    prefix[0] = LABELED_POINT_MAGIC
    # View bytes 1..8 of the header as a single float64 so the label can be
    # written into the buffer in place.
    label_slot = ndarray(shape=[1], buffer=prefix, offset=1, dtype=float64)
    label_slot[0] = p.label

    return prefix + features_bytes
Example #16
Source File: util.py From LearningApacheSpark with MIT License | 5 votes |
def _convert_labeled_point_to_libsvm(p):
    """Converts a LabeledPoint to a string in LIBSVM format.

    The label comes first, followed by space-separated ``index:value``
    items with one-based indices. For a SparseVector only the stored
    entries are written; otherwise every position is written.
    """
    from pyspark.mllib.regression import LabeledPoint
    assert isinstance(p, LabeledPoint)

    fields = [str(p.label)]
    v = _convert_to_vector(p.features)
    if isinstance(v, SparseVector):
        pairs = [(v.indices[k], v.values[k]) for k in xrange(len(v.indices))]
    else:
        pairs = [(k, v[k]) for k in xrange(len(v))]

    fields.extend(str(index + 1) + ":" + str(value) for index, value in pairs)
    return " ".join(fields)
Example #17
Source File: sparkml_vs_sklearn_solution.py From intro_ds with Apache License 2.0 | 5 votes |
def trans2RDD(data, sc):
    """
    Convert data held in Python into an RDD.

    ``data`` is parallelized with ``sc`` and every row is mapped to a
    LabeledPoint whose label is the row's first element and whose features
    are the remaining elements.
    """
    def to_point(row):
        return LabeledPoint(row[0], row[1:])

    return sc.parallelize(data).map(to_point)
Example #18
Source File: sparkml_vs_sklearn.py From intro_ds with Apache License 2.0 | 5 votes |
def trans2RDD(data, sc):
    """
    Convert data held in Python into an RDD.

    ``data`` is parallelized with ``sc`` and every row is mapped to a
    LabeledPoint whose label is the row's first element and whose features
    are the remaining elements.
    """
    def to_point(row):
        return LabeledPoint(row[0], row[1:])

    distributed = sc.parallelize(data)
    return distributed.map(to_point)
Example #19
Source File: regression.py From ferry with Apache License 2.0 | 5 votes |
def parsePoint(line):
    """Parse a comma- or space-separated line of numbers into a LabeledPoint.

    Commas are replaced by spaces before splitting on single spaces. The
    first number becomes the label and the rest become the features.
    """
    tokens = line.replace(',', ' ').split(' ')
    numbers = list(map(float, tokens))
    return LabeledPoint(numbers[0], numbers[1:])
Example #20
Source File: wines.py From spark-workshop with MIT License | 5 votes |
def parsePoint(line):
    """Parse one semicolon-delimited CSV line into a LabeledPoint.

    Every field is cast to float. The last field is the label
    (y = quality) and all preceding fields are the features.

    :param line: a single line of text in ``;``-delimited CSV form
    :return: LabeledPoint(last value, all values but the last)
    """
    # Built-in next() works on Python 2.6+ and Python 3; the .next() method
    # exists only on Python 2 iterators.
    row = next(csv.reader(StringIO(line), delimiter=";"))  # CSV parsing of line
    values = [float(x) for x in row]                       # Cast to all floats
    return LabeledPoint(values[-1], values[:-1])           # y = quality, X = row[:-1]
Example #21
Source File: logreg_wines.py From spark-workshop with MIT License | 5 votes |
def parsePoint(tup):
    """
    Parse text data into floats and build a LabeledPoint.

    ``tup[1]`` is split on ';' and each piece is cast to float. The label
    is ``tup[0]``; the features are the parsed values with the first one
    left out.
    """
    numbers = list(map(float, tup[1].split(';')))
    return LabeledPoint(tup[0], numbers[1:])
Example #22
Source File: feature.py From LearningApacheSpark with MIT License | 5 votes |
def fit(self, data):
    """
    Returns a ChiSquared feature selector.

    :param data: an `RDD[LabeledPoint]` containing the labeled dataset
                 with categorical features. Real-valued features will be
                 treated as categorical for each distinct value.
                 Apply feature discretizer before using this function.
    :return: a ChiSqSelectorModel wrapping the result of the
             "fitChiSqSelector" call
    """
    selector_settings = (self.selectorType, self.numTopFeatures, self.percentile,
                         self.fpr, self.fdr, self.fwe)
    call_args = selector_settings + (data,)
    return ChiSqSelectorModel(callMLlibFunc("fitChiSqSelector", *call_args))
Example #23
Source File: wines.py From Data_Analytics_with_Hadoop with MIT License | 5 votes |
def parsePoint(line):
    """Parse one semicolon-delimited CSV line into a LabeledPoint.

    Every field is cast to float. The last field is the label
    (y = quality) and all preceding fields are the features.

    :param line: a single line of text in ``;``-delimited CSV form
    :return: LabeledPoint(last value, all values but the last)
    """
    # Built-in next() works on Python 2.6+ and Python 3; the .next() method
    # exists only on Python 2 iterators.
    row = next(csv.reader(StringIO(line), delimiter=";"))  # CSV parsing of line
    values = [float(x) for x in row]                       # Cast to all floats
    return LabeledPoint(values[-1], values[:-1])           # y = quality, X = row[:-1]
Example #24
Source File: tests.py From LearningApacheSpark with MIT License | 5 votes |
def test_right_number_of_results(self):
    """chiSqTest on sparse data returns one result per column."""
    num_cols = 1001
    # (label, index of the single non-zero entry, its value)
    entries = [(0.0, 100, 2.0), (0.1, 200, 1.0)]
    sparse_data = [
        LabeledPoint(label, Vectors.sparse(num_cols, [(index, value)]))
        for label, index, value in entries
    ]
    chi = Statistics.chiSqTest(self.sc.parallelize(sparse_data))
    self.assertEqual(len(chi), num_cols)
    self.assertIsNotNone(chi[1000])
Example #25
Source File: tests.py From LearningApacheSpark with MIT License | 5 votes |
def test_regression(self):
    """Train several regressors on scipy-matrix features and check prediction signs."""
    from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
        RidgeRegressionWithSGD
    from pyspark.mllib.tree import DecisionTree

    # (label, value stored at index 1 of the 2-element feature matrix)
    samples = [(-1.0, -1.0), (1.0, 1.0), (-1.0, -2.0), (1.0, 2.0)]
    data = [LabeledPoint(label, self.scipy_matrix(2, {1: value}))
            for label, value in samples]
    rdd = self.sc.parallelize(data)
    features = [p.features for p in data]

    def check_predictions(model):
        # Even-indexed samples must predict <= 0 and odd-indexed ones > 0.
        for index, vector in enumerate(features):
            prediction = model.predict(vector)
            if index % 2 == 0:
                self.assertTrue(prediction <= 0)
            else:
                self.assertTrue(prediction > 0)

    for trainer in (LinearRegressionWithSGD, LassoWithSGD, RidgeRegressionWithSGD):
        check_predictions(trainer.train(rdd))

    categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
    dt_model = DecisionTree.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
    check_predictions(dt_model)
Example #26
Source File: tests.py From LearningApacheSpark with MIT License | 5 votes |
def test_classification(self):
    """Train several classifiers on scipy-matrix features and check predictions."""
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import DecisionTree

    # (label, index of the non-zero feature, its value)
    samples = [(0.0, 0, 1.0), (1.0, 1, 1.0), (0.0, 0, 2.0), (1.0, 1, 2.0)]
    data = [LabeledPoint(label, self.scipy_matrix(2, {index: value}))
            for label, index, value in samples]
    rdd = self.sc.parallelize(data)
    features = [p.features for p in data]

    def check_predictions(model):
        # Even-indexed samples must predict <= 0 and odd-indexed ones > 0.
        for position, vector in enumerate(features):
            prediction = model.predict(vector)
            if position % 2 == 0:
                self.assertTrue(prediction <= 0)
            else:
                self.assertTrue(prediction > 0)

    for trainer in (LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes):
        check_predictions(trainer.train(rdd))

    categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
    dt_model = DecisionTree.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo)
    check_predictions(dt_model)
Example #27
Source File: tests.py From LearningApacheSpark with MIT License | 5 votes |
def test_infer_schema(self):
    """toDF() on LabeledPoints yields a "features" column typed as self.udt."""
    points = [LabeledPoint(1.0, self.dv1), LabeledPoint(0.0, self.sv1)]
    df = self.sc.parallelize(points).toDF()

    features_fields = [f for f in df.schema.fields if f.name == "features"]
    self.assertEqual(features_fields[0].dataType, self.udt)

    vectors = df.rdd.map(lambda row: row.features).collect()
    self.assertEqual(len(vectors), 2)
    for v in vectors:
        # Each vector must come back equal to the fixture of the same kind.
        if isinstance(v, SparseVector):
            expected = self.sv1
        elif isinstance(v, DenseVector):
            expected = self.dv1
        else:
            raise TypeError("expecting a vector but got %r of type %r" % (v, type(v)))
        self.assertEqual(v, expected)
Example #28
Source File: GradientBoostedTrees.py From Hanhan-Spark-Python with MIT License | 5 votes |
def get_lp(t):
    """Build a LabeledPoint from ``t[1]``: label ``t[1][0]``, features ``t[1][1]``."""
    payload = t[1]
    return LabeledPoint(payload[0], payload[1])
Example #29
Source File: word2vec_histogram_best_RMSE.py From Hanhan-Spark-Python with MIT License | 5 votes |
def get_lp(t):
    """Build a LabeledPoint with label ``t[0]`` and features ``t[1]``."""
    label, features = t[0], t[1]
    return LabeledPoint(label, features)
Example #30
Source File: word2vec_best_RMSE.py From Hanhan-Spark-Python with MIT License | 5 votes |
def get_lp(t):
    """Build a LabeledPoint from ``t[1]``: label ``t[1][0]``, features ``t[1][1]``."""
    label_and_features = t[1]
    return LabeledPoint(label_and_features[0], label_and_features[1])