Python pyspark.RDD Examples
The following are 30 code examples of pyspark.RDD(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pyspark, or try the search function.
Example #1
Source File: _common.py From spark-cluster-deployment with Apache License 2.0 | 6 votes |
def _linear_predictor_typecheck(x, coeffs):
    """Check that x is a one-dimensional vector of the right shape.
    This is a temporary hackaround until we actually implement bulk predict.
    """
    x = _convert_vector(x)
    if type(x) == ndarray:
        if x.ndim == 1:
            if x.shape != coeffs.shape:
                raise RuntimeError("Got array of %d elements; wanted %d" % (
                    numpy.shape(x)[0], coeffs.shape[0]))
        else:
            raise RuntimeError("Bulk predict not yet supported.")
    elif type(x) == SparseVector:
        if x.size != coeffs.shape[0]:
            raise RuntimeError("Got sparse vector of size %d; wanted %d" % (
                x.size, coeffs.shape[0]))
    elif type(x) == RDD:
        raise RuntimeError("Bulk predict not yet supported.")
    else:
        raise TypeError("Argument of type " + type(x).__name__ + " unsupported")

# If we weren't given initial weights, take a zero vector of the appropriate
# length.
Example #2
Source File: sys_exec.py From cadCAD with MIT License | 6 votes |
def to_spark_df(rdd: RDD, spark: SparkSession, init_condition: dict = None):
    # Typefull
    if init_condition is not None:
        return to_spark(rdd, init_condition)
    # Typeless
    else:
        spark.conf.set("spark.sql.execution.arrow.enabled", "true")
        spark.conf.set("spark.sql.execution.arrow.fallback.enabled", "true")
        warnings.simplefilter(action='ignore', category=UserWarning)
        pdf_from_rdd: DataFrame = to_pandas(rdd)
        result = spark.createDataFrame(pdf_from_rdd)
        del pdf_from_rdd
        return result
Example #3
Source File: transform.py From search-MjoLniR with MIT License | 6 votes |
def partition_per_row(rdd: RDD) -> RDD:
    """Place each row in an RDD into a separate partition.

    Only useful if that row represents something large to be computed over,
    perhaps an external resource such as a multi-gb training dataset. The
    spark part of the dataset is expected to be tiny and easily fit in a
    single partition.
    """
    num_rows = rdd.count()
    # Help out mypy. Also don't use `identity`, as it somehow fails serialization
    partition_fn = cast(Callable[[int], int], lambda x: x)
    return (
        # bring everything together and assign each row a partition id
        rdd.repartition(1)
        .mapPartitions(lambda rows: enumerate(rows))
        # Partition by the new partition_id
        .partitionBy(num_rows, partition_fn)
        # Drop the partition id, giving back the original shape
        .map(lambda pair: pair[1])
    )

# Shared joins
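A minimal usage sketch, assuming a live SparkContext named sc and the function above in scope; glom() groups each partition's contents so the one-row-per-partition layout is visible:

rdd = sc.parallelize(range(4))
per_row = partition_per_row(rdd)
assert per_row.getNumPartitions() == 4
print(per_row.glom().collect())  # e.g. [[0], [1], [2], [3]]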
Example #4
Source File: spark.py From snorkel with Apache License 2.0 | 6 votes |
def apply(self, data_points: RDD, fault_tolerant: bool = False) -> np.ndarray:
    """Label PySpark RDD of data points with LFs.

    Parameters
    ----------
    data_points
        PySpark RDD containing data points to be labeled by LFs
    fault_tolerant
        Output ``-1`` if LF execution fails?

    Returns
    -------
    np.ndarray
        Matrix of labels emitted by LFs
    """
    f_caller = _FunctionCaller(fault_tolerant)

    def map_fn(args: Tuple[DataPoint, int]) -> RowData:
        return apply_lfs_to_data_point(*args, lfs=self._lfs, f_caller=f_caller)

    labels = data_points.zipWithIndex().map(map_fn).collect()
    return self._numpy_from_row_data(labels)
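This method is normally reached through Snorkel's SparkLFApplier; a minimal sketch, assuming a live SparkContext sc and the module path used in snorkel 0.9:

from snorkel.labeling import labeling_function
from snorkel.labeling.apply.spark import SparkLFApplier

@labeling_function()
def lf_positive(x):
    # Emit label 1 for positive numbers, otherwise abstain (-1)
    return 1 if x > 0 else -1

applier = SparkLFApplier([lf_positive])
rdd = sc.parallelize([3, -2, 7])
L = applier.apply(rdd)  # label matrix of shape (3, 1)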
Example #5
Source File: wikidata.py From qb with MIT License | 6 votes |
def clean_claims(claims: RDD, b_item_map: Broadcast):
    def clean(claim):
        item_map = b_item_map.value
        if claim.datatype == 'wikibase-item':
            if claim.object in item_map:
                claim = claim._replace(object=item_map[claim.object])
                return claim
            else:
                return None
        elif claim.datatype == 'quantity':
            unit = claim.object.unit
            unit = unit.split('/')[-1]
            if unit in item_map:
                claim = claim._replace(object=item_map[unit])
                return claim
            else:
                return None
        return claim

    dt_filter = {'wikibase-item', 'string', 'monolingualtext', 'quantity', 'time'}
    return claims.filter(lambda c: c.datatype in dt_filter).map(clean).filter(lambda c: c is not None)
Example #6
Source File: common.py From LearningApacheSpark with MIT License | 6 votes |
def _py2java(sc, obj):
    """ Convert Python object into Java """
    if isinstance(obj, RDD):
        obj = _to_java_object_rdd(obj)
    elif isinstance(obj, DataFrame):
        obj = obj._jdf
    elif isinstance(obj, SparkContext):
        obj = obj._jsc
    elif isinstance(obj, list):
        obj = [_py2java(sc, x) for x in obj]
    elif isinstance(obj, JavaObject):
        pass
    elif isinstance(obj, (int, long, float, bool, bytes, unicode)):
        pass
    else:
        data = bytearray(PickleSerializer().dumps(obj))
        obj = sc._jvm.org.apache.spark.ml.python.MLSerDe.loads(data)
    return obj
Example #7
Source File: classification.py From LearningApacheSpark with MIT License | 6 votes |
def train(cls, data, lambda_=1.0):
    """
    Train a Naive Bayes model given an RDD of (label, features)
    vectors.

    This is the Multinomial NB (U{http://tinyurl.com/lsdw6p}) which
    can handle all kinds of discrete data. For example, by converting
    documents into TF-IDF vectors, it can be used for document
    classification. By making every vector a 0-1 vector, it can also
    be used as Bernoulli NB (U{http://tinyurl.com/p7c96j6}). The
    input feature values must be nonnegative.

    :param data:
      RDD of LabeledPoint.
    :param lambda_:
      The smoothing parameter.
      (default: 1.0)
    """
    first = data.first()
    if not isinstance(first, LabeledPoint):
        raise ValueError("`data` should be an RDD of LabeledPoint")
    labels, pi, theta = callMLlibFunc("trainNaiveBayesModel", data, lambda_)
    return NaiveBayesModel(labels.toArray(), pi.toArray(), numpy.array(theta))
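A minimal training sketch through the public pyspark.mllib API (assumes a live SparkContext sc):

from pyspark.mllib.classification import NaiveBayes
from pyspark.mllib.regression import LabeledPoint

data = sc.parallelize([
    LabeledPoint(0.0, [1.0, 0.0]),
    LabeledPoint(1.0, [0.0, 1.0]),
])
model = NaiveBayes.train(data, lambda_=1.0)
print(model.predict([0.0, 1.0]))  # expected: 1.0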
Example #8
Source File: tree.py From LearningApacheSpark with MIT License | 6 votes |
def predict(self, x):
    """
    Predict the label of one or more examples.

    .. note:: In Python, predict cannot currently be used within an RDD
              transformation or action.
              Call predict directly on the RDD instead.

    :param x:
      Data point (feature vector), or an RDD of data points (feature
      vectors).
    """
    if isinstance(x, RDD):
        return self.call("predict", x.map(_convert_to_vector))
    else:
        return self.call("predict", _convert_to_vector(x))
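The note above matters in practice: map the features out of the RDD first, then hand the whole RDD to predict. A sketch, assuming model is a trained tree model and test_data is an RDD of LabeledPoint:

features = test_data.map(lambda lp: lp.features)  # strip labels before predicting
predictions = model.predict(features)             # RDD of predicted labels
labels_and_preds = test_data.map(lambda lp: lp.label).zip(predictions)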
Example #9
Source File: regression.py From LearningApacheSpark with MIT License | 6 votes |
def predict(self, x):
    """
    Predict labels for provided features.
    Using a piecewise linear function.

    1) If x exactly matches a boundary then associated prediction
       is returned. In case there are multiple predictions with the
       same boundary then one of them is returned. Which one is
       undefined (same as java.util.Arrays.binarySearch).
    2) If x is lower or higher than all boundaries then first or
       last prediction is returned respectively. In case there are
       multiple predictions with the same boundary then the lowest
       or highest is returned respectively.
    3) If x falls between two values in boundary array then
       prediction is treated as piecewise linear function and
       interpolated value is returned. In case there are multiple
       values with the same boundary then the same rules as in 2)
       are used.

    :param x:
      Feature or RDD of Features to be labeled.
    """
    if isinstance(x, RDD):
        return x.map(lambda v: self.predict(v))
    return np.interp(x, self.boundaries, self.predictions)
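The scalar case is plain numpy interpolation; a self-contained illustration of the three rules above:

import numpy as np

boundaries = np.array([1.0, 2.0, 4.0])
predictions = np.array([1.0, 3.0, 5.0])
print(np.interp(2.0, boundaries, predictions))  # 3.0 -- exact boundary match
print(np.interp(3.0, boundaries, predictions))  # 4.0 -- linear interpolation
print(np.interp(0.5, boundaries, predictions))  # 1.0 -- clamped below the range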
Example #10
Source File: distributed.py From LearningApacheSpark with MIT License | 6 votes |
def entries(self):
    """
    Entries of the CoordinateMatrix stored as an RDD of
    MatrixEntries.

    >>> mat = CoordinateMatrix(sc.parallelize([MatrixEntry(0, 0, 1.2),
    ...                                        MatrixEntry(6, 4, 2.1)]))
    >>> entries = mat.entries
    >>> entries.first()
    MatrixEntry(0, 0, 1.2)
    """
    # We use DataFrames for serialization of MatrixEntry entries
    # from Java, so we first convert the RDD of entries to a
    # DataFrame on the Scala/Java side. Then we map each Row in
    # the DataFrame back to a MatrixEntry on this side.
    entries_df = callMLlibFunc("getMatrixEntries", self._java_matrix_wrapper._java_model)
    entries = entries_df.rdd.map(lambda row: MatrixEntry(row[0], row[1], row[2]))
    return entries
Example #11
Source File: distributed.py From LearningApacheSpark with MIT License | 6 votes |
def blocks(self):
    """
    The RDD of sub-matrix blocks
    ((blockRowIndex, blockColIndex), sub-matrix) that form this
    distributed matrix.

    >>> mat = BlockMatrix(
    ...     sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])),
    ...                     ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]), 3, 2)
    >>> blocks = mat.blocks
    >>> blocks.first()
    ((0, 0), DenseMatrix(3, 2, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], 0))
    """
    # We use DataFrames for serialization of sub-matrix blocks
    # from Java, so we first convert the RDD of blocks to a
    # DataFrame on the Scala/Java side. Then we map each Row in
    # the DataFrame back to a sub-matrix block on this side.
    blocks_df = callMLlibFunc("getMatrixBlocks", self._java_matrix_wrapper._java_model)
    blocks = blocks_df.rdd.map(lambda row: ((row[0][0], row[0][1]), row[1]))
    return blocks
Example #12
Source File: dstream.py From LearningApacheSpark with MIT License | 6 votes |
def pprint(self, num=10):
    """
    Print the first num elements of each RDD generated in this DStream.

    @param num: the first num elements of each RDD will be printed.
    """
    def takeAndPrint(time, rdd):
        taken = rdd.take(num + 1)
        print("-------------------------------------------")
        print("Time: %s" % time)
        print("-------------------------------------------")
        for record in taken[:num]:
            print(record)
        if len(taken) > num:
            print("...")
        print("")

    self.foreachRDD(takeAndPrint)
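Typical streaming usage, assuming a live SparkContext sc and a socket source on localhost:9999 (a hypothetical endpoint):

from pyspark.streaming import StreamingContext

ssc = StreamingContext(sc, batchDuration=1)
lines = ssc.socketTextStream("localhost", 9999)
lines.pprint(5)  # print at most 5 records from each 1-second batch
ssc.start()
ssc.awaitTermination()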
Example #13
Source File: dstream.py From LearningApacheSpark with MIT License | 6 votes |
def transformWith(self, func, other, keepSerializer=False):
    """
    Return a new DStream in which each RDD is generated by applying a function
    on each RDD of this DStream and 'other' DStream.

    `func` can have two arguments of (`rdd_a`, `rdd_b`) or have three
    arguments of (`time`, `rdd_a`, `rdd_b`)
    """
    if func.__code__.co_argcount == 2:
        oldfunc = func
        func = lambda t, a, b: oldfunc(a, b)
    assert func.__code__.co_argcount == 3, "func should take two or three arguments"
    jfunc = TransformFunction(self._sc, func, self._jrdd_deserializer,
                              other._jrdd_deserializer)
    dstream = self._sc._jvm.PythonTransformed2DStream(self._jdstream.dstream(),
                                                      other._jdstream.dstream(), jfunc)
    jrdd_serializer = self._jrdd_deserializer if keepSerializer else self._sc.serializer
    return DStream(dstream.asJavaDStream(), self._ssc, jrdd_serializer)
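A sketch of a per-batch join between two key-value DStreams (stream_a and stream_b are assumed to already exist and share a batch interval):

joined = stream_a.transformWith(lambda rdd_a, rdd_b: rdd_a.join(rdd_b), stream_b)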
Example #14
Source File: dstream.py From LearningApacheSpark with MIT License | 6 votes |
def window(self, windowDuration, slideDuration=None):
    """
    Return a new DStream in which each RDD contains all the elements seen in a
    sliding window of time over this DStream.

    @param windowDuration: width of the window; must be a multiple of this
                           DStream's batching interval
    @param slideDuration:  sliding interval of the window (i.e., the
                           interval after which the new DStream will
                           generate RDDs); must be a multiple of this
                           DStream's batching interval
    """
    self._validate_window_param(windowDuration, slideDuration)
    d = self._ssc._jduration(windowDuration)
    if slideDuration is None:
        return DStream(self._jdstream.window(d), self._ssc, self._jrdd_deserializer)
    s = self._ssc._jduration(slideDuration)
    return DStream(self._jdstream.window(d, s), self._ssc, self._jrdd_deserializer)
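For example, with a 1-second batch interval, a 30-second window recomputed every 10 seconds (lines is an assumed DStream):

windowed = lines.window(30, 10)  # both durations are multiples of the batch interval
windowed.count().pprint()        # count over the last 30 seconds, emitted every 10 seconds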
Example #15
Source File: dstream.py From LearningApacheSpark with MIT License | 6 votes |
def countByValueAndWindow(self, windowDuration, slideDuration, numPartitions=None):
    """
    Return a new DStream in which each RDD contains the count of distinct elements in
    RDDs in a sliding window over this DStream.

    @param windowDuration: width of the window; must be a multiple of this
                           DStream's batching interval
    @param slideDuration:  sliding interval of the window (i.e., the
                           interval after which the new DStream will
                           generate RDDs); must be a multiple of this
                           DStream's batching interval
    @param numPartitions:  number of partitions of each RDD in the new
                           DStream.
    """
    keyed = self.map(lambda x: (x, 1))
    counted = keyed.reduceByKeyAndWindow(operator.add, operator.sub,
                                         windowDuration, slideDuration, numPartitions)
    return counted.filter(lambda kv: kv[1] > 0)
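Because the implementation uses an inverse reduce (operator.sub) to update the window incrementally, checkpointing must be enabled first; a sketch with a hypothetical checkpoint directory and an assumed DStream words:

ssc.checkpoint("/tmp/streaming-checkpoint")  # required for windowed state
counts = words.countByValueAndWindow(30, 10)
counts.pprint()  # e.g. ('spark', 12) per 30-second window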
Example #16
Source File: dstream.py From LearningApacheSpark with MIT License | 6 votes |
def groupByKeyAndWindow(self, windowDuration, slideDuration, numPartitions=None):
    """
    Return a new DStream by applying `groupByKey` over a sliding window.
    Similar to `DStream.groupByKey()`, but applies it over a sliding window.

    @param windowDuration: width of the window; must be a multiple of this
                           DStream's batching interval
    @param slideDuration:  sliding interval of the window (i.e., the
                           interval after which the new DStream will
                           generate RDDs); must be a multiple of this
                           DStream's batching interval
    @param numPartitions:  number of partitions of each RDD in the new
                           DStream.
    """
    ls = self.mapValues(lambda x: [x])
    grouped = ls.reduceByKeyAndWindow(lambda a, b: a.extend(b) or a,
                                      lambda a, b: a[len(b):],
                                      windowDuration, slideDuration, numPartitions)
    return grouped.mapValues(ResultIterable)
Example #17
Source File: common.py From LearningApacheSpark with MIT License | 6 votes |
def _py2java(sc, obj):
    """ Convert Python object into Java """
    if isinstance(obj, RDD):
        obj = _to_java_object_rdd(obj)
    elif isinstance(obj, DataFrame):
        obj = obj._jdf
    elif isinstance(obj, SparkContext):
        obj = obj._jsc
    elif isinstance(obj, list):
        obj = [_py2java(sc, x) for x in obj]
    elif isinstance(obj, JavaObject):
        pass
    elif isinstance(obj, (int, long, float, bool, bytes, unicode)):
        pass
    else:
        data = bytearray(PickleSerializer().dumps(obj))
        obj = sc._jvm.org.apache.spark.mllib.api.python.SerDe.loads(data)
    return obj
Example #18
Source File: distributed.py From LearningApacheSpark with MIT License | 6 votes |
def rows(self):
    """
    Rows of the IndexedRowMatrix stored as an RDD of IndexedRows.

    >>> mat = IndexedRowMatrix(sc.parallelize([IndexedRow(0, [1, 2, 3]),
    ...                                        IndexedRow(1, [4, 5, 6])]))
    >>> rows = mat.rows
    >>> rows.first()
    IndexedRow(0, [1.0,2.0,3.0])
    """
    # We use DataFrames for serialization of IndexedRows from
    # Java, so we first convert the RDD of rows to a DataFrame
    # on the Scala/Java side. Then we map each Row in the
    # DataFrame back to an IndexedRow on this side.
    rows_df = callMLlibFunc("getIndexedRows", self._java_matrix_wrapper._java_model)
    rows = rows_df.rdd.map(lambda row: IndexedRow(row[0], row[1]))
    return rows
Example #19
Source File: dstream.py From LearningApacheSpark with MIT License | 5 votes |
def combineByKey(self, createCombiner, mergeValue, mergeCombiners,
                 numPartitions=None):
    """
    Return a new DStream by applying combineByKey to each RDD.
    """
    if numPartitions is None:
        numPartitions = self._sc.defaultParallelism

    def func(rdd):
        return rdd.combineByKey(createCombiner, mergeValue, mergeCombiners, numPartitions)

    return self.transform(func)
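Since this just forwards to RDD.combineByKey per batch, the RDD form shows the three callbacks most clearly (assumes a live SparkContext sc):

pairs = sc.parallelize([("a", 1), ("b", 2), ("a", 3)])
combined = pairs.combineByKey(
    lambda v: [v],             # createCombiner: start a list for a new key
    lambda acc, v: acc + [v],  # mergeValue: fold a value into a partition-local list
    lambda a, b: a + b,        # mergeCombiners: concatenate lists across partitions
)
print(sorted(combined.collect()))  # [('a', [1, 3]), ('b', [2])]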
Example #20
Source File: dstream.py From LearningApacheSpark with MIT License | 5 votes |
def mapPartitionsWithIndex(self, f, preservesPartitioning=False):
    """
    Return a new DStream in which each RDD is generated by applying
    mapPartitionsWithIndex() to each RDD of this DStream.
    """
    return self.transform(lambda rdd: rdd.mapPartitionsWithIndex(f, preservesPartitioning))
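The function receives a partition index and an iterator over that partition; a sketch tagging each element with its partition (stream is an assumed DStream):

tagged = stream.mapPartitionsWithIndex(
    lambda idx, it: ((idx, x) for x in it)  # yield (partition index, element) pairs
)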
Example #21
Source File: dstream.py From LearningApacheSpark with MIT License | 5 votes |
def count(self):
    """
    Return a new DStream in which each RDD has a single element
    generated by counting each RDD of this DStream.
    """
    return self.mapPartitions(lambda i: [sum(1 for _ in i)]).reduce(operator.add)
Example #22
Source File: wikidata.py From qb with MIT License | 5 votes |
def extract_item_page_map(wikidata_items: RDD):
    def parse_item_page(item):
        item_id = item['id']
        if 'enwiki' in item['sitelinks']:
            return [(item_id, item['sitelinks']['enwiki']['title'])]
        else:
            return []

    return wikidata_items.flatMap(parse_item_page).collectAsMap()
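A sketch of the record shape the parser expects, inferred from the code above (the example item is hypothetical; assumes a live SparkContext sc):

item = {
    'id': 'Q42',
    'sitelinks': {'enwiki': {'title': 'Douglas Adams'}},
}
print(extract_item_page_map(sc.parallelize([item])))  # {'Q42': 'Douglas Adams'}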
Example #23
Source File: dstream.py From LearningApacheSpark with MIT License | 5 votes |
def reduce(self, func):
    """
    Return a new DStream in which each RDD has a single element
    generated by reducing each RDD of this DStream.
    """
    return self.map(lambda x: (None, x)).reduceByKey(func, 1).map(lambda x: x[1])
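Usage sketch: per-batch sums over a numeric DStream (stream is assumed):

import operator

totals = stream.reduce(operator.add)  # each output RDD holds one element: the batch total
totals.pprint()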
Example #24
Source File: dstream.py From LearningApacheSpark with MIT License | 5 votes |
def glom(self):
    """
    Return a new DStream in which each RDD is generated by applying
    glom() to each RDD of this DStream.
    """
    def func(iterator):
        yield list(iterator)

    return self.mapPartitions(func)
Example #25
Source File: dstream.py From LearningApacheSpark with MIT License | 5 votes |
def checkpoint(self, interval):
    """
    Enable periodic checkpointing of RDDs of this DStream.

    @param interval: time in seconds after which the generated RDDs will
                     be checkpointed
    """
    self.is_checkpointed = True
    self._jdstream.checkpoint(self._ssc._jduration(interval))
    return self
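Checkpointing also needs a directory configured on the StreamingContext; a sketch with a hypothetical path and an assumed stateful DStream named state:

ssc.checkpoint("/tmp/streaming-checkpoint")  # where RDD data is written
state.checkpoint(10)                         # checkpoint generated RDDs every 10 seconds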
Example #26
Source File: dstream.py From LearningApacheSpark with MIT License | 5 votes |
def groupByKey(self, numPartitions=None):
    """
    Return a new DStream by applying groupByKey on each RDD.
    """
    if numPartitions is None:
        numPartitions = self._sc.defaultParallelism
    return self.transform(lambda rdd: rdd.groupByKey(numPartitions))
Example #27
Source File: dstream.py From LearningApacheSpark with MIT License | 5 votes |
def countByValue(self):
    """
    Return a new DStream in which each RDD contains the counts of each
    distinct value in each RDD of this DStream.
    """
    return self.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x+y)
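The classic word-count pattern built on this method (lines is an assumed DStream of text lines):

words = lines.flatMap(lambda line: line.split(" "))
words.countByValue().pprint()  # e.g. ('spark', 3) per batch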
Example #28
Source File: dstream.py From LearningApacheSpark with MIT License | 5 votes |
def transform(self, func):
    """
    Return a new DStream in which each RDD is generated by applying a function
    on each RDD of this DStream.

    `func` can have one argument of `rdd`, or have two arguments of
    (`time`, `rdd`)
    """
    if func.__code__.co_argcount == 1:
        oldfunc = func
        func = lambda t, rdd: oldfunc(rdd)
    assert func.__code__.co_argcount == 2, "func should take one or two arguments"
    return TransformedDStream(self, func)
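A sketch applying an arbitrary RDD operation per batch, here subtracting a static blacklist (blacklist_rdd is a hypothetical precomputed RDD):

cleaned = stream.transform(lambda rdd: rdd.subtract(blacklist_rdd))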
Example #29
Source File: wikidata.py From qb with MIT License | 5 votes |
def extract_items(wikidata_items: RDD, b_property_map: Broadcast, b_item_page_map: Broadcast):
    def parse_item(item):
        property_map = b_property_map.value
        item_page_map = b_item_page_map.value
        if 'enwiki' in item['sitelinks']:
            page_title = item['sitelinks']['enwiki']['title']
        else:
            return None, None

        claims = {}
        for prop_id, property_claims in item['claims'].items():
            if prop_id in property_map:
                prop_name = property_map[prop_id]
                parsed_claims = []
                for c in property_claims:
                    if 'datavalue' in c['mainsnak']:
                        c = c['mainsnak']['datavalue']['value']
                        if type(c) == dict and 'entity-type' in c:
                            claim_item_id = c['id']
                            if claim_item_id in item_page_map:
                                c = item_page_map[c['id']]
                            else:
                                continue
                        parsed_claims.append(c)
                claims[prop_name] = parsed_claims
        return page_title, claims

    return wikidata_items\
        .map(parse_item)\
        .filter(lambda pc: pc[0] is not None)\
        .reduceByKey(lambda x, y: x)\
        .collectAsMap()
Example #30
Source File: vocabulary2id.py From code2vec with Apache License 2.0 | 5 votes |
def __call__(self, rows: RDD):
    value2index, path2index, value2freq, path2freq = self.build_vocabularies(rows)
    doc2path_contexts = self.build_doc2pc(value2index, path2index, rows)
    doc2path_contexts = doc2path_contexts.collect()
    Code2VecFeatures().construct(value2index=value2index,
                                 path2index=path2index,
                                 value2freq=value2freq,
                                 path2freq=path2freq,
                                 path_contexts=doc2path_contexts).save(self.output)