Python pyspark.sql.Row() Examples
The following are 30 code examples of pyspark.sql.Row().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module pyspark.sql, or try the search function.
Example #1
Source File: functions.py From LearningApacheSpark with MIT License | 6 votes |
def randn(seed=None):
    """Build a column of independent and identically distributed (i.i.d.)
    samples drawn from the standard normal distribution.

    .. note:: The function is non-deterministic in general case.

    :param seed: optional seed forwarded to the JVM ``randn`` function; when it
        is ``None`` the zero-argument JVM overload is called instead.
    :return: a :class:`Column` wrapping the JVM expression.

    >>> df.withColumn('randn', randn(seed=42)).collect()
    [Row(age=2, name=u'Alice', randn=-0.7556247885860078),
     Row(age=5, name=u'Bob', randn=-0.0861619008451133)]
    """
    jvm_functions = SparkContext._active_spark_context._jvm.functions
    if seed is None:
        return Column(jvm_functions.randn())
    return Column(jvm_functions.randn(seed))
Example #2
Source File: session.py From tidb-docker-compose with Apache License 2.0 | 6 votes |
def _inferSchemaFromList(self, data, names=None): """ Infer schema from list of Row or tuple. :param data: list of Row or tuple :param names: list of column names :return: :class:`pyspark.sql.types.StructType` """ if not data: raise ValueError("can not infer schema from empty dataset") first = data[0] if type(first) is dict: warnings.warn("inferring schema from dict is deprecated," "please use pyspark.sql.Row instead") schema = reduce(_merge_type, (_infer_schema(row, names) for row in data)) if _has_nulltype(schema): raise ValueError("Some of types cannot be determined after inferring") return schema
Example #3
Source File: session.py From tidb-docker-compose with Apache License 2.0 | 6 votes |
def _test():
    # Doctest driver for pyspark.sql.session: changes into $SPARK_HOME, starts a
    # local[4] SparkContext, exposes sc/spark/rdd/df to the doctests, stops the
    # context and exits with status -1 when any doctest failed.
    import os
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import Row
    import pyspark.sql.session

    os.chdir(os.environ["SPARK_HOME"])

    # Start from the module namespace so doctests can see its public names.
    globs = pyspark.sql.session.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    globs['sc'] = sc
    globs['spark'] = SparkSession(sc)

    sample_rows = [
        Row(field1=1, field2="row1"),
        Row(field1=2, field2="row2"),
        Row(field1=3, field2="row3"),
    ]
    rdd = sc.parallelize(sample_rows)
    globs['rdd'] = rdd
    globs['df'] = rdd.toDF()

    doctest_flags = doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE
    failure_count, _ = doctest.testmod(
        pyspark.sql.session, globs=globs, optionflags=doctest_flags)
    globs['sc'].stop()
    if failure_count:
        sys.exit(-1)
Example #4
Source File: tests.py From LearningApacheSpark with MIT License | 6 votes |
def test_string_indexer_handle_invalid(self):
    # A null label must be indexed under handleInvalid="keep" and dropped
    # under handleInvalid="skip".
    df = self.spark.createDataFrame(
        [(0, "a"), (1, "d"), (2, None)],
        ["id", "label"])

    si1 = StringIndexer(inputCol="label", outputCol="indexed",
                        handleInvalid="keep", stringOrderType="alphabetAsc")
    kept_rows = si1.fit(df).transform(df).select("id", "indexed").collect()
    self.assertEqual(
        kept_rows,
        [Row(id=i, indexed=float(i)) for i in range(3)])

    # setHandleInvalid's return value is used as the second indexer.
    si2 = si1.setHandleInvalid("skip")
    skipped_rows = si2.fit(df).transform(df).select("id", "indexed").collect()
    self.assertEqual(
        skipped_rows,
        [Row(id=i, indexed=float(i)) for i in range(2)])
Example #5
Source File: session.py From tidb-docker-compose with Apache License 2.0 | 6 votes |
def _monkey_patch_RDD(sparkSession):
    # Installs a ``toDF`` method on RDD that is bound to the given session.
    def toDF(self, schema=None, sampleRatio=None):
        """
        Converts current :class:`RDD` into a :class:`DataFrame`

        This is a shorthand for ``spark.createDataFrame(rdd, schema, sampleRatio)``

        :param schema: a :class:`pyspark.sql.types.StructType` or list of names of columns
        :param sampleRatio: the sample ratio of rows used for inferring
        :return: a DataFrame

        >>> rdd.toDF().collect()
        [Row(name=u'Alice', age=1)]
        """
        return sparkSession.createDataFrame(self, schema, sampleRatio)

    setattr(RDD, "toDF", toDF)
Example #6
Source File: keras_sql_udf_test.py From spark-deep-learning with Apache License 2.0 | 6 votes |
def test_map_blocks_sql_1(self):
    # Registers a blocked graph UDF computing x + 3 and calls it from SQL.
    rows = [Row(x=float(i)) for i in range(5)]
    df = self.sql.createDataFrame(rows)
    with IsolatedSession() as issn:
        # Placeholder standing in for column 'x' as a whole column
        x = tf.placeholder(tf.double, shape=[None], name="x")
        # Output tensor: x plus 3
        z = tf.add(x, 3, name='z')
        # Register the computation under its SQL name.
        makeGraphUDF(issn.graph, "map_blocks_sql_1", [z], blocked=True)

    # Call the registered UDF through a SQL expression from PySpark.
    df2 = df.selectExpr("map_blocks_sql_1(x) AS z")
    print("df2 = %s" % df2)
    data2 = df2.collect()
    assert len(data2) == 5, data2
    assert data2[0].z == 3.0, data2
Example #7
Source File: keras_sql_udf_test.py From spark-deep-learning with Apache License 2.0 | 6 votes |
def test_map_rows_sql_1(self):
    # Registers a row-wise graph UDF computing x + 3 and calls it from SQL.
    rows = [Row(x=float(i)) for i in range(5)]
    df = self.sql.createDataFrame(rows)
    with IsolatedSession() as issn:
        # Placeholder standing in for column 'x' (scalar shape)
        x = tf.placeholder(tf.double, shape=[], name="x")
        # Output tensor: x plus 3
        z = tf.add(x, 3, name='z')
        # Register the computation under its SQL name.
        makeGraphUDF(issn.graph, "map_rows_sql_1", [z])

    # Call the registered UDF through a SQL expression from PySpark.
    df2 = df.selectExpr("map_rows_sql_1(x) AS z")
    print("df2 = %s" % df2)
    data2 = df2.collect()
    assert data2[0].z == 3.0, data2
Example #8
Source File: tf_transformer_test.py From spark-deep-learning with Apache License 2.0 | 6 votes |
def _build_local_features(np_dtype):
    """
    Build 100 local rows of random list-valued features.

    :param np_dtype: numpy dtype each random vector is cast to before being
        converted to a Python list.
    :return: list of ``Row`` objects, each with an ``idx`` field plus one field
        per key of ``_input_mapping``.
    """
    # Fixed seed: the generated features are reproducible across calls.
    np.random.seed(997)
    rows = []
    for idx in range(100):
        fields = {'idx': idx}
        for colname, _ in _input_mapping.items():
            scaled = np.random.randn(_tensor_size) * 100
            fields[colname] = scaled.astype(np_dtype).tolist()
        rows.append(Row(**fields))
    return rows
Example #9
Source File: session.py From tidb-docker-compose with Apache License 2.0 | 6 votes |
def _monkey_patch_RDD(sparkSession):
    # Attaches a session-bound ``toDF`` convenience method to the RDD class.
    def toDF(self, schema=None, sampleRatio=None):
        """
        Converts current :class:`RDD` into a :class:`DataFrame`

        This is a shorthand for ``spark.createDataFrame(rdd, schema, sampleRatio)``

        :param schema: a :class:`pyspark.sql.types.StructType` or list of names of columns
        :param sampleRatio: the sample ratio of rows used for inferring
        :return: a DataFrame

        >>> rdd.toDF().collect()
        [Row(name=u'Alice', age=1)]
        """
        return sparkSession.createDataFrame(self, schema, sampleRatio)

    setattr(RDD, "toDF", toDF)
Example #10
Source File: functions.py From LearningApacheSpark with MIT License | 6 votes |
def date_format(date, format):
    """
    Render a date/timestamp/string column as a string, using the date format
    given by the second argument.

    A pattern could be for instance `dd.MM.yyyy` and could return a string like '18.03.1993'. All
    pattern letters of the Java class `java.text.SimpleDateFormat` can be used.

    .. note:: Whenever possible, use specialized functions like `year`. These benefit from a
        specialized implementation.

    >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])
    >>> df.select(date_format('dt', 'MM/dd/yyy').alias('date')).collect()
    [Row(date=u'04/08/2015')]
    """
    jvm_date_format = SparkContext._active_spark_context._jvm.functions.date_format
    java_date = _to_java_column(date)
    return Column(jvm_date_format(java_date, format))
Example #11
Source File: session.py From tidb-docker-compose with Apache License 2.0 | 6 votes |
def _test():
    # Doctest driver for pyspark.sql.session.
    #
    # Changes into $SPARK_HOME, starts a local[4] SparkContext, exposes
    # sc/spark/rdd/df to the doctests and exits with status -1 when any doctest
    # failed.
    import os
    import sys
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import Row
    import pyspark.sql.session

    os.chdir(os.environ["SPARK_HOME"])

    globs = pyspark.sql.session.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    try:
        globs['sc'] = sc
        globs['spark'] = SparkSession(sc)
        globs['rdd'] = rdd = sc.parallelize(
            [Row(field1=1, field2="row1"),
             Row(field1=2, field2="row2"),
             Row(field1=3, field2="row3")])
        globs['df'] = rdd.toDF()
        (failure_count, test_count) = doctest.testmod(
            pyspark.sql.session, globs=globs,
            optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
    finally:
        # Always release the context, even when setup or a doctest raises.
        sc.stop()

    if failure_count:
        # sys.exit rather than the builtin exit(): the latter is injected by
        # the site module and is not guaranteed to exist in every run.
        sys.exit(-1)
Example #12
Source File: functions.py From LearningApacheSpark with MIT License | 6 votes |
def approx_count_distinct(col, rsd=None):
    """Aggregate function: returns a new :class:`Column` holding the approximate
    distinct count of column `col`.

    :param rsd: maximum estimation error allowed (default = 0.05). For rsd < 0.01, it is more
        efficient to use :func:`countDistinct`

    >>> df.agg(approx_count_distinct(df.age).alias('distinct_ages')).collect()
    [Row(distinct_ages=2)]
    """
    jvm_functions = SparkContext._active_spark_context._jvm.functions
    # rsd is only forwarded when given, so the matching JVM overload is chosen.
    optional_args = () if rsd is None else (rsd,)
    return Column(
        jvm_functions.approx_count_distinct(_to_java_column(col), *optional_args))
Example #13
Source File: test_spark.py From snorkel with Apache License 2.0 | 6 votes |
def test_decorator_mapper_memoized_none(self) -> None:
    # A memoized mapper that returns None must not call the tracker again on
    # a second invocation with the same data point.
    square_hit_tracker = SquareHitTracker()

    @lambda_mapper(memoize=True)
    def square(x: DataPoint) -> DataPoint:
        fields = x.asDict()
        fields["num_squared"] = square_hit_tracker(x.num)
        return None if x.num == 21 else Row(**fields)

    x21 = self._get_x(21)
    # Same input both times: the hit counter has to stay at 1.
    for _ in range(2):
        self.assertIsNone(square(x21))
        self.assertEqual(square_hit_tracker.n_hits, 1)
Example #14
Source File: make_folds.py From search-MjoLniR with MIT License | 6 votes |
def convert_svmrank_to_xgboost(df: DataFrame) -> DataFrame:
    """Convert each row's file to xgboost format and return the old and new rows.

    Every input row is converted via ``_convert_xgboost_remote`` into a file
    at ``row.path + '.xgb'``; the result is ``df`` unioned with a copy of its
    rows whose ``vec_format`` is 'xgboost' and whose ``path`` is the new file.
    """
    def convert_one(row: Row) -> Row:
        # For now the .xgb file sits right next to the svmrank file. Naming/path
        # options could be added if needed later.
        out_path = row.path + '.xgb'
        _convert_xgboost_remote(row.path, out_path)
        fields = row.asDict()
        fields.update(vec_format='xgboost', path=out_path)
        return Row(**fields)

    # Each row represents potentially gigabytes, so convince spark
    # to create a partition per row.
    one_row_per_partition = mt.partition_per_row(df.rdd)
    rdd_xgb = one_row_per_partition.map(convert_one)
    df_xgb = df.sql_ctx.createDataFrame(rdd_xgb, df.schema)  # type: ignore
    # Both the xgb and svmrank datasets are returned since the related files
    # aren't purged. df is safe to reuse since svmrank conversion returns a
    # new dataframe with no lineage.
    return df.union(df_xgb)
Example #15
Source File: test_spark.py From snorkel with Apache License 2.0 | 6 votes |
def test_lf_applier_spark_preprocessor_memoized(self) -> None:
    # Applies labeling functions through SparkLFApplier where one of them
    # depends on a memoized preprocessor.
    sql = SQLContext(SparkContext.getOrCreate())

    @preprocessor(memoize=True)
    def square_memoize(x: DataPoint) -> DataPoint:
        return Row(num=x.num, num_squared=x.num ** 2)

    @labeling_function(pre=[square_memoize])
    def fp_memoized(x: DataPoint) -> int:
        if x.num_squared > 42:
            return 0
        return -1

    pandas_df = pd.DataFrame({"num": DATA})
    rdd = sql.createDataFrame(pandas_df).rdd
    L = SparkLFApplier([f, fp_memoized]).apply(rdd)
    np.testing.assert_equal(L, L_PREPROCESS_EXPECTED)
Example #16
Source File: functions.py From LearningApacheSpark with MIT License | 6 votes |
def monotonically_increasing_id():
    """Return a column that generates monotonically increasing 64-bit integers.

    The generated ID is guaranteed to be monotonically increasing and unique, but not consecutive.
    The current implementation puts the partition ID in the upper 31 bits, and the record number
    within each partition in the lower 33 bits. The assumption is that the data frame has
    less than 1 billion partitions, and each partition has less than 8 billion records.

    .. note:: The function is non-deterministic because its result depends on partition IDs.

    As an example, consider a :class:`DataFrame` with two partitions, each with 3 records.
    This expression would return the following IDs:
    0, 1, 2, 8589934592 (1L << 33), 8589934593, 8589934594.

    >>> df0 = sc.parallelize(range(2), 2).mapPartitions(lambda x: [(1,), (2,), (3,)]).toDF(['col1'])
    >>> df0.select(monotonically_increasing_id().alias('id')).collect()
    [Row(id=0), Row(id=1), Row(id=2), Row(id=8589934592), Row(id=8589934593), Row(id=8589934594)]
    """
    jvm_functions = SparkContext._active_spark_context._jvm.functions
    java_column = jvm_functions.monotonically_increasing_id()
    return Column(java_column)
Example #17
Source File: session.py From tidb-docker-compose with Apache License 2.0 | 6 votes |
def _inferSchemaFromList(self, data, names=None): """ Infer schema from list of Row or tuple. :param data: list of Row or tuple :param names: list of column names :return: :class:`pyspark.sql.types.StructType` """ if not data: raise ValueError("can not infer schema from empty dataset") first = data[0] if type(first) is dict: warnings.warn("inferring schema from dict is deprecated," "please use pyspark.sql.Row instead") schema = reduce(_merge_type, (_infer_schema(row, names) for row in data)) if _has_nulltype(schema): raise ValueError("Some of types cannot be determined after inferring") return schema
Example #18
Source File: medline_spark.py From pubmed_parser with MIT License | 6 votes |
def process_file(date_update):
    """Process downloaded MEDLINE folder to parquet file

    Parses every ``medline*.xml.gz`` file under ``download_dir`` and writes
    three parquet datasets under ``save_dir``, each suffixed with
    ``date_update`` formatted as ``%Y_%m_%d``:

    * ``medline_raw_*``: every parsed publication, tagged with its file name
    * ``medline_lastview_*``: per ``pmid``, the row from the file whose name
      sorts last, kept only when no row for that ``pmid`` is marked deleted
    * ``medline_grant_*``: the non-None grant records

    :param date_update: object with ``strftime`` (e.g. a date) used to name
        the output datasets
    """
    import shutil

    print("Process MEDLINE file to parquet")
    # Remove leftovers from earlier runs. The paths returned by glob are deleted
    # directly: handing the pattern to ``rm`` as a list argument performs no
    # shell expansion, so nothing would match.
    for stale_path in glob(os.path.join(save_dir, 'medline_*.parquet')):
        if os.path.isdir(stale_path) and not os.path.islink(stale_path):
            shutil.rmtree(stale_path, ignore_errors=True)
        else:
            os.remove(stale_path)

    date_update_str = date_update.strftime("%Y_%m_%d")
    path_rdd = sc.parallelize(glob(os.path.join(download_dir, 'medline*.xml.gz')), numSlices=1000)
    parse_results_rdd = path_rdd.\
        flatMap(lambda x: [Row(file_name=os.path.basename(x), **publication_dict)
                           for publication_dict in pp.parse_medline_xml(x)])
    medline_df = parse_results_rdd.toDF()
    medline_df.write.parquet(os.path.join(save_dir, 'medline_raw_%s.parquet' % date_update_str),
                             mode='overwrite')

    # Rank rows of the same pmid by file name, descending.
    # NOTE(review): ``max`` is used with ``.over`` so it is presumably
    # pyspark.sql.functions.max rather than the builtin - confirm the imports.
    window = Window.partitionBy(['pmid']).orderBy(desc('file_name'))
    windowed_df = medline_df.select(
        max('delete').over(window).alias('is_deleted'),
        rank().over(window).alias('pos'),
        '*')
    windowed_df.\
        where('is_deleted = False and pos = 1').\
        write.parquet(os.path.join(save_dir, 'medline_lastview_%s.parquet' % date_update_str),
                      mode='overwrite')

    # parse grant database
    parse_grant_rdd = path_rdd.flatMap(lambda x: pp.parse_medline_grant_id(x))\
        .filter(lambda x: x is not None)\
        .map(lambda x: Row(**x))
    grant_df = parse_grant_rdd.toDF()
    grant_df.write.parquet(os.path.join(save_dir, 'medline_grant_%s.parquet' % date_update_str),
                           mode='overwrite')
Example #19
Source File: tests.py From LearningApacheSpark with MIT License | 6 votes |
def test_infer_schema(self):
    # The "features" column must be inferred as the vector UDT and the
    # vectors must round-trip unchanged through a DataFrame.
    source_rows = [Row(label=1.0, features=self.dv1),
                   Row(label=0.0, features=self.sv1)]
    df = self.sc.parallelize(source_rows).toDF()

    feature_fields = [f for f in df.schema.fields if f.name == "features"]
    self.assertEqual(feature_fields[0].dataType, self.udt)

    vectors = df.rdd.map(lambda p: p.features).collect()
    self.assertEqual(len(vectors), 2)
    for v in vectors:
        if isinstance(v, SparseVector):
            self.assertEqual(v, self.sv1)
            continue
        if isinstance(v, DenseVector):
            self.assertEqual(v, self.dv1)
            continue
        raise TypeError("expecting a vector but got %r of type %r" % (v, type(v)))
Example #20
Source File: functions.py From LearningApacheSpark with MIT License | 5 votes |
def countDistinct(col, *cols):
    """Build a new :class:`Column` holding the distinct count of ``col`` or ``cols``.

    >>> df.agg(countDistinct(df.age, df.name).alias('c')).collect()
    [Row(c=2)]

    >>> df.agg(countDistinct("age", "name").alias('c')).collect()
    [Row(c=2)]
    """
    sc = SparkContext._active_spark_context
    jvm_count_distinct = sc._jvm.functions.countDistinct
    first_column = _to_java_column(col)
    # The remaining columns are handed over as a single sequence argument.
    other_columns = _to_seq(sc, cols, _to_java_column)
    return Column(jvm_count_distinct(first_column, other_columns))
Example #21
Source File: functions.py From LearningApacheSpark with MIT License | 5 votes |
def covar_pop(col1, col2):
    """Build a new :class:`Column` for the population covariance of ``col1`` and
    ``col2``.

    >>> a = [1] * 10
    >>> b = [1] * 10
    >>> df = spark.createDataFrame(zip(a, b), ["a", "b"])
    >>> df.agg(covar_pop("a", "b").alias('c')).collect()
    [Row(c=0.0)]
    """
    jvm_covar_pop = SparkContext._active_spark_context._jvm.functions.covar_pop
    left = _to_java_column(col1)
    right = _to_java_column(col2)
    return Column(jvm_covar_pop(left, right))
Example #22
Source File: functions.py From LearningApacheSpark with MIT License | 5 votes |
def isnull(col):
    """Build an expression that is true iff the column is null.

    >>> df = spark.createDataFrame([(1, None), (None, 2)], ("a", "b"))
    >>> df.select(isnull("a").alias("r1"), isnull(df.a).alias("r2")).collect()
    [Row(r1=False, r2=False), Row(r1=True, r2=True)]
    """
    jvm_isnull = SparkContext._active_spark_context._jvm.functions.isnull
    java_column = _to_java_column(col)
    return Column(jvm_isnull(java_column))
Example #23
Source File: functions.py From LearningApacheSpark with MIT License | 5 votes |
def corr(col1, col2):
    """Build a new :class:`Column` for the Pearson Correlation Coefficient of
    ``col1`` and ``col2``.

    >>> a = range(20)
    >>> b = [2 * x for x in range(20)]
    >>> df = spark.createDataFrame(zip(a, b), ["a", "b"])
    >>> df.agg(corr("a", "b").alias('c')).collect()
    [Row(c=1.0)]
    """
    jvm_corr = SparkContext._active_spark_context._jvm.functions.corr
    left = _to_java_column(col1)
    right = _to_java_column(col2)
    return Column(jvm_corr(left, right))
Example #24
Source File: tests.py From LearningApacheSpark with MIT License | 5 votes |
def test_stopwordsremover(self):
    # Exercises StopWordsRemover with its default list, a custom list, a
    # loaded language list and a locale-sensitive list.
    dataset = self.spark.createDataFrame([Row(input=["a", "panda"])])
    stopWordRemover = StopWordsRemover(inputCol="input", outputCol="output")
    # Default
    self.assertEqual(stopWordRemover.getInputCol(), "input")
    transformedDF = stopWordRemover.transform(dataset)
    self.assertEqual(transformedDF.head().output, ["panda"])
    self.assertEqual(type(stopWordRemover.getStopWords()), list)
    self.assertTrue(isinstance(stopWordRemover.getStopWords()[0], basestring))
    # Custom
    stopwords = ["panda"]
    stopWordRemover.setStopWords(stopwords)
    self.assertEqual(stopWordRemover.getInputCol(), "input")
    self.assertEqual(stopWordRemover.getStopWords(), stopwords)
    transformedDF = stopWordRemover.transform(dataset)
    self.assertEqual(transformedDF.head().output, ["a"])
    # with language selection
    stopwords = StopWordsRemover.loadDefaultStopWords("turkish")
    dataset = self.spark.createDataFrame([Row(input=["acaba", "ama", "biri"])])
    stopWordRemover.setStopWords(stopwords)
    self.assertEqual(stopWordRemover.getStopWords(), stopwords)
    transformedDF = stopWordRemover.transform(dataset)
    self.assertEqual(transformedDF.head().output, [])
    # with locale
    # The stop word ends in U+0130 (capital I with dot above). The assertion
    # below expects it to match "belki" under the "tr" locale, so the literal
    # must be stored as real UTF-8 text, not as mis-decoded bytes.
    stopwords = ["BELKİ"]
    dataset = self.spark.createDataFrame([Row(input=["belki"])])
    stopWordRemover.setStopWords(stopwords).setLocale("tr")
    self.assertEqual(stopWordRemover.getStopWords(), stopwords)
    transformedDF = stopWordRemover.transform(dataset)
    self.assertEqual(transformedDF.head().output, [])
Example #25
Source File: tests.py From LearningApacheSpark with MIT License | 5 votes |
def test_ngram(self):
    # A 4-gram transformer over five tokens must yield two n-grams.
    tokens = ["a", "b", "c", "d", "e"]
    dataset = self.spark.createDataFrame([Row(input=tokens)])
    ngram0 = NGram(n=4, inputCol="input", outputCol="output")

    # Constructor arguments must be readable back through the getters.
    self.assertEqual(ngram0.getN(), 4)
    self.assertEqual(ngram0.getInputCol(), "input")
    self.assertEqual(ngram0.getOutputCol(), "output")

    first_row = ngram0.transform(dataset).head()
    self.assertEqual(first_row.output, ["a b c d", "b c d e"])
Example #26
Source File: tests.py From LearningApacheSpark with MIT License | 5 votes |
def test_string_indexer_from_labels(self):
    # Models built straight from a label list: explicit labels, an empty
    # label list, and default settings.
    model = StringIndexerModel.from_labels(
        ["a", "b", "c"],
        inputCol="label", outputCol="indexed", handleInvalid="keep")
    self.assertEqual(model.labels, ["a", "b", "c"])

    df1 = self.spark.createDataFrame(
        [(0, "a"), (1, "c"), (2, None), (3, "b"), (4, "b")],
        ["id", "label"])
    actual1 = model.transform(df1).select("id", "indexed").collect()
    expected1 = [Row(id=row_id, indexed=index)
                 for row_id, index in enumerate([0.0, 2.0, 3.0, 1.0, 1.0])]
    self.assertEqual(actual1, expected1)

    # With no labels at all, every row is expected at index 0.0.
    model_empty_labels = StringIndexerModel.from_labels(
        [], inputCol="label", outputCol="indexed", handleInvalid="keep")
    actual2 = model_empty_labels.transform(df1).select("id", "indexed").collect()
    expected2 = [Row(id=row_id, indexed=0.0) for row_id in range(5)]
    self.assertEqual(actual2, expected2)

    # Test model with default settings can transform
    model_default = StringIndexerModel.from_labels(["a", "b", "c"], inputCol="label")
    df2 = self.spark.createDataFrame(
        [(0, "a"), (1, "c"), (2, "b"), (3, "b"), (4, "b")],
        ["id", "label"])
    output_col = model_default.getOrDefault(model_default.outputCol)
    transformed_list = model_default.transform(df2).select(output_col).collect()
    self.assertEqual(len(transformed_list), 5)
Example #27
Source File: functions.py From LearningApacheSpark with MIT License | 5 votes |
def nanvl(col1, col2):
    """Return col1 if it is not NaN, or col2 if col1 is NaN.

    Both inputs should be floating point columns (:class:`DoubleType` or :class:`FloatType`).

    >>> df = spark.createDataFrame([(1.0, float('nan')), (float('nan'), 2.0)], ("a", "b"))
    >>> df.select(nanvl("a", "b").alias("r1"), nanvl(df.a, df.b).alias("r2")).collect()
    [Row(r1=1.0, r2=1.0), Row(r1=2.0, r2=2.0)]
    """
    jvm_nanvl = SparkContext._active_spark_context._jvm.functions.nanvl
    primary = _to_java_column(col1)
    fallback = _to_java_column(col2)
    return Column(jvm_nanvl(primary, fallback))
Example #28
Source File: functions.py From LearningApacheSpark with MIT License | 5 votes |
def dayofyear(col):
    """
    Extract the day of the year of a given date as integer.

    :param col: the date column, or its name, to read the day from
    :return: a :class:`Column` wrapping the JVM ``dayofyear`` expression

    >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])
    >>> df.select(dayofyear('dt').alias('day')).collect()
    [Row(day=98)]
    """
    jvm_dayofyear = SparkContext._active_spark_context._jvm.functions.dayofyear
    java_column = _to_java_column(col)
    return Column(jvm_dayofyear(java_column))
Example #29
Source File: tests.py From LearningApacheSpark with MIT License | 5 votes |
def test_read_images(self):
    # Reads the kitten sample images and checks the ImageSchema constants,
    # the round trip through an ndarray, and the argument validation errors.
    data_path = 'data/mllib/images/origin/kittens'
    df = ImageSchema.readImages(data_path, recursive=True, dropImageFailures=True)
    self.assertEqual(df.count(), 4)

    first_row = df.take(1)[0][0]
    array = ImageSchema.toNDArray(first_row)
    self.assertEqual(len(array), first_row[1])
    self.assertEqual(ImageSchema.toImage(array, origin=first_row[0]), first_row)

    self.assertEqual(df.schema, ImageSchema.imageSchema)
    self.assertEqual(df.schema["image"].dataType, ImageSchema.columnSchema)

    expected_types = {'CV_8UC3': 16, 'Undefined': -1, 'CV_8U': 0, 'CV_8UC1': 0, 'CV_8UC4': 24}
    self.assertEqual(ImageSchema.ocvTypes, expected_types)
    expected_fields = ['origin', 'height', 'width', 'nChannels', 'mode', 'data']
    self.assertEqual(ImageSchema.imageFields, expected_fields)
    self.assertEqual(ImageSchema.undefinedImageType, "Undefined")

    # Each invalid call is checked inside its own QuietTest block.
    invalid_calls = [
        (TypeError,
         "image argument should be pyspark.sql.types.Row; however",
         lambda: ImageSchema.toNDArray("a")),
        (ValueError,
         "image argument should have attributes specified in",
         lambda: ImageSchema.toNDArray(Row(a=1))),
        (TypeError,
         "array argument should be numpy.ndarray; however, it got",
         lambda: ImageSchema.toImage("a")),
    ]
    for error_type, message_regex, bad_call in invalid_calls:
        with QuietTest(self.sc):
            self.assertRaisesRegexp(error_type, message_regex, bad_call)
Example #30
Source File: functions.py From LearningApacheSpark with MIT License | 5 votes |
def isnan(col):
    """Build an expression that is true iff the column is NaN.

    >>> df = spark.createDataFrame([(1.0, float('nan')), (float('nan'), 2.0)], ("a", "b"))
    >>> df.select(isnan("a").alias("r1"), isnan(df.a).alias("r2")).collect()
    [Row(r1=False, r2=False), Row(r1=True, r2=True)]
    """
    jvm_isnan = SparkContext._active_spark_context._jvm.functions.isnan
    java_column = _to_java_column(col)
    return Column(jvm_isnan(java_column))