Python pyspark.sql.types.StructType() Examples
The following are 30 code examples showing how to use pyspark.sql.types.StructType(). They are extracted from open source projects; the project, author, source file, and license are noted above each example.
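Before diving into the project code, here is a minimal, self-contained sketch (not taken from any of the projects below) of how a StructType schema is typically declared and passed to createDataFrame; the column names and sample rows are illustrative only.

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

spark = SparkSession.builder.appName("structtype-demo").getOrCreate()

# A schema is a StructType holding ordered StructFields:
# (column name, data type, nullable flag).
schema = StructType([
    StructField("name", StringType(), False),
    StructField("age", IntegerType(), True),
])

# Passing the schema avoids type inference and documents the expected columns.
df = spark.createDataFrame([("Alice", 34), ("Bob", None)], schema)
df.printSchema()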
Example 1
Project: Hanhan-Spark-Python Author: hanhanwu File: temp_range_sql.py License: MIT License

def main():
    temp_schema = StructType([
        StructField('StationID', StringType(), False),
        StructField('DateTime', StringType(), False),
        StructField('Observation', StringType(), False),
        StructField('DataValue', DoubleType(), False),
        StructField('MFlag', StringType(), True),
        StructField('QFlag', StringType(), True),
        StructField('SFlag', StringType(), True),
        StructField('OBSTime', StringType(), True),
    ])

    df = sqlContext.read.format('com.databricks.spark.csv').options(header='false').load(inputs1, schema=temp_schema)
    df = df.filter(df.QFlag == '')
    dfrange = get_range(df)
    result = dfrange.rdd.map(lambda r: str(r.DateTime)+' '+str(r.StationID)+' '+str(r.MaxRange))
    outdata = result.sortBy(lambda r: r[0]).coalesce(1)
    outdata.saveAsTextFile(output)
Example 2
Project: spark-deep-learning Author: databricks File: imageIO.py License: Apache License 2.0

def filesToDF(sc, path, numPartitions=None):
    """
    Read files from a directory to a DataFrame.

    :param sc: SparkContext.
    :param path: str, path to files.
    :param numPartitions: int, number of partitions to use for reading files.
    :return: DataFrame, with columns: (filePath: str, fileData: BinaryType)
    """
    numPartitions = numPartitions or sc.defaultParallelism
    schema = StructType([StructField("filePath", StringType(), False),
                         StructField("fileData", BinaryType(), False)])
    rdd = sc.binaryFiles(
        path, minPartitions=numPartitions).repartition(numPartitions)
    rdd = rdd.map(lambda x: (x[0], bytearray(x[1])))
    return rdd.toDF(schema)
Example 3
Project: search-MjoLniR Author: wikimedia File: transform.py License: MIT License

def _simplify_data_type(data_type: T.DataType) -> Tuple:
    """Simplify datatype into a tuple of equality information we care about

    Most notably this ignores nullability concerns due to hive not being
    able to represent not null in its schemas.
    """
    try:
        # Normalize UDT into its sql form. Allows comparison of schemas
        # from hive and spark.
        sql_type = data_type.sqlType()  # type: ignore
    except AttributeError:
        sql_type = data_type

    if isinstance(sql_type, T.StructType):
        return ('StructType', [(field.name, _simplify_data_type(field.dataType)) for field in sql_type])
    elif isinstance(sql_type, T.ArrayType):
        return ('ArrayType', _simplify_data_type(sql_type.elementType))
    else:
        return (type(sql_type).__name__,)
Example 4
Project: search-MjoLniR Author: wikimedia File: transform.py License: MIT License

def _verify_schema_compatability(expect: T.StructType, have: T.StructType) -> List[str]:
    """Verify all expected fields and types are present

    Allows additional columns in the `have` schema. Additionally
    allows relaxing nullability.
    """
    errors = []
    for expect_field in expect:
        try:
            have_field = have[expect_field.name]
        except KeyError:
            errors.append('Field {} missing. Have: {}'.format(expect_field.name, ','.join(have.names)))
            continue
        expect_type = _simplify_data_type(expect_field.dataType)
        have_type = _simplify_data_type(have_field.dataType)
        if expect_type != have_type:
            errors.append('Field {} has incompatible data types: expect {} != have {}'.format(
                expect_field.name, expect_type, have_type))
    return errors
Example 5
Project: search-MjoLniR Author: wikimedia File: transform.py License: MIT License

def _merge_schemas(*schemas: T.StructType):
    """Merge one or more spark schemas into a new schema"""
    fields = cast(Dict[str, T.StructField], {})
    errors = []
    for schema in schemas:
        for field in schema:
            if field.name not in fields:
                fields[field.name] = field
            elif field != fields[field.name]:
                errors.append('Incompatible fields: {} != {}'.format(field, fields[field.name]))
    if errors:
        raise Exception('\n'.join(errors))
    return T.StructType(list(fields.values()))


# Primary input schema from which most everything else is derived
Example 6
Project: petastorm Author: uber File: unischema.py License: Apache License 2.0

def as_spark_schema(self):
    """Returns an object derived from the unischema as spark schema.

    Example:

    >>> spark.createDataFrame(dataset_rows,
    >>>                       SomeSchema.as_spark_schema())
    """
    # Lazy loading pyspark to avoid creating pyspark dependency on data reading code path
    # (currently works only with make_batch_reader)
    import pyspark.sql.types as sql_types

    schema_entries = []
    for field in self._fields.values():
        spark_type = _field_spark_dtype(field)
        schema_entries.append(sql_types.StructField(field.name, spark_type, field.nullable))
    return sql_types.StructType(schema_entries)
Example 7
Project: Hanhan-Spark-Python Author: hanhanwu File: reddit_average_sql.py License: MIT License

def main():
    schema = StructType([
        StructField('subreddit', StringType(), False),
        StructField('score', IntegerType(), False),
    ])
    inputs = sqlContext.read.json(inputs1, schema=schema)

    # Uncomment this when the schema is not added
    # inputs = sqlContext.read.json(inputs1)

    # Uncomment these when there are 2 input dirs
    # comments_input1 = sqlContext.read.json(inputs1, schema=schema)
    # comments_input2 = sqlContext.read.json(inputs2, schema=schema)
    # inputs = comments_input1.unionAll(comments_input2)

    df = get_avg(inputs)
    df.write.save(output, format='json', mode='overwrite')
Example 8
Project: HoloClean-Legacy-deprecated Author: HoloClean File: accuracy.py License: Apache License 2.0

def read_groundtruth(self):
    """
    Create a dataframe from the ground truth csv file

    Takes as argument the full path name of the csv file
    and the spark_session
    """
    filereader = Reader(self.spark_session)

    groundtruth_schema = StructType([
        StructField("tid", IntegerType(), False),
        StructField("attr_name", StringType(), False),
        StructField("attr_val", StringType(), False)])

    self.ground_truth_flat = filereader.read(self.path_to_grand_truth, 0, groundtruth_schema).\
        drop(GlobalVariables.index_name)

    self.dataengine.add_db_table(
        'Groundtruth', self.ground_truth_flat, self.dataset)
Example 9
Project: mlflow Author: mlflow File: utils.py License: Apache License 2.0

def format_to_file_path(spark_session):
    rows = [
        Row(8, 32, "bat"),
        Row(64, 40, "mouse"),
        Row(-27, 55, "horse")
    ]
    schema = StructType([
        StructField("number2", IntegerType()),
        StructField("number1", IntegerType()),
        StructField("word", StringType())
    ])
    rdd = spark_session.sparkContext.parallelize(rows)
    df = spark_session.createDataFrame(rdd, schema)
    res = {}
    tempdir = tempfile.mkdtemp()
    for data_format in ["csv", "parquet", "json"]:
        res[data_format] = os.path.join(tempdir, "test-data-%s" % data_format)

    for data_format, file_path in res.items():
        df.write.option("header", "true").format(data_format).save(file_path)
    yield res
    shutil.rmtree(tempdir)
Example 10
Project: incubator-spot Author: apache File: streaming.py License: Apache License 2.0

def dstream(self):
    '''
    Return the underlying DStream, with each record passed through the analyzer.
    '''
    return self.__dstream\
        .map(lambda x: x[1])\
        .flatMap(lambda x: x)\
        .map(lambda x: _analyzer(x))
Example 11
Project: search-MjoLniR Author: wikimedia File: es_hits.py License: MIT License

def transform(df, url_list=None, brokers=None, **kwargs):
    if brokers and url_list:
        raise ValueError('cannot specify brokers and url_list')
    if brokers:
        rdd = transform_from_kafka(df, brokers, **kwargs)
    else:
        rdd = transform_from_elasticsearch(df, url_list, **kwargs)
    return df.sql_ctx.createDataFrame(rdd, T.StructType([
        df.schema['wikiid'],
        df.schema['query'],
        df.schema['norm_query'],
        T.StructField('hit_page_ids', T.ArrayType(T.IntegerType()), nullable=False),
    ]))
Example 12
Project: search-MjoLniR Author: wikimedia File: transform.py License: MIT License

def _verify_schema_equality(expect: T.StructType, have: T.StructType) -> List[str]:
    """Verify the dataframe and table have equal schemas"""
    def resolve(schema, field_name) -> Optional[Tuple]:
        try:
            field = schema[field_name]
        except KeyError:
            return None
        return _simplify_data_type(field.dataType)

    errors = []
    for field_name in set(expect.names).union(have.names):
        expect_type = resolve(expect, field_name)
        if expect_type is None:
            errors.append('Extra field in provided schema: {}'.format(field_name))
            continue

        have_type = resolve(have, field_name)
        if have_type is None:
            errors.append('Missing field in provided schema: {}'.format(field_name))
            continue

        if expect_type != have_type:
            fmt = 'Column {} of type {} does not match expected {}'
            errors.append(fmt.format(field_name, have_type, expect_type))
            continue
        # TODO: Test nullability? But hive doesn't track nullability, everything is nullable.
    return errors
Example 13
Project: search-MjoLniR Author: wikimedia File: transform.py License: MIT License

def read_partition(
    spark: SparkSession,
    table: str,
    partition_spec: Mapping[str, str],
    schema: Optional[T.StructType] = None,
    direct_parquet_read: bool = False
) -> DataFrame:
    """Read a single partition from a hive table.

    Verifies the partition specification describes a complete partition,
    that the partition exists, and optionally that the table is compatible
    with an expected schema. The partition could still be empty.
    """
    # We don't need to do anything with the result, our goal is to
    # trigger AnalysisException when the arguments are invalid.
    spark.sql(_describe_partition_ql(table, partition_spec)).collect()

    partition_cond = F.lit(True)
    for k, v in partition_spec.items():
        partition_cond &= F.col(k) == v
    df = spark.read.table(table).where(partition_cond)
    # The df we have now has types defined by the hive table, but this downgrades
    # non-standard types like VectorUDT() to its sql equivalent. Use the first
    # df to find the files, then read them directly.
    if direct_parquet_read:
        input_files = list(df._jdf.inputFiles())  # type: ignore
        input_dirs = set(os.path.dirname(path) for path in input_files)
        if len(input_dirs) != 1:
            raise Exception('Expected single directory containing partition data: [{}]'.format(
                '],['.join(input_files)))
        df = spark.read.parquet(list(input_dirs)[0])
    if schema is not None:
        # TODO: This only allows extra top level columns, anything
        # nested must be exactly the same. Fine for now.
        _verify_schema_compatability(schema, df.schema)
        df = df.select(*(field.name for field in schema))
    # Drop partitioning columns. These are not part of the mjolnir transformations, and
    # are only an implementation detail of putting them on disk and tracking history.
    return df.drop(*partition_spec.keys())
Example 14
Project: search-MjoLniR Author: wikimedia File: transform.py License: MIT License

def typed_transformer(
    schema_in: Optional[T.StructType] = None,
    schema_out: Optional[T.StructType] = None,
    context: Optional[str] = None
) -> Callable[[Callable[..., Transformer]], Callable[..., Transformer]]:
    """Decorates a transformer factory with schema validation

    An idiom in transform is calling a function to return a Transform. This
    decorator can be applied to those factory functions to return transformers
    that apply runtime schema validation.
    """
    def decorate(fn: Callable[..., Transformer]) -> Callable[..., Transformer]:
        def error_context(kind: str) -> str:
            return 'While checking {} {}:'.format(fn.__name__ if context is None else context, kind)

        @functools.wraps(fn)
        def factory(*args, **kwargs) -> Transformer:
            transformer = fn(*args, **kwargs)

            @functools.wraps(transformer)
            def transform(df_in: DataFrame) -> DataFrame:
                if schema_in is not None:
                    check_schema(df_in, schema_in, error_context('schema_in'))
                    df_in = df_in.select(*schema_in.names)
                df_out = transformer(df_in)
                if schema_out is not None:
                    check_schema(df_out, schema_out, error_context('schema_out'))
                    df_out = df_out.select(*schema_out.names)
                return df_out
            return transform
        return factory
    return decorate


# Shared schemas between the primary mjolnir transformations. Transformations
# may require a schema with slightly more columns than they require to keep
# the total number of schemas low.
Example 15
Project: search-MjoLniR Author: wikimedia File: test_transform.py License: MIT License

def test_schema_comparison(expect: T.StructType, have: T.StructType, compatible: bool, equal: bool) -> None:
    if equal and not compatible:
        raise Exception('Invalid constraint, can not be equal but not compatible')
    # functions return a list of errors; not bool(errors) is true when everything is ok
    assert compatible is not bool(mt._verify_schema_compatability(expect, have))
    assert equal is not bool(mt._verify_schema_equality(expect, have))
Example 16
Project: python_moztelemetry Author: mozilla File: test_dataset.py License: Mozilla Public License 2.0

def test_dataframe_with_schema(dataset, spark):
    schema = StructType([StructField("foo", IntegerType(), True)])
    df = dataset.dataframe(spark, decode=decode, schema=schema, table_name='bar')

    assert type(df) == DataFrame
    assert df.columns == ['foo']
    assert df.orderBy(["foo"]).collect() == [Row(foo=1), Row(foo=2)]
Example 17
Project: python_moztelemetry Author: mozilla File: test_dataset.py License: Mozilla Public License 2.0

def test_dataframe_bad_schema(dataset, spark):
    spark.catalog.dropTempView('bar')
    schema = StructType([StructField("name", StringType(), True)])
    df = dataset.dataframe(spark, decode=decode, schema=schema, table_name='bar')

    assert type(df) == DataFrame
    assert df.collect() == [Row(name=None), Row(name=None)]
Example 18
Project: ibis Author: ibis-project File: udf.py License: Apache License 2.0

def validate_func_and_types(self, func):
    if isinstance(self.spark_output_type, (pt.MapType, pt.StructType)):
        raise com.IbisTypeError(
            'Spark does not support MapType or StructType output for \
            Pandas UDFs'
        )
    if not self.input_type:
        raise com.UnsupportedArgumentError(
            'Spark does not support 0-arg pandas UDFs. Instead, create \
            a 1-arg pandas UDF and ignore the arg in your function'
        )
    super().validate_func_and_types(func)
Example 19
Project: ibis Author: ibis-project File: datatypes.py License: Apache License 2.0

def ibis_struct_dtype_to_spark_dtype(ibis_dtype_obj):
    fields = [
        pt.StructField(n, spark_dtype(t), t.nullable)
        for n, t in zip(ibis_dtype_obj.names, ibis_dtype_obj.types)
    ]
    return pt.StructType(fields)
Example 20
Project: LearningApacheSpark Author: runawayhorse001 File: base.py License: MIT License

def transformSchema(self, schema):
    inputType = schema[self.getInputCol()].dataType
    self.validateInputType(inputType)
    if self.getOutputCol() in schema.names:
        raise ValueError("Output column %s already exists." % self.getOutputCol())
    outputFields = copy.copy(schema.fields)
    outputFields.append(StructField(self.getOutputCol(),
                                    self.outputDataType(),
                                    nullable=False))
    return StructType(outputFields)
Example 21
Project: LearningApacheSpark Author: runawayhorse001 File: evaluation.py License: MIT License

def __init__(self, scoreAndLabels):
    sc = scoreAndLabels.ctx
    sql_ctx = SQLContext.getOrCreate(sc)
    df = sql_ctx.createDataFrame(scoreAndLabels, schema=StructType([
        StructField("score", DoubleType(), nullable=False),
        StructField("label", DoubleType(), nullable=False)]))
    java_class = sc._jvm.org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
    java_model = java_class(df._jdf)
    super(BinaryClassificationMetrics, self).__init__(java_model)
Example 22
Project: LearningApacheSpark Author: runawayhorse001 File: evaluation.py License: MIT License

def __init__(self, predictionAndObservations):
    sc = predictionAndObservations.ctx
    sql_ctx = SQLContext.getOrCreate(sc)
    df = sql_ctx.createDataFrame(predictionAndObservations, schema=StructType([
        StructField("prediction", DoubleType(), nullable=False),
        StructField("observation", DoubleType(), nullable=False)]))
    java_class = sc._jvm.org.apache.spark.mllib.evaluation.RegressionMetrics
    java_model = java_class(df._jdf)
    super(RegressionMetrics, self).__init__(java_model)
Example 23
Project: LearningApacheSpark Author: runawayhorse001 File: evaluation.py License: MIT License

def __init__(self, predictionAndLabels):
    sc = predictionAndLabels.ctx
    sql_ctx = SQLContext.getOrCreate(sc)
    df = sql_ctx.createDataFrame(predictionAndLabels, schema=StructType([
        StructField("prediction", DoubleType(), nullable=False),
        StructField("label", DoubleType(), nullable=False)]))
    java_class = sc._jvm.org.apache.spark.mllib.evaluation.MulticlassMetrics
    java_model = java_class(df._jdf)
    super(MulticlassMetrics, self).__init__(java_model)
Example 24
Project: LearningApacheSpark Author: runawayhorse001 File: catalog.py License: MIT License

def createTable(self, tableName, path=None, source=None, schema=None, **options):
    """Creates a table based on the dataset in a data source.

    It returns the DataFrame associated with the table.

    The data source is specified by the ``source`` and a set of ``options``.
    If ``source`` is not specified, the default data source configured by
    ``spark.sql.sources.default`` will be used. When ``path`` is specified, an external table is
    created from the data at the given path. Otherwise a managed table is created.

    Optionally, a schema can be provided as the schema of the returned :class:`DataFrame` and
    created table.

    :return: :class:`DataFrame`
    """
    if path is not None:
        options["path"] = path
    if source is None:
        source = self._sparkSession._wrapped._conf.defaultDataSourceName()
    if schema is None:
        df = self._jcatalog.createTable(tableName, source, options)
    else:
        if not isinstance(schema, StructType):
            raise TypeError("schema should be StructType")
        scala_datatype = self._jsparkSession.parseDataType(schema.json())
        df = self._jcatalog.createTable(tableName, source, scala_datatype, options)
    return DataFrame(df, self._sparkSession._wrapped)
Example 25
Project: SMV Author: TresAmigosSD File: smvschema.py License: Apache License 2.0

def _toStructType(self):
    """return equivalent Spark schema (StructType) from this smv schema"""
    # ss is the raw scala spark schema (Scala StructType). This has no
    # iterator defined on the python side, so we use old school for loop.
    ss = self.j_smv_schema.toStructType()
    spark_schema = sql_types.StructType()
    for i in range(ss.length()):
        # use "apply" to get the nth StructField item in StructType
        ft = self._scala_to_python_field_type(ss.apply(i))
        spark_schema = spark_schema.add(ft)
    return spark_schema
Example 26
Project: dagster Author: dagster-io File: repo.py License: Apache License 2.0

def make_people(context) -> DataFrame:
    schema = StructType([StructField('name', StringType()), StructField('age', IntegerType())])
    rows = [Row(name='Thom', age=51), Row(name='Jonny', age=48), Row(name='Nigel', age=49)]
    return context.resources.pyspark.spark_session.createDataFrame(rows, schema)
Example 27
Project: dagster Author: dagster-io File: repo.py License: Apache License 2.0

def make_people(context) -> DataFrame:
    schema = StructType([StructField('name', StringType()), StructField('age', IntegerType())])
    rows = [Row(name='Thom', age=51), Row(name='Jonny', age=48), Row(name='Nigel', age=49)]
    return context.resources.pyspark.spark_session.createDataFrame(rows, schema)
Example 28
Project: dagster Author: dagster-io File: test_pyspark.py License: Apache License 2.0

def make_df_solid(context):
    schema = StructType([StructField('name', StringType()), StructField('age', IntegerType())])
    rows = [Row(name='John', age=19), Row(name='Jennifer', age=29), Row(name='Henry', age=50)]
    return context.resources.pyspark.spark_session.createDataFrame(rows, schema)
Example 29
Project: dagster Author: dagster-io File: test_pyspark.py License: Apache License 2.0

def make_df_solid(context):
    schema = StructType([StructField('name', StringType()), StructField('age', IntegerType())])
    rows = [Row(name='John', age=19), Row(name='Jennifer', age=29), Row(name='Henry', age=50)]
    return context.resources.pyspark.spark_session.createDataFrame(rows, schema)
Example 30
Project: pipelines Author: kubeflow File: transform_run.py License: Apache License 2.0

def load_schema(analysis_path):
    type_map = {
        'KEY': StringType(),
        'NUMBER': DoubleType(),
        'CATEGORY': StringType(),
        'TEXT': StringType(),
        'IMAGE_URL': StringType()
    }

    schema_file = os.path.join(analysis_path, 'schema.json')
    schema_json = json.loads(file_io.read_file_to_string(schema_file))
    fields = [StructField(x['name'], type_map[x['type']]) for x in schema_json]
    return schema_json, StructType(fields)