Python pyspark.sql.types.StructType() Examples

The following are 30 code examples showing how to use pyspark.sql.types.StructType(). They are extracted from open source projects; you can go to the original project or source file by following the links above each example.

You may also want to check out all available functions and classes of the module pyspark.sql.types.
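
All of the examples below share the same basic pattern: build a StructType from StructField entries and pass it as an explicit schema when creating or reading a DataFrame. Here is a minimal, self-contained sketch of that pattern; the application name and column definitions are illustrative only, not taken from any of the projects below.

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

spark = SparkSession.builder.appName("structtype-demo").getOrCreate()

# Each StructField is (name, data type, nullable)
schema = StructType([
    StructField("name", StringType(), nullable=False),
    StructField("age", IntegerType(), nullable=True),
])

df = spark.createDataFrame([("Alice", 34), ("Bob", None)], schema=schema)
df.printSchema()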

Example 1
Project: Hanhan-Spark-Python   Author: hanhanwu   File: temp_range_sql.py    License: MIT License
def main():
    temp_schema = StructType([
        StructField('StationID', StringType(), False),
        StructField('DateTime', StringType(), False),
        StructField('Observation', StringType(), False),
        StructField('DataValue', DoubleType(), False),
        StructField('MFlag', StringType(), True),
        StructField('QFlag', StringType(), True),
        StructField('SFlag', StringType(), True),
        StructField('OBSTime', StringType(), True),
    ])

    df = sqlContext.read.format('com.databricks.spark.csv').options(header='false').load(inputs1, schema=temp_schema)
    df = df.filter(df.QFlag == '')

    dfrange = get_range(df)
    result = dfrange.rdd.map(lambda r: str(r.DateTime)+' '+str(r.StationID)+' '+str(r.MaxRange))
    outdata = result.sortBy(lambda r: r[0]).coalesce(1)
    outdata.saveAsTextFile(output) 
Example 2
Project: spark-deep-learning   Author: databricks   File: imageIO.py    License: Apache License 2.0
def filesToDF(sc, path, numPartitions=None):
    """
    Read files from a directory to a DataFrame.

    :param sc: SparkContext.
    :param path: str, path to files.
    :param numPartitions: int, number of partitions to use for reading files.
    :return: DataFrame, with columns: (filePath: str, fileData: BinaryType)
    """
    numPartitions = numPartitions or sc.defaultParallelism
    schema = StructType([StructField("filePath", StringType(), False),
                         StructField("fileData", BinaryType(), False)])
    rdd = sc.binaryFiles(
        path, minPartitions=numPartitions).repartition(numPartitions)
    rdd = rdd.map(lambda x: (x[0], bytearray(x[1])))
    return rdd.toDF(schema) 
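
A hypothetical invocation, assuming an existing SparkContext sc and a directory of binary files; the path and partition count below are placeholders:

binary_df = filesToDF(sc, "/data/images", numPartitions=16)
binary_df.select("filePath").show(truncate=False)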
Example 3
Project: search-MjoLniR   Author: wikimedia   File: transform.py    License: MIT License
def _simplify_data_type(data_type: T.DataType) -> Tuple:
    """Simplify datatype into a tuple of equality information we care about

    Most notably this ignores nullability concerns due to hive not
    being able to represent not null in its schemas.
    """
    try:
        # Normalize UDT into its sql form. Allows comparison of schemas
        # from hive and spark.
        sql_type = data_type.sqlType()  # type: ignore
    except AttributeError:
        sql_type = data_type

    if isinstance(sql_type, T.StructType):
        return ('StructType', [(field.name, _simplify_data_type(field.dataType)) for field in sql_type])
    elif isinstance(sql_type, T.ArrayType):
        return ('ArrayType', _simplify_data_type(sql_type.elementType))
    else:
        return (type(sql_type).__name__,) 
Example 4
Project: search-MjoLniR   Author: wikimedia   File: transform.py    License: MIT License
def _verify_schema_compatability(expect: T.StructType, have: T.StructType) -> List[str]:
    """Verify all expected fields and types are present

    Allows additional columns in the `have` schema. Additionally
    allows relaxing nullability """
    errors = []
    for expect_field in expect:
        try:
            have_field = have[expect_field.name]
        except KeyError:
            errors.append('Field {} missing. Have: {}'.format(expect_field.name, ','.join(have.names)))
            continue
        expect_type = _simplify_data_type(expect_field.dataType)
        have_type = _simplify_data_type(have_field.dataType)
        if expect_type != have_type:
            errors.append('Field {} has incompatible data types: expect {} != have {}'.format(
                          expect_field.name, expect_type, have_type))
    return errors 
Example 5
Project: search-MjoLniR   Author: wikimedia   File: transform.py    License: MIT License
def _merge_schemas(*schemas: T.StructType):
    """Merge one or more spark schemas into a new schema"""
    fields = cast(Dict[str, T.StructField], {})
    errors = []
    for schema in schemas:
        for field in schema:
            if field.name not in fields:
                fields[field.name] = field
            elif field != fields[field.name]:
                errors.append('Incompatible fields: {} != {}'.format(field, fields[field.name]))
    if errors:
        raise Exception('\n'.join(errors))
    return T.StructType(list(fields.values()))


# Primary input schema from which most everything else is derived 
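
For illustration, merging two hand-built schemas with _merge_schemas might look like the following. The field names are made up; identical fields are deduplicated, while conflicting definitions for the same name raise an exception. T is assumed to be pyspark.sql.types, as imported by this module.

s1 = T.StructType([
    T.StructField('wikiid', T.StringType()),
    T.StructField('query', T.StringType()),
])
s2 = T.StructType([
    T.StructField('query', T.StringType()),
    T.StructField('hit_page_ids', T.ArrayType(T.IntegerType())),
])
merged = _merge_schemas(s1, s2)  # StructType with wikiid, query, hit_page_ids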
Example 6
Project: petastorm   Author: uber   File: unischema.py    License: Apache License 2.0
def as_spark_schema(self):
        """Returns an object derived from the unischema as spark schema.

        Example:

        >>> spark.createDataFrame(dataset_rows,
        ...                       SomeSchema.as_spark_schema())
        """
        # Lazy loading pyspark to avoid creating pyspark dependency on data reading code path
        # (currently works only with make_batch_reader)
        import pyspark.sql.types as sql_types

        schema_entries = []
        for field in self._fields.values():
            spark_type = _field_spark_dtype(field)
            schema_entries.append(sql_types.StructField(field.name, spark_type, field.nullable))

        return sql_types.StructType(schema_entries) 
Example 7
Project: Hanhan-Spark-Python   Author: hanhanwu   File: reddit_average_sql.py    License: MIT License
def main():
    schema = StructType([
        StructField('subreddit', StringType(), False),
        StructField('score', IntegerType(), False),
    ])
    inputs = sqlContext.read.json(inputs1, schema=schema)

    # Uncomment this when schema is not added
    # inputs = sqlContext.read.json(inputs1)

    # Uncomment these when there are 2 input dirs
    # comments_input1 = sqlContext.read.json(inputs1, schema=schema)
    # comments_input2 = sqlContext.read.json(inputs2, schema=schema)
    # inputs = comments_input1.unionAll(comments_input2)

    df = get_avg(inputs)
    df.write.save(output, format='json', mode='overwrite') 
Example 8
Project: HoloClean-Legacy-deprecated   Author: HoloClean   File: accuracy.py    License: Apache License 2.0
def read_groundtruth(self):

        """
        Create a dataframe from the ground truth csv file

        Takes as argument the full path name of the csv file
        and the spark_session
        """
        filereader = Reader(self.spark_session)

        groundtruth_schema = StructType([
            StructField("tid", IntegerType(), False),
            StructField("attr_name", StringType(), False),
            StructField("attr_val", StringType(), False)])

        self.ground_truth_flat = filereader.read(self.path_to_grand_truth, 0,
                                                 groundtruth_schema).\
            drop(GlobalVariables.index_name)

        self.dataengine.add_db_table(
            'Groundtruth', self.ground_truth_flat, self.dataset) 
Example 9
Project: mlflow   Author: mlflow   File: utils.py    License: Apache License 2.0
def format_to_file_path(spark_session):
    rows = [
        Row(8, 32, "bat"),
        Row(64, 40, "mouse"),
        Row(-27, 55, "horse")
    ]
    schema = StructType([
        StructField("number2", IntegerType()),
        StructField("number1", IntegerType()),
        StructField("word", StringType())
    ])
    rdd = spark_session.sparkContext.parallelize(rows)
    df = spark_session.createDataFrame(rdd, schema)
    res = {}
    tempdir = tempfile.mkdtemp()
    for data_format in ["csv", "parquet", "json"]:
        res[data_format] = os.path.join(tempdir, "test-data-%s" % data_format)

    for data_format, file_path in res.items():
        df.write.option("header", "true").format(data_format).save(file_path)
    yield res
    shutil.rmtree(tempdir) 
Example 10
Project: incubator-spot   Author: apache   File: streaming.py    License: Apache License 2.0
def dstream(self):
        '''
            Return the underlying :class:`DStream` with each record's
        value flattened and passed through the analyzer.
        '''
        return self.__dstream\
            .map(lambda x: x[1])\
            .flatMap(lambda x: x)\
            .map(lambda x: _analyzer(x)) 
Example 11
Project: search-MjoLniR   Author: wikimedia   File: es_hits.py    License: MIT License
def transform(df, url_list=None, brokers=None, **kwargs):
    if brokers and url_list:
        raise ValueError('cannot specify brokers and url_list')
    if brokers:
        rdd = transform_from_kafka(df, brokers, **kwargs)
    else:
        rdd = transform_from_elasticsearch(df, url_list, **kwargs)
    return df.sql_ctx.createDataFrame(rdd, T.StructType([
        df.schema['wikiid'],
        df.schema['query'],
        df.schema['norm_query'],
        T.StructField('hit_page_ids', T.ArrayType(T.IntegerType()), nullable=False),
    ])) 
Example 12
Project: search-MjoLniR   Author: wikimedia   File: transform.py    License: MIT License
def _verify_schema_equality(expect: T.StructType, have: T.StructType) -> List[str]:
    """Verify the dataframe and table have equal schemas"""
    def resolve(schema, field_name) -> Optional[Tuple]:
        try:
            field = schema[field_name]
        except KeyError:
            return None
        return _simplify_data_type(field.dataType)

    errors = []
    for field_name in set(expect.names).union(have.names):
        expect_type = resolve(expect, field_name)
        if expect_type is None:
            errors.append('Extra field in provided schema: {}'.format(field_name))
            continue

        have_type = resolve(have, field_name)
        if have_type is None:
            errors.append('Missing field in provided schema: {}'.format(field_name))
            continue

        if expect_type != have_type:
            fmt = 'Column {} of type {} does not match expected {}'
            errors.append(fmt.format(field_name, have_type, expect_type))
            continue
        # TODO: Test nullability? But hive doesn't track nullability, everything is nullable.
    return errors 
Example 13
Project: search-MjoLniR   Author: wikimedia   File: transform.py    License: MIT License
def read_partition(
    spark: SparkSession,
    table: str,
    partition_spec: Mapping[str, str],
    schema: Optional[T.StructType] = None,
    direct_parquet_read: bool = False
) -> DataFrame:
    """Read a single partition from a hive table.

    Verifies the partition specification describes a complete partition,
    that the partition exists, and optionally that the table is compatible
    with an expected schema. The partition could still be empty.
    """
    # We don't need to do anything with the result, our goal is to
    # trigger AnalysisException when the arguments are invalid.
    spark.sql(_describe_partition_ql(table, partition_spec)).collect()

    partition_cond = F.lit(True)
    for k, v in partition_spec.items():
        partition_cond &= F.col(k) == v
    df = spark.read.table(table).where(partition_cond)
    # The df we have now has types defined by the hive table, but this downgrades
    # non-standard types like VectorUDT() to its sql equivalent. Use the first
    # df to find the files, then read them directly.
    if direct_parquet_read:
        input_files = list(df._jdf.inputFiles())  # type: ignore
        input_dirs = set(os.path.dirname(path) for path in input_files)
        if len(input_dirs) != 1:
            raise Exception('Expected single directory containing partition data: [{}]'.format(
                '],['.join(input_files)))
        df = spark.read.parquet(list(input_dirs)[0])
    if schema is not None:
        # TODO: This only allows extra top level columns, anything
        # nested must be exactly the same. Fine for now.
        _verify_schema_compatability(schema, df.schema)
        df = df.select(*(field.name for field in schema))
    # Drop partitioning columns. These are not part of the mjolnir transformations, and
    # are only an implementation detail of putting them on disk and tracking history.
    return df.drop(*partition_spec.keys()) 
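
A hedged usage sketch; the table name, partition values, and expected schema below are placeholders rather than anything defined in the project, and T is assumed to be pyspark.sql.types:

expected = T.StructType([
    T.StructField('wikiid', T.StringType()),
    T.StructField('query', T.StringType()),
])
df = read_partition(
    spark, 'my_db.query_clicks',
    partition_spec={'year': '2020', 'month': '6', 'day': '1'},
    schema=expected)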
Example 14
Project: search-MjoLniR   Author: wikimedia   File: transform.py    License: MIT License
def typed_transformer(
    schema_in: Optional[T.StructType] = None,
    schema_out: Optional[T.StructType] = None,
    context: Optional[str] = None
) -> Callable[[Callable[..., Transformer]], Callable[..., Transformer]]:
    """Decorates a transformer factory with schema validation

    An idiom in transform is calling a function to return a Transformer. This
    decorator can be applied to those factory functions to return transformers
    that apply runtime schema validation.
    """
    def decorate(fn: Callable[..., Transformer]) -> Callable[..., Transformer]:
        def error_context(kind: str) -> str:
            return 'While checking {} {}:'.format(fn.__name__ if context is None else context, kind)

        @functools.wraps(fn)
        def factory(*args, **kwargs) -> Transformer:
            transformer = fn(*args, **kwargs)

            @functools.wraps(transformer)
            def transform(df_in: DataFrame) -> DataFrame:
                if schema_in is not None:
                    check_schema(df_in, schema_in, error_context('schema_in'))
                    df_in = df_in.select(*schema_in.names)
                df_out = transformer(df_in)
                if schema_out is not None:
                    check_schema(df_out, schema_out, error_context('schema_out'))
                    df_out = df_out.select(*schema_out.names)
                return df_out
            return transform
        return factory
    return decorate


# Shared schemas between the primary mjolnir transformations. Transformations
# may accept a schema with slightly more columns than they strictly need, to keep
# the total number of schemas low. 
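
As a sketch of how the decorator might be applied: QUERY_SCHEMA and QUERY_WITH_LENGTH_SCHEMA stand in for hypothetical StructType constants, Transformer is assumed to be the module's callable alias for DataFrame -> DataFrame, and F is pyspark.sql.functions.

@typed_transformer(schema_in=QUERY_SCHEMA, schema_out=QUERY_WITH_LENGTH_SCHEMA, context='query_length')
def query_length_transformer() -> Transformer:
    def transform(df: DataFrame) -> DataFrame:
        return df.withColumn('query_length', F.length(F.col('query')))
    return transform

df_out = query_length_transformer()(df_in)  # schema_in / schema_out are checked at runtime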
Example 15
Project: search-MjoLniR   Author: wikimedia   File: test_transform.py    License: MIT License
def test_schema_comparison(expect: T.StructType, have: T.StructType, compatible: bool, equal: bool) -> None:
    if equal and not compatible:
        raise Exception('Invalid constraint, can not be equal but not compatible')
    # the functions return a list of errors; an empty list is falsy, so not bool(...) is True when everything is ok
    assert compatible is not bool(mt._verify_schema_compatability(expect, have))
    assert equal is not bool(mt._verify_schema_equality(expect, have)) 
Example 16
Project: python_moztelemetry   Author: mozilla   File: test_dataset.py    License: Mozilla Public License 2.0
def test_dataframe_with_schema(dataset, spark):
    schema = StructType([StructField("foo", IntegerType(), True)])
    df = dataset.dataframe(spark, decode=decode, schema=schema, table_name='bar')

    assert type(df) == DataFrame
    assert df.columns == ['foo']
    assert df.orderBy(["foo"]).collect() == [Row(foo=1), Row(foo=2)] 
Example 17
Project: python_moztelemetry   Author: mozilla   File: test_dataset.py    License: Mozilla Public License 2.0
def test_dataframe_bad_schema(dataset, spark):
    spark.catalog.dropTempView('bar')
    schema = StructType([StructField("name", StringType(), True)])
    df = dataset.dataframe(spark, decode=decode, schema=schema, table_name='bar')

    assert type(df) == DataFrame
    assert df.collect() == [Row(name=None), Row(name=None)] 
Example 18
Project: ibis   Author: ibis-project   File: udf.py    License: Apache License 2.0
def validate_func_and_types(self, func):
        if isinstance(self.spark_output_type, (pt.MapType, pt.StructType)):
            raise com.IbisTypeError(
                'Spark does not support MapType or StructType output for \
Pandas UDFs'
            )
        if not self.input_type:
            raise com.UnsupportedArgumentError(
                'Spark does not support 0-arg pandas UDFs. Instead, create \
a 1-arg pandas UDF and ignore the arg in your function'
            )
        super().validate_func_and_types(func) 
Example 19
Project: ibis   Author: ibis-project   File: datatypes.py    License: Apache License 2.0
def ibis_struct_dtype_to_spark_dtype(ibis_dtype_obj):
    fields = [
        pt.StructField(n, spark_dtype(t), t.nullable)
        for n, t in zip(ibis_dtype_obj.names, ibis_dtype_obj.types)
    ]
    return pt.StructType(fields) 
Example 20
Project: LearningApacheSpark   Author: runawayhorse001   File: base.py    License: MIT License
def transformSchema(self, schema):
        inputType = schema[self.getInputCol()].dataType
        self.validateInputType(inputType)
        if self.getOutputCol() in schema.names:
            raise ValueError("Output column %s already exists." % self.getOutputCol())
        outputFields = copy.copy(schema.fields)
        outputFields.append(StructField(self.getOutputCol(),
                                        self.outputDataType(),
                                        nullable=False))
        return StructType(outputFields) 
Example 21
Project: LearningApacheSpark   Author: runawayhorse001   File: evaluation.py    License: MIT License
def __init__(self, scoreAndLabels):
        sc = scoreAndLabels.ctx
        sql_ctx = SQLContext.getOrCreate(sc)
        df = sql_ctx.createDataFrame(scoreAndLabels, schema=StructType([
            StructField("score", DoubleType(), nullable=False),
            StructField("label", DoubleType(), nullable=False)]))
        java_class = sc._jvm.org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
        java_model = java_class(df._jdf)
        super(BinaryClassificationMetrics, self).__init__(java_model) 
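
This wrapper is constructed from an RDD of (score, label) pairs. A short sketch using the public pyspark.mllib API; the numbers are made up and sc is assumed to be an existing SparkContext:

from pyspark.mllib.evaluation import BinaryClassificationMetrics

scoreAndLabels = sc.parallelize([(0.1, 0.0), (0.8, 1.0), (0.4, 0.0), (0.9, 1.0)])
metrics = BinaryClassificationMetrics(scoreAndLabels)
print(metrics.areaUnderROC, metrics.areaUnderPR)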
Example 22
Project: LearningApacheSpark   Author: runawayhorse001   File: evaluation.py    License: MIT License
def __init__(self, predictionAndObservations):
        sc = predictionAndObservations.ctx
        sql_ctx = SQLContext.getOrCreate(sc)
        df = sql_ctx.createDataFrame(predictionAndObservations, schema=StructType([
            StructField("prediction", DoubleType(), nullable=False),
            StructField("observation", DoubleType(), nullable=False)]))
        java_class = sc._jvm.org.apache.spark.mllib.evaluation.RegressionMetrics
        java_model = java_class(df._jdf)
        super(RegressionMetrics, self).__init__(java_model) 
Example 23
Project: LearningApacheSpark   Author: runawayhorse001   File: evaluation.py    License: MIT License
def __init__(self, predictionAndLabels):
        sc = predictionAndLabels.ctx
        sql_ctx = SQLContext.getOrCreate(sc)
        df = sql_ctx.createDataFrame(predictionAndLabels, schema=StructType([
            StructField("prediction", DoubleType(), nullable=False),
            StructField("label", DoubleType(), nullable=False)]))
        java_class = sc._jvm.org.apache.spark.mllib.evaluation.MulticlassMetrics
        java_model = java_class(df._jdf)
        super(MulticlassMetrics, self).__init__(java_model) 
Example 24
Project: LearningApacheSpark   Author: runawayhorse001   File: catalog.py    License: MIT License
def createTable(self, tableName, path=None, source=None, schema=None, **options):
        """Creates a table based on the dataset in a data source.

        It returns the DataFrame associated with the table.

        The data source is specified by the ``source`` and a set of ``options``.
        If ``source`` is not specified, the default data source configured by
        ``spark.sql.sources.default`` will be used. When ``path`` is specified, an external table is
        created from the data at the given path. Otherwise a managed table is created.

        Optionally, a schema can be provided as the schema of the returned :class:`DataFrame` and
        created table.

        :return: :class:`DataFrame`
        """
        if path is not None:
            options["path"] = path
        if source is None:
            source = self._sparkSession._wrapped._conf.defaultDataSourceName()
        if schema is None:
            df = self._jcatalog.createTable(tableName, source, options)
        else:
            if not isinstance(schema, StructType):
                raise TypeError("schema should be StructType")
            scala_datatype = self._jsparkSession.parseDataType(schema.json())
            df = self._jcatalog.createTable(tableName, source, scala_datatype, options)
        return DataFrame(df, self._sparkSession._wrapped) 
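
Called through a SparkSession's catalog, usage might look like the following; the table name, path, and schema are placeholders:

schema = StructType([
    StructField("id", IntegerType(), False),
    StructField("label", StringType(), True),
])
people = spark.catalog.createTable(
    "people_ext", path="/tmp/people_parquet", source="parquet", schema=schema)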
Example 25
Project: SMV   Author: TresAmigosSD   File: smvschema.py    License: Apache License 2.0
def _toStructType(self):
        """return equivalent Spark schema (StructType) from this smv schema"""
        # ss is the raw scala spark schema (Scala StructType).  This has no
        # iterator defined on the python side, so we use old school for loop.
        ss = self.j_smv_schema.toStructType()
        spark_schema = sql_types.StructType()
        for i in range(ss.length()):
            # use "apply" to get the nth StructField item in StructType
            ft = self._scala_to_python_field_type(ss.apply(i))
            spark_schema = spark_schema.add(ft)
        return spark_schema 
Example 26
Project: dagster   Author: dagster-io   File: repo.py    License: Apache License 2.0
def make_people(context) -> DataFrame:
    schema = StructType([StructField('name', StringType()), StructField('age', IntegerType())])
    rows = [Row(name='Thom', age=51), Row(name='Jonny', age=48), Row(name='Nigel', age=49)]
    return context.resources.pyspark.spark_session.createDataFrame(rows, schema) 
Example 27
Project: dagster   Author: dagster-io   File: repo.py    License: Apache License 2.0
def make_people(context) -> DataFrame:
    schema = StructType([StructField('name', StringType()), StructField('age', IntegerType())])
    rows = [Row(name='Thom', age=51), Row(name='Jonny', age=48), Row(name='Nigel', age=49)]
    return context.resources.pyspark.spark_session.createDataFrame(rows, schema) 
Example 28
Project: dagster   Author: dagster-io   File: test_pyspark.py    License: Apache License 2.0
def make_df_solid(context):
    schema = StructType([StructField('name', StringType()), StructField('age', IntegerType())])
    rows = [Row(name='John', age=19), Row(name='Jennifer', age=29), Row(name='Henry', age=50)]
    return context.resources.pyspark.spark_session.createDataFrame(rows, schema) 
Example 29
Project: dagster   Author: dagster-io   File: test_pyspark.py    License: Apache License 2.0
def make_df_solid(context):
    schema = StructType([StructField('name', StringType()), StructField('age', IntegerType())])
    rows = [Row(name='John', age=19), Row(name='Jennifer', age=29), Row(name='Henry', age=50)]
    return context.resources.pyspark.spark_session.createDataFrame(rows, schema) 
Example 30
Project: pipelines   Author: kubeflow   File: transform_run.py    License: Apache License 2.0
def load_schema(analysis_path):
  type_map = {
    'KEY': StringType(),
    'NUMBER': DoubleType(),
    'CATEGORY': StringType(),
    'TEXT': StringType(),
    'IMAGE_URL': StringType()
  }
  schema_file = os.path.join(analysis_path, 'schema.json')
  schema_json = json.loads(file_io.read_file_to_string(schema_file))
  fields = [StructField(x['name'], type_map[x['type']]) for x in schema_json]
  return schema_json, StructType(fields)
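
For illustration, with a schema.json under analysis_path containing entries such as [{"name": "id", "type": "KEY"}, {"name": "price", "type": "NUMBER"}], the returned StructType would hold a StringType id column and a DoubleType price column. A hypothetical call, with a placeholder path and an existing SparkSession spark:

schema_json, spark_schema = load_schema('/tmp/analysis_output')
df = spark.read.schema(spark_schema).csv('/tmp/analysis_output/data.csv')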