Python pyspark.sql.types.StructType() Examples

The following are 27 code examples of pyspark.sql.types.StructType(), taken from open-source projects. The original project and source file are noted above each example. You may also want to check out all available functions and classes of the module pyspark.sql.types.
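Before the project-specific examples, here is a minimal, self-contained sketch of the pattern they all share: building a StructType from StructFields and passing it to createDataFrame. The app name, column names, and sample rows below are illustrative only.

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

spark = SparkSession.builder.appName("structtype-demo").getOrCreate()

# A StructType is an ordered list of StructField(name, dataType, nullable)
schema = StructType([
    StructField("name", StringType(), False),
    StructField("age", IntegerType(), True),
])

# Supplying the schema explicitly avoids type inference over the rows
df = spark.createDataFrame([("Alice", 34), ("Bob", None)], schema=schema)
df.printSchema()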
Example #1
Source File: temp_range_sql.py    From Hanhan-Spark-Python with MIT License
def main():
    temp_schema = StructType([
        StructField('StationID', StringType(), False),
        StructField('DateTime', StringType(), False),
        StructField('Observation', StringType(), False),
        StructField('DataValue', DoubleType(), False),
        StructField('MFlag', StringType(), True),
        StructField('QFlag', StringType(), True),
        StructField('SFlag', StringType(), True),
        StructField('OBSTime', StringType(), True),
    ])

    df = sqlContext.read.format('com.databricks.spark.csv').options(header='false').load(inputs1, schema=temp_schema)
    df = df.filter(df.QFlag == '')

    dfrange = get_range(df)
    result = dfrange.rdd.map(lambda r: str(r.DateTime)+' '+str(r.StationID)+' '+str(r.MaxRange))
    outdata = result.sortBy(lambda r: r[0]).coalesce(1)
    outdata.saveAsTextFile(output) 
Example #2
Source File: imageIO.py    From spark-deep-learning with Apache License 2.0
def filesToDF(sc, path, numPartitions=None):
    """
    Read files from a directory to a DataFrame.

    :param sc: SparkContext.
    :param path: str, path to files.
    :param numPartitions: int, number of partitions to use for reading files.
    :return: DataFrame, with columns: (filePath: str, fileData: BinaryType)
    """
    numPartitions = numPartitions or sc.defaultParallelism
    schema = StructType([StructField("filePath", StringType(), False),
                         StructField("fileData", BinaryType(), False)])
    rdd = sc.binaryFiles(
        path, minPartitions=numPartitions).repartition(numPartitions)
    rdd = rdd.map(lambda x: (x[0], bytearray(x[1])))
    return rdd.toDF(schema) 
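A brief, hedged usage sketch for filesToDF above, assuming the function is importable; the directory path and partition count are hypothetical, and the SparkContext comes from a local SparkSession.

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
# "/tmp/images" stands in for any directory of binary files
image_df = filesToDF(spark.sparkContext, "/tmp/images", numPartitions=8)
image_df.printSchema()  # filePath: string, fileData: binary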
Example #3
Source File: transform.py    From search-MjoLniR with MIT License
def _simplify_data_type(data_type: T.DataType) -> Tuple:
    """Simplify datatype into a tuple of equality information we care about

    Most notably this ignores nullability concerns due to hive not
    being able to represent not null in its schemas.
    """
    try:
        # Normalize UDT into its sql form. Allows comparison of schemas
        # from hive and spark.
        sql_type = data_type.sqlType()  # type: ignore
    except AttributeError:
        sql_type = data_type

    if isinstance(sql_type, T.StructType):
        return ('StructType', [(field.name, _simplify_data_type(field.dataType)) for field in sql_type])
    elif isinstance(sql_type, T.ArrayType):
        return ('ArrayType', _simplify_data_type(sql_type.elementType))
    else:
        return (type(sql_type).__name__,) 
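For intuition, a small hedged example of what the simplification produces for a nested schema, assuming _simplify_data_type is available in scope as defined above:

import pyspark.sql.types as T

schema = T.StructType([
    T.StructField("id", T.LongType(), False),
    T.StructField("tags", T.ArrayType(T.StringType())),
])
print(_simplify_data_type(schema))
# ('StructType', [('id', ('LongType',)), ('tags', ('ArrayType', ('StringType',)))])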
Example #4
Source File: transform.py    From search-MjoLniR with MIT License
def _verify_schema_compatability(expect: T.StructType, have: T.StructType) -> List[str]:
    """Verify all expected fields and types are present

    Allows additional columns in the `have` schema. Additionally
    allows relaxing nullability """
    errors = []
    for expect_field in expect:
        try:
            have_field = have[expect_field.name]
        except KeyError:
            errors.append('Field {} missing. Have: {}'.format(expect_field.name, ','.join(have.names)))
            continue
        expect_type = _simplify_data_type(expect_field.dataType)
        have_type = _simplify_data_type(have_field.dataType)
        if expect_type != have_type:
            errors.append('Field {} has incompatible data types: expect {} != have {}'.format(
                          expect_field.name, expect_type, have_type))
    return errors 
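A hedged sketch of how the compatibility check above behaves, assuming both helpers are in scope: extra columns and stricter nullability on the `have` side pass, while a missing field is reported.

import pyspark.sql.types as T

expect = T.StructType([
    T.StructField("id", T.LongType()),
    T.StructField("label", T.DoubleType()),
])
have = T.StructType([
    T.StructField("id", T.LongType(), False),    # stricter nullability is allowed
    T.StructField("label", T.DoubleType()),
    T.StructField("extra", T.StringType()),      # additional columns are allowed
])
print(_verify_schema_compatability(expect, have))  # []
print(_verify_schema_compatability(have, expect))  # ['Field extra missing. Have: id,label']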
Example #5
Source File: utils.py    From mlflow with Apache License 2.0
def format_to_file_path(spark_session):
    rows = [
        Row(8, 32, "bat"),
        Row(64, 40, "mouse"),
        Row(-27, 55, "horse")
    ]
    schema = StructType([
        StructField("number2", IntegerType()),
        StructField("number1", IntegerType()),
        StructField("word", StringType())
    ])
    rdd = spark_session.sparkContext.parallelize(rows)
    df = spark_session.createDataFrame(rdd, schema)
    res = {}
    tempdir = tempfile.mkdtemp()
    for data_format in ["csv", "parquet", "json"]:
        res[data_format] = os.path.join(tempdir, "test-data-%s" % data_format)

    for data_format, file_path in res.items():
        df.write.option("header", "true").format(data_format).save(file_path)
    yield res
    shutil.rmtree(tempdir) 
Example #6
Source File: reddit_average_sql.py    From Hanhan-Spark-Python with MIT License
def main():
    schema = StructType([
        StructField('subreddit', StringType(), False),
        StructField('score', IntegerType(), False),
    ])
    inputs = sqlContext.read.json(inputs1, schema=schema)

    # Uncomment this when the schema is not added
    # inputs = sqlContext.read.json(inputs1)

    # Uncomment these when there are 2 input dirs
    # comments_input1 = sqlContext.read.json(inputs1, schema=schema)
    # comments_input2 = sqlContext.read.json(inputs2, schema=schema)
    # inputs = comments_input1.unionAll(comments_input2)

    df = get_avg(inputs)
    df.write.save(output, format='json', mode='overwrite') 
Example #7
Source File: transform.py    From search-MjoLniR with MIT License
def _merge_schemas(*schemas: T.StructType):
    """Merge one or more spark schemas into a new schema"""
    fields = cast(Dict[str, T.StructField], {})
    errors = []
    for schema in schemas:
        for field in schema:
            if field.name not in fields:
                fields[field.name] = field
            elif field != fields[field.name]:
                errors.append('Incompatible fields: {} != {}'.format(field, fields[field.name]))
    if errors:
        raise Exception('\n'.join(errors))
    return T.StructType(list(fields.values()))


# Primary input schema from which most everything else is derived 
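A hedged sketch of merging two overlapping schemas with _merge_schemas above, assuming the helper is available in scope; identical fields are deduplicated, while conflicting definitions for the same name would raise.

import pyspark.sql.types as T

a = T.StructType([T.StructField("query", T.StringType()), T.StructField("hits", T.IntegerType())])
b = T.StructType([T.StructField("query", T.StringType()), T.StructField("clicks", T.IntegerType())])
merged = _merge_schemas(a, b)
print(merged.names)  # ['query', 'hits', 'clicks']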
Example #8
Source File: unischema.py    From petastorm with Apache License 2.0
def as_spark_schema(self):
        """Returns an object derived from the unischema as spark schema.

        Example:

        >>> spark.createDataFrame(dataset_rows,
        >>>                       SomeSchema.as_spark_schema())
        """
        # Lazy loading pyspark to avoid creating pyspark dependency on data reading code path
        # (currently works only with make_batch_reader)
        import pyspark.sql.types as sql_types

        schema_entries = []
        for field in self._fields.values():
            spark_type = _field_spark_dtype(field)
            schema_entries.append(sql_types.StructField(field.name, spark_type, field.nullable))

        return sql_types.StructType(schema_entries) 
Example #9
Source File: accuracy.py    From HoloClean-Legacy-deprecated with Apache License 2.0
def read_groundtruth(self):

        """
        Create a dataframe from the ground truth csv file

        Uses the full path to the csv file and the spark_session
        stored on this object.
        """
        filereader = Reader(self.spark_session)

        groundtruth_schema = StructType([
            StructField("tid", IntegerType(), False),
            StructField("attr_name", StringType(), False),
            StructField("attr_val", StringType(), False)])

        self.ground_truth_flat = filereader.read(self.path_to_grand_truth, 0,
                                                 groundtruth_schema).\
            drop(GlobalVariables.index_name)

        self.dataengine.add_db_table(
            'Groundtruth', self.ground_truth_flat, self.dataset) 
Example #10
Source File: udf.py    From ibis with Apache License 2.0
def validate_func_and_types(self, func):
        if isinstance(self.spark_output_type, (pt.MapType, pt.StructType)):
            raise com.IbisTypeError(
                'Spark does not support MapType or StructType output for \
Pandas UDFs'
            )
        if not self.input_type:
            raise com.UnsupportedArgumentError(
                'Spark does not support 0-arg pandas UDFs. Instead, create \
a 1-arg pandas UDF and ignore the arg in your function'
            )
        super().validate_func_and_types(func) 
Example #11
Source File: criteo.py    From azure-python-labs with MIT License
def get_spark_schema(header=DEFAULT_HEADER):
    ## create schema
    schema = StructType()
    ## do label + ints
    n_ints = 14
    for i in range(n_ints):
        schema.add(StructField(header[i], IntegerType()))
    ## do categoricals
    for i in range(26):
        schema.add(StructField(header[i + n_ints], StringType()))
    return schema 
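A hedged usage sketch for get_spark_schema above, assuming DEFAULT_HEADER supplies the 40 Criteo-style column names and that the input files are tab-separated; the path is illustrative only.

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
schema = get_spark_schema()
# "/tmp/criteo/day_0" is a hypothetical path to the raw TSV data
df = spark.read.csv("/tmp/criteo/day_0", schema=schema, sep="\t")
df.select(schema.names[0]).show(5)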
Example #12
Source File: base.py    From example_dataproc_twitter with MIT License
def load_neighbor_schema(self):
        """Loads neighborhood schema for similarity matrix

        :rtype: `pyspark.sql.types.StructType`
        :returns: schema of type ["key", [("key", "value")]]
        """
        return stypes.StructType(fields=[
            stypes.StructField("item", stypes.StringType()),
            stypes.StructField("similarity_items", stypes.ArrayType(
                stypes.StructType(fields=[
                    stypes.StructField("item", stypes.StringType()),
                    stypes.StructField("similarity", stypes.FloatType())])))])
Example #13
Source File: base.py    From example_dataproc_twitter with MIT License
def load_users_schema():
        """Loads schema with data type [user, [(sku, score), (sku, score)]]

        :rtype: `pyspark.sql.types.StructType`
        :returns: schema specification for user -> (sku, score) data.
        """
        return stypes.StructType(fields=[
            stypes.StructField("user", stypes.StringType()),
            stypes.StructField('interactions', stypes.ArrayType(
                stypes.StructType(fields=[
                    stypes.StructField('item', stypes.StringType()),
                    stypes.StructField('score', stypes.FloatType())])))])
Example #14
Source File: __init__.py    From listenbrainz-server with GNU General Public License v2.0
def upload_test_playcounts(cls):
        schema = StructType(
            [
                StructField("user_id", IntegerType()),
                StructField("recording_id", IntegerType()),
                StructField("count", IntegerType())
            ]
        )
        test_playcounts = []
        for i in range(1, PLAYCOUNTS_COUNT // 2 + 1):
            test_playcounts.append([1, 1, 1])
        for i in range(PLAYCOUNTS_COUNT // 2 + 1, PLAYCOUNTS_COUNT + 1):
            test_playcounts.append([2, 2, 1])
        test_playcounts_df = listenbrainz_spark.session.createDataFrame(test_playcounts, schema=schema)
        utils.save_parquet(test_playcounts_df, TEST_PLAYCOUNTS_PATH) 
Example #15
Source File: typehints.py    From koalas with Apache License 2.0
def __init__(self, tpe, names=None):
        if names is None:
            # Default names `c0, c1, ... cn`.
            self.tpe = types.StructType(
                [types.StructField("c%s" % i, tpe[i]) for i in range(len(tpe))]
            )  # type: types.StructType
        else:
            self.tpe = types.StructType(
                [types.StructField(n, t) for n, t in zip(names, tpe)]
            )  # type: types.StructType 
Example #16
Source File: test_upload.py    From listenbrainz-server with GNU General Public License v2.0
def test_process_json_listens(self, mock_save, mock_read):
        fakeschema = StructType([StructField('xxxxx', StringType(), nullable=True)])
        ListenbrainzDataUploader().process_json_listens('/2020/1.json', '/fakedir', 'fakehdfspath', fakeschema)
        mock_read.assert_called_once_with('fakehdfspath', schema=fakeschema)
        mock_save.assert_called_once_with(mock_read.return_value, '/fakedir/2020/1.parquet') 
Example #17
Source File: test_upload.py    From listenbrainz-server with GNU General Public License v2.0
def test_process_json(self, mock_save, mock_read):
        fakeschema = StructType([StructField('xxxxx', StringType(), nullable=True)])
        ListenbrainzDataUploader().process_json('fakename', '/fakedestpath', '/fakehdfspath', fakeschema)
        mock_read.assert_called_once_with('/fakehdfspath', schema=fakeschema)
        mock_save.assert_called_once_with(mock_read.return_value, '/fakedestpath') 
Example #18
Source File: test_pyspark.py    From dagster with Apache License 2.0
def make_df_solid(context):
    schema = StructType([StructField('name', StringType()), StructField('age', IntegerType())])
    rows = [Row(name='John', age=19), Row(name='Jennifer', age=29), Row(name='Henry', age=50)]
    return context.resources.pyspark.spark_session.createDataFrame(rows, schema) 
Example #19
Source File: test_base.py    From example_dataproc_twitter with MIT License
def test_load_users_schema(self):
        klass = self.get_target_klass()()
        expected = stypes.StructType(fields=[
            stypes.StructField("user", stypes.StringType()),
            stypes.StructField('interactions', stypes.ArrayType(
                stypes.StructType(fields=[
                    stypes.StructField('item', stypes.StringType()),
                    stypes.StructField('score', stypes.FloatType())])))])
        result = klass.load_users_schema()
        self.assertEqual(result, expected) 
Example #20
Source File: reader.py    From HoloClean-Legacy-deprecated with Apache License 2.0
def read(self, file_path, spark_session, indexcol=0, schema=None):
        """
        Creates a dataframe from the csv file

        :param indexcol: if 1, create a tuple id column as auto increment
        :param schema: optional schema of file if known
        :param spark_session: The spark_session we created in Holoclean object
        :param file_path: The path to the file

        :return: dataframe
        """
        if schema is None:
            df = spark_session.read.csv(file_path, header=True)
        else:
            df = spark_session.read.csv(file_path, header=True, schema=schema)

        if indexcol == 0:
            return df

        index_name = GlobalVariables.index_name

        new_cols = df.schema.names + [index_name]
        list_schema = []
        for index_attribute in range(len(df.schema.names)):
            list_schema.append(StructField("_" + str(index_attribute),
                                           df.schema[
                                               index_attribute].dataType,
                                           True))
        list_schema.append(
            StructField("_" + str(len(new_cols)), LongType(), True))

        schema = StructType(list_schema)
        ix_df = df.rdd.zipWithIndex().map(
            lambda row_ix: row_ix[0] + (row_ix[1] + 1,)).toDF(schema)
        tmp_cols = ix_df.schema.names
        # Note: in Python 3, `reduce` must be imported from functools
        new_df = reduce(lambda data, idx: data.withColumnRenamed(tmp_cols[idx],
                        new_cols[idx]),
                        range(len(tmp_cols)), ix_df)
        new_df = self.checking_string_size(new_df)
        return new_df 
Example #21
Source File: sampler.py    From python_mozetl with MIT License
def transform(landfill, n_documents=1000):
    meta_schema = StructType(
        [StructField(k, StringType(), True) for k in META_WHITELIST]
    )

    schema = StructType(
        [
            StructField("namespace", StringType(), False),
            StructField("doc_type", StringType(), False),
            StructField("doc_version", StringType(), True),
            StructField("doc_id", StringType(), True),
            StructField("meta", meta_schema, False),
            StructField("content", StringType(), False),
        ]
    )

    documents = (
        landfill.map(_process)
        .filter(lambda x: x[0] and x[1] and x[-2] and x[-1])
        .toDF(schema)
    )

    window_spec = Window.partitionBy("namespace", "doc_type", "doc_version").orderBy(
        "doc_id"
    )

    df = (
        documents.fillna("0", "doc_version")
        .withColumn("row_id", row_number().over(window_spec))
        .where(col("row_id") <= n_documents)
        .drop("row_id")
    )

    return df 
Example #22
Source File: transform.py    From python_mozetl with MIT License
def toStructType(self):
        return StructType(
            [StructField(col.name, col.struct_type, True) for col in self.columns]
        ) 
Example #23
Source File: sample_scaffolds.py    From reinvent-scaffold-decorator with MIT License
def _initialize_results(self, scaffolds):
        data = [ps.Row(smiles=scaffold, scaffold=scaffold,
                       decorations={}, count=1) for scaffold in scaffolds]
        data_schema = pst.StructType([
            pst.StructField("smiles", pst.StringType()),
            pst.StructField("scaffold", pst.StringType()),
            pst.StructField("decorations", pst.MapType(pst.IntegerType(), pst.StringType())),
            pst.StructField("count", pst.IntegerType())
        ])
        return SPARK.createDataFrame(data, schema=data_schema) 
Example #24
Source File: schema.py    From mlflow with Apache License 2.0
def as_spark_schema(self):
        """Convert to Spark schema. If this schema is a single unnamed column, it is converted
        directly to the corresponding spark data type; otherwise it is returned as a struct (missing
        column names are filled with an integer sequence).
        """
        if len(self.columns) == 1 and self.columns[0].name is None:
            return self.columns[0].type.to_spark()
        from pyspark.sql.types import StructType, StructField
        return StructType([StructField(name=col.name or str(i),
                                       dataType=col.type.to_spark())
                           for i, col in enumerate(self.columns)]) 
Example #25
Source File: test_schema.py    From mlflow with Apache License 2.0
def test_spark_schema_inference(pandas_df_with_all_types):
    import pyspark
    from pyspark.sql.types import _parse_datatype_string, StructField, StructType
    schema = _infer_schema(pandas_df_with_all_types)
    assert schema == Schema([ColSpec(x, x) for x in pandas_df_with_all_types.columns])
    spark_session = pyspark.sql.SparkSession(pyspark.SparkContext.getOrCreate())
    spark_schema = StructType(
        [StructField(t.name, _parse_datatype_string(t.name), True)
         for t in schema.column_types()])
    sparkdf = spark_session.createDataFrame(pandas_df_with_all_types, schema=spark_schema)
    schema = _infer_schema(sparkdf)
    assert schema == Schema([ColSpec(x, x) for x in pandas_df_with_all_types.columns]) 
Example #26
Source File: test_schema.py    From mlflow with Apache License 2.0
def test_spark_type_mapping(pandas_df_with_all_types):
    import pyspark
    from pyspark.sql.types import BooleanType, IntegerType, LongType, FloatType, DoubleType, \
        StringType, BinaryType
    from pyspark.sql.types import StructField, StructType

    assert isinstance(DataType.boolean.to_spark(), BooleanType)
    assert isinstance(DataType.integer.to_spark(), IntegerType)
    assert isinstance(DataType.long.to_spark(), LongType)
    assert isinstance(DataType.float.to_spark(), FloatType)
    assert isinstance(DataType.double.to_spark(), DoubleType)
    assert isinstance(DataType.string.to_spark(), StringType)
    assert isinstance(DataType.binary.to_spark(), BinaryType)
    schema = _infer_schema(pandas_df_with_all_types)
    expected_spark_schema = StructType(
        [StructField(t.name, t.to_spark(), True)
         for t in schema.column_types()])
    actual_spark_schema = schema.as_spark_schema()
    assert expected_spark_schema.jsonValue() == actual_spark_schema.jsonValue()
    spark_session = pyspark.sql.SparkSession(pyspark.SparkContext.getOrCreate())
    sparkdf = spark_session.createDataFrame(pandas_df_with_all_types,
                                            schema=actual_spark_schema)
    schema2 = _infer_schema(sparkdf)
    assert schema == schema2

    # test unnamed columns
    schema = Schema([ColSpec(col.type) for col in schema.columns])
    expected_spark_schema = StructType(
        [StructField(str(i), t.to_spark(), True)
         for i, t in enumerate(schema.column_types())])
    actual_spark_schema = schema.as_spark_schema()
    assert expected_spark_schema.jsonValue() == actual_spark_schema.jsonValue()

    # test single unnamed column is mapped to just a single spark type
    schema = Schema([ColSpec(DataType.integer)])
    spark_type = schema.as_spark_schema()
    assert isinstance(spark_type, IntegerType) 
Example #27
Source File: hostlinks_to_graph.py    From cc-pyspark with MIT License
def vertices_assign_ids(self, sc, sqlc, edges):
        source = edges.select(edges.s.alias('name'))
        target = edges.select(edges.t.alias('name'))

        ids = source.union(target) \
            .distinct()

        if self.args.validate_host_names:
            is_valid = sqlf.udf(HostLinksToGraph.reverse_host_is_valid,
                                BooleanType())
            ids = ids.filter(is_valid(ids.name))

        if self.args.vertex_partitions == 1:
            ids = ids \
                    .coalesce(1) \
                    .sort('name') \
                    .withColumn('id', sqlf.monotonically_increasing_id())
        else:
            id_rdd = ids.select(ids.name).rdd \
                        .map(lambda row: tuple(row)[0]) \
                        .sortBy(lambda x: x, True,
                                self.args.vertex_partitions) \
                        .zipWithIndex()
            id_schema = StructType([
                StructField("name", StringType(), True),
                StructField("id", LongType(), True)
            ])
            ids = sqlc.createDataFrame(id_rdd, schema=id_schema)

        if self.args.save_as_text is not None:
            ids = ids.persist()
            ids.select(sqlf.concat_ws('\t', ids.id, ids.name)) \
                .write \
                .text(os.path.join(self.args.save_as_text, "vertices"),
                      compression="gzip")
        ids.write \
           .format(self.args.output_format) \
           .option("compression", self.args.output_compression) \
           .saveAsTable(self.args.output + '_vertices')

        return ids