Python pyspark.sql.types.StructType() Examples

The following are code examples showing how to use pyspark.sql.types.StructType(), extracted from open source Python projects.
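Before the project examples, here is a minimal, self-contained sketch of the basic pattern: define a StructType, then pass it to createDataFrame so the column types are pinned explicitly instead of inferred. The session settings, column names, and rows are illustrative and not taken from any of the projects below.

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Illustrative local session; any existing SparkSession works the same way.
spark = SparkSession.builder.master("local[1]").appName("structtype_demo").getOrCreate()

# A StructType is an ordered collection of StructField(name, dataType, nullable) entries.
schema = StructType([
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
])

# Supplying the schema avoids type inference and allows None in nullable columns.
df = spark.createDataFrame([("Alice", 34), ("Bob", None)], schema=schema)
df.printSchema()  # prints both fields with their declared types and nullability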

Example 1
Project: monasca-transform   Author: openstack   File: transform_utils.py   (Apache License 2.0)
def _get_instance_usage_schema():
        """get instance usage schema."""

        # Initialize columns for all string fields
        columns = ["tenant_id", "user_id", "resource_uuid",
                   "geolocation", "region", "zone", "host", "project_id",
                   "aggregated_metric_name", "firstrecord_timestamp_string",
                   "lastrecord_timestamp_string",
                   "service_group", "service_id",
                   "usage_date", "usage_hour", "usage_minute",
                   "aggregation_period", "namespace", "pod_name", "app",
                   "container_name", "interface", "deployment", "daemon_set"]

        columns_struct_fields = [StructField(field_name, StringType(), True)
                                 for field_name in columns]

        # Add columns for non-string fields
        columns_struct_fields.append(StructField("firstrecord_timestamp_unix",
                                                 DoubleType(), True))
        columns_struct_fields.append(StructField("lastrecord_timestamp_unix",
                                                 DoubleType(), True))
        columns_struct_fields.append(StructField("quantity",
                                                 DoubleType(), True))
        columns_struct_fields.append(StructField("record_count",
                                                 DoubleType(), True))

        columns_struct_fields.append(StructField("processing_meta",
                                                 MapType(StringType(),
                                                         StringType(),
                                                         True),
                                                 True))
        schema = StructType(columns_struct_fields)

        return schema 
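Example 1 assembles a list of StructField objects and wraps it in one StructType. As a hedged aside, the same schema can also be grown incrementally with StructType.add(), a style used by some later examples (e.g. Example 31); the field names below simply echo Example 1 for illustration.

from pyspark.sql.types import StructType, StringType, DoubleType, MapType

# Equivalent incremental construction; each add() call returns the StructType itself.
schema = (StructType()
          .add("tenant_id", StringType(), True)
          .add("quantity", DoubleType(), True)
          .add("processing_meta", MapType(StringType(), StringType(), True), True))

print(schema.fieldNames())  # ['tenant_id', 'quantity', 'processing_meta']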
Example 2
Project: Optimus   Author: ironmussa   File: tests.py   (license)
def create_df(spark_session):
    try:
        # Building a simple dataframe:
        schema = StructType([
            StructField("city", StringType(), True),
            StructField("country", StringType(), True),
            StructField("population", IntegerType(), True)])

        countries = ['Colombia   ', '[email protected]', 'Brazil', 'Spain']
        cities = ['Bogotá', 'New York', '   São Paulo   ', '~Madrid']
        population = [37800000, 19795791, 12341418, 6489162]

        # Dataframe:
        df = spark_session.createDataFrame(list(zip(cities, countries, population)), schema=schema)
        assert_spark_df(df)
        return df
    except RuntimeError:
        sys.exit(1) 
Example 3
Project: kafka-spark-influx-csv-analysis   Author: bwsw   File: test_aggregationProcessor.py   (license)
def test_seaprate_key_from_end(self):
        spark = SparkSession.builder.getOrCreate()
        sc = spark.sparkContext

        rdd = sc.parallelize([
            (100, 4000, "217.69.143.60"),
            (100, 4000, "217.69.143.60"),
            (1500, 54000, "192.168.30.2"),
            (200, 3000, "192.168.30.2"),
            (200, 3000, "192.168.30.2")
        ])

        config = Config(CONFIG_PATH)
        aggregation_processor = AggregationProcessor(config, StructType([packet_size, traffic, src_ip]))

        separate_key = aggregation_processor._get_separate_key_lambda()
        result = separate_key(rdd)
        self.assertListEqual(result.collect(),
                             [(('217.69.143.60',), (100, 4000)), (('217.69.143.60',), (100, 4000)),
                              (('192.168.30.2',), (1500, 54000)), (('192.168.30.2',), (200, 3000)),
                              (('192.168.30.2',), (200, 3000))], "Lists should be equal") 
Example 4
Project: MIT-Thesis   Author: alec-heif   File: tests.py   (license)
def test_union_with_udt(self):
        from pyspark.sql.tests import ExamplePoint, ExamplePointUDT
        row1 = (1.0, ExamplePoint(1.0, 2.0))
        row2 = (2.0, ExamplePoint(3.0, 4.0))
        schema = StructType([StructField("label", DoubleType(), False),
                             StructField("point", ExamplePointUDT(), False)])
        df1 = self.spark.createDataFrame([row1], schema)
        df2 = self.spark.createDataFrame([row2], schema)

        result = df1.union(df2).orderBy("label").collect()
        self.assertEqual(
            result,
            [
                Row(label=1.0, point=ExamplePoint(1.0, 2.0)),
                Row(label=2.0, point=ExamplePoint(3.0, 4.0))
            ]
        ) 
Example 5
Project: kafka-spark-influx-csv-analysis   Author: bwsw   File: test_transformations_validator.py   (license)
def test_validate_with_correct_one_level_subtree(self):
        validator = TransformatoinsValidator(TransformationOperations({
            "country": "./GeoLite2-Country.mmdb",
            "city": "./GeoLite2-City.mmdb",
            "asn": "./GeoLite2-ASN.mmdb"
        }), self.data_structure_pyspark)

        syntaxtree = SyntaxTree()
        syntaxtree.operation = "mult"
        syntaxtree.children = ["packet_size", "sampling_rate"]

        fields = validator.validate([FieldTransformation("traffic", syntaxtree), "dst_ip"])

        self.assertEqual(fields, types.StructType([
            types.StructField('traffic', types.LongType()),
            types.StructField('dst_ip', types.StringType())
        ])) 
Example 6
Project: kafka-spark-influx-csv-analysis   Author: bwsw   File: test_transformations_validator.py   (license)
def test_validate_function_with_different_arguments_type(self):
        validator = TransformatoinsValidator(TransformationOperations({
            "country": "./GeoLite2-Country.mmdb",
            "city": "./GeoLite2-City.mmdb",
            "asn": "./GeoLite2-ASN.mmdb"
        }), self.data_structure_pyspark)

        main_syntax_tree = SyntaxTree()
        main_syntax_tree.operation = "truncate"
        main_syntax_tree.children = ["src_ip", "5"]

        fields = validator.validate([FieldTransformation("result", main_syntax_tree)])

        self.assertEqual(fields, types.StructType([
            types.StructField("result", types.StringType())
        ])) 
Example 7
Project: kafka-spark-influx-csv-analysis   Author: bwsw   File: test_aggregationProcessor.py   (license)
def test_build_lambda_for_reduce(self):
        test_input_rule = "Min(packet_size);Max(sampling_rate); Sum(traffic)"
        input_data_structure = StructType([StructField("sampling_rate", LongType()),
                                           StructField("packet_size", LongType()),
                                           StructField("traffic", LongType())])
        test_input_operation = "reduce"
        config = TestConfig({"processing": {"aggregations": {"operation_type": test_input_operation,
                                                             "rule": test_input_rule}}})
        test_aggregation_processor = AggregationProcessor(config, input_data_structure)

        spark = SparkSession.builder.getOrCreate()
        sc = spark.sparkContext
        test_rdd = sc.parallelize([(4, 2, 1), (7, 1, 1), (1, 0, 1), (2, 5, 1), (1, 1, 1)])
        test_aggregation_lambda = test_aggregation_processor.get_aggregation_lambda()

        self.assertIsInstance(test_aggregation_lambda, types.LambdaType, "get_aggregation_lambda should return "
                                                                         "lambda function")

        test_result = test_aggregation_lambda(test_rdd)
        self.assertTupleEqual(test_result, (7, 0, 5), "Error in aggregation operation. Tuple should be equal") 
Example 8
Project: kafka-spark-influx-csv-analysis   Author: bwsw   File: test_aggregationProcessor.py   (license)
def test_separate_key_from_center(self):
        spark = SparkSession.builder.getOrCreate()
        sc = spark.sparkContext

        rdd = sc.parallelize([
            (100, "217.69.143.60", 4000),
            (100, "217.69.143.60", 4000),
            (1500, "192.168.30.2", 54000),
            (200, "192.168.30.2", 3000),
            (200, "192.168.30.2", 3000)
        ])

        config = Config(CONFIG_PATH)
        aggregation_processor = AggregationProcessor(config, StructType([packet_size, src_ip, traffic]))

        separate_key = aggregation_processor._get_separate_key_lambda()
        result = separate_key(rdd)
        self.assertListEqual(result.collect(),
                             [(('217.69.143.60',), (100, 4000)), (('217.69.143.60',), (100, 4000)),
                              (('192.168.30.2',), (1500, 54000)), (('192.168.30.2',), (200, 3000)),
                              (('192.168.30.2',), (200, 3000))], "Lists should be equal") 
Example 9
Project: Optimus   Author: ironmussa   File: tests.py   (license)
def create_other_df(spark_session):
    try:
        # Building a simple dataframe:
        schema = StructType([
            StructField("city", StringType(), True),
            StructField("dates", StringType(), True),
            StructField("population", IntegerType(), True)])

        dates = ['1991/02/25', '1998/05/10', '1993/03/15', '1992/07/17']
        cities = ['Caracas', 'Ccs', '   São Paulo   ', '~Madrid']
        population = [37800000, 19795791, 12341418, 6489162]

        # Dataframe:
        df = spark_session.createDataFrame(list(zip(cities, dates, population)), schema=schema)
        assert_spark_df(df)
        return df
    except RuntimeError:
        sys.exit(1) 
Example 10
Project: Optimus   Author: ironmussa   File: tests.py   (license)
def create_another_df(spark_session):
    try:
        # Building a simple dataframe:
        schema = StructType([
            StructField("city", StringType(), True),
            StructField("dates", StringType(), True),
            StructField("population", IntegerType(), True)])

        dates = ['1991/02/25', '1998/05/10', '1993/03/15', '1992/07/17']
        cities = ['Caracas', 'Caracas', '   Maracaibo   ', 'Madrid']
        population = [37800000, 19795791, 12341418, 6489162]

        # Dataframe:
        df = spark_session.createDataFrame(list(zip(cities, dates, population)), schema=schema)
        assert_spark_df(df)
        return df
    except RuntimeError:
        sys.exit(1) 
Example 11
Project: FYP-AutoTextSum   Author: MrRexZ   File: sparkcrawl.py   (license)
def local_process(session, sc, input_directory, context_size):
    from os import listdir
    from os.path import isfile, join
    filenames = [filename for filename in listdir(input_directory) if isfile(join(input_directory, filename))]
    file_paths = ["input/CC-MAIN-20170423031158-00000-ip-10-145-167-34.ec2.internal.warc.wet.gz"]
    for file_path in file_paths:
        filename = file_path.split("/")[-1]
        corpus = WARCFile(fileobj=gzip.open(file_path))
        list_of_records = sc.parallelize(
            [re.sub(r'[^\x00-\x7F]+', ' ', record.payload.read().decode("UTF-8")) for i, record in enumerate(corpus)], 10)
        coor = list_of_records.flatMap(partial(map_coocurence, context_size))
        coor_count = coor.reduceByKey(lambda x, y: x + y)
        result = coor_count.map(lambda x: [x[0][0], x[0][1], x[1]])
        tempFile = NamedTemporaryFile(delete=True)
        tempFile.close()
        schema = StructType([
            StructField("Target Word", StringType(), True),
            StructField("Context Word", StringType(), True),
            StructField("Coocurrence Prob", DoubleType(), True)
        ])
        DF = session.createDataFrame(result, schema)

        #DF.show()
        DF.write.csv(os.path.join('output', filename)) 
Example 12
Project: pyspark   Author: v-v-vishnevskiy   File: tests.py   (license)
def test_unionAll_with_udt(self):
        from pyspark.sql.tests import ExamplePoint, ExamplePointUDT
        row1 = (1.0, ExamplePoint(1.0, 2.0))
        row2 = (2.0, ExamplePoint(3.0, 4.0))
        schema = StructType([StructField("label", DoubleType(), False),
                             StructField("point", ExamplePointUDT(), False)])
        df1 = self.sqlCtx.createDataFrame([row1], schema)
        df2 = self.sqlCtx.createDataFrame([row2], schema)

        result = df1.unionAll(df2).orderBy("label").collect()
        self.assertEqual(
            result,
            [
                Row(label=1.0, point=ExamplePoint(1.0, 2.0)),
                Row(label=2.0, point=ExamplePoint(3.0, 4.0))
            ]
        ) 
Example 13
Project: sparkly   Author: Tubular   File: test_utils.py   (license)
def test_structs_nested_subset(self):
        schema_has(
            T.StructType([
                T.StructField(
                    'f1',
                    T.ArrayType(T.StructType([
                        T.StructField('f11', T.IntegerType()),
                        T.StructField('f12', T.StringType()),
                    ])),
                ),
            ]),
            T.StructType([
                T.StructField(
                    'f1',
                    T.ArrayType(T.StructType([T.StructField('f11', T.IntegerType())])),
                ),
            ]),
        ) 
Example 14
Project: sparkly   Author: Tubular   File: test_utils.py   (license)
def test_maps_nested_subset(self):
        schema_has(
            T.MapType(
                T.StringType(),
                T.MapType(
                    T.StringType(),
                    T.StructType([
                        T.StructField('f1', T.MapType(T.StringType(), T.LongType())),
                        T.StructField('f2', T.MapType(T.StringType(), T.IntegerType())),
                    ]),
                ),
            ),
            T.MapType(
                T.StringType(),
                T.MapType(
                    T.StringType(),
                    T.StructType([
                        T.StructField('f1', T.MapType(T.StringType(), T.LongType())),
                    ]),
                ),
            ),
        ) 
Example 15
Project: sparkly   Author: Tubular   File: test_functions.py   (license)
def test_inner_join(self):
        first_df = self.spark.createDataFrame(
            data=[(1, ), (2, ), (3, )],
            schema=T.StructType([T.StructField('id', T.IntegerType())]),
        )
        second_df = self.spark.createDataFrame(
            data=[(2, ), (3, ), (4, )],
            schema=T.StructType([T.StructField('id', T.IntegerType())]),
        )
        third_df = self.spark.createDataFrame(
            data=[(3, ), (4, ), (5, )],
            schema=T.StructType([T.StructField('id', T.IntegerType())]),
        )

        joined_df = SF.multijoin([first_df, second_df, third_df], on='id', how='inner')

        self.assertDataFrameEqual(joined_df, [{'id': 3}]) 
Example 16
Project: sparkly   Author: Tubular   File: test_functions.py   (license)
def test_outer_join(self):
        first_df = self.spark.createDataFrame(
            data=[(1, ), (2, ), (3, )],
            schema=T.StructType([T.StructField('id', T.IntegerType())]),
        )
        second_df = self.spark.createDataFrame(
            data=[(2, ), (3, ), (4, )],
            schema=T.StructType([T.StructField('id', T.IntegerType())]),
        )
        third_df = self.spark.createDataFrame(
            data=[(3, ), (4, ), (5, )],
            schema=T.StructType([T.StructField('id', T.IntegerType())]),
        )

        joined_df = SF.multijoin([first_df, second_df, third_df], on='id', how='outer')

        self.assertDataFrameEqual(joined_df, [{'id': i} for i in [1, 2, 3, 4, 5]]) 
Example 17
Project: sparkly   Author: Tubular   File: test_functions.py   (license)
def test_coalescing(self):
        first_df = self.spark.createDataFrame(
            data=[(1, None), (2, 'hi'), (3, None), (4, 'may')],
            schema=T.StructType([
                T.StructField('id', T.IntegerType()),
                T.StructField('value', T.StringType()),
            ]),
        )
        second_df = self.spark.createDataFrame(
            data=[(2, 'hey'), (3, 'you'), (4, None)],
            schema=T.StructType([
                T.StructField('id', T.IntegerType()),
                T.StructField('value', T.StringType()),
            ]),
        )

        joined_df = SF.multijoin([first_df, second_df], on='id', how='inner', coalesce=['value'])

        self.assertDataFrameEqual(
            joined_df,
            [{'id': 2, 'value': 'hi'}, {'id': 3, 'value': 'you'}, {'id': 4, 'value': 'may'}],
        ) 
Example 18
Project: sparkly   Author: Tubular   File: test_functions.py   (license)
def test_coalescing_light_type_mismatch(self):
        first_df = self.spark.createDataFrame(
            data=[(1, None), (2, 'hi'), (3, None), (4, 'may')],
            schema=T.StructType([
                T.StructField('id', T.IntegerType()),
                T.StructField('value', T.StringType()),
            ]),
        )
        second_df = self.spark.createDataFrame(
            data=[(2, 2), (3, 3), (4, None)],
            schema=T.StructType([
                T.StructField('id', T.IntegerType()),
                T.StructField('value', T.IntegerType()),
            ]),
        )

        joined_df = SF.multijoin([first_df, second_df], on='id', how='inner', coalesce=['value'])

        self.assertDataFrameEqual(
            joined_df,
            [{'id': 2, 'value': 'hi'}, {'id': 3, 'value': '3'}, {'id': 4, 'value': 'may'}],
        ) 
Example 19
Project: sparkly   Author: Tubular   File: test_functions.py   (license)
def test_coalescing_heavy_type_mismatch(self):
        first_df = self.spark.createDataFrame(
            data=[(1, None), (2, 'hi'), (3, None), (4, 'may')],
            schema=T.StructType([
                T.StructField('id', T.IntegerType()),
                T.StructField('value', T.StringType()),
            ]),
        )
        second_df = self.spark.createDataFrame(
            data=[(2, [2, ]), (3, [3, ]), (4, None)],
            schema=T.StructType([
                T.StructField('id', T.IntegerType()),
                T.StructField('value', T.ArrayType(T.IntegerType())),
            ]),
        )

        with self.assertRaises(U.AnalysisException):
            SF.multijoin([first_df, second_df], on='id', how='inner', coalesce=['value']) 
Example 20
Project: sparkly   Author: Tubular   File: test_functions.py   (license)
def test_default_as_a_lit(self):
        df = self.spark.createDataFrame(
            data=[('one', ), ('two', ), ('three', ), ('hi', )],
            schema=T.StructType([T.StructField('name', T.StringType())]),
        )

        df = df.withColumn('value', SF.switch_case('name', default=0))

        self.assertDataFrameEqual(
            df,
            [
                {'name': 'one', 'value': 0},
                {'name': 'two', 'value': 0},
                {'name': 'three', 'value': 0},
                {'name': 'hi', 'value': 0},
            ],
        ) 
Example 21
Project: sparkly   Author: Tubular   File: test_functions.py   (license)
def test_default_as_a_column(self):
        df = self.spark.createDataFrame(
            data=[('one', ), ('two', ), ('three', ), ('hi', )],
            schema=T.StructType([T.StructField('name', T.StringType())]),
        )

        df = df.withColumn('value', SF.switch_case('name', default=F.col('name')))

        self.assertDataFrameEqual(
            df,
            [
                {'name': 'one', 'value': 'one'},
                {'name': 'two', 'value': 'two'},
                {'name': 'three', 'value': 'three'},
                {'name': 'hi', 'value': 'hi'},
            ],
        ) 
Example 22
Project: sparkly   Author: Tubular   File: test_functions.py   (license)
def test_switch_as_a_string_cases_as_kwargs(self):
        df = self.spark.createDataFrame(
            data=[('one', ), ('two', ), ('three', ), ('hi', )],
            schema=T.StructType([T.StructField('name', T.StringType())]),
        )

        df = df.withColumn('value', SF.switch_case('name', one=1, two=2, three=3, default=0))

        self.assertDataFrameEqual(
            df,
            [
                {'name': 'one', 'value': 1},
                {'name': 'two', 'value': 2},
                {'name': 'three', 'value': 3},
                {'name': 'hi', 'value': 0},
            ],
        ) 
Example 23
Project: sparkly   Author: Tubular   File: test_functions.py   (license)
def test_switch_as_a_column_cases_as_kwargs(self):
        df = self.spark.createDataFrame(
            data=[('one', ), ('two', ), ('three', ), ('hi', )],
            schema=T.StructType([T.StructField('name', T.StringType())]),
        )

        df = df.withColumn(
            'value',
            SF.switch_case(F.col('name'), one=1, two=2, three=3, default=0),
        )

        self.assertDataFrameEqual(
            df,
            [
                {'name': 'one', 'value': 1},
                {'name': 'two', 'value': 2},
                {'name': 'three', 'value': 3},
                {'name': 'hi', 'value': 0},
            ],
        ) 
Example 24
Project: sparkly   Author: Tubular   File: test_functions.py   (license)
def test_dict_cases_override_kwarg_cases(self):
        df = self.spark.createDataFrame(
            data=[('one', ), ('two', ), ('three', ), ('hi', )],
            schema=T.StructType([T.StructField('name', T.StringType())]),
        )

        df = df.withColumn(
            'value',
            SF.switch_case('name', {'one': 11, 'three': 33}, one=1, two=2, three=3, default=0),
        )

        self.assertDataFrameEqual(
            df,
            [
                {'name': 'one', 'value': 11},
                {'name': 'two', 'value': 2},
                {'name': 'three', 'value': 33},
                {'name': 'hi', 'value': 0},
            ],
        ) 
Example 25
Project: sparkly   Author: Tubular   File: test_writer.py   (license)
def test_redis_client_init(self):
        df = self.spark.createDataFrame(
            data=[
                ('k1', 'k14', [1, 14, 141]),
            ],
            schema=T.StructType([
                T.StructField('key_1', T.StringType()),
                T.StructField('key_2', T.StringType()),
                T.StructField('aux_data', T.ArrayType(T.IntegerType())),
            ])
        )

        df.write_ext.redis(
            key_by=['key_2'],
            max_pipeline_size=3,
            redis_client_init=partial(redis.StrictRedis, 'redis.docker'),
        )

        redis_client = redis.StrictRedis('redis.docker')

        self.assertEqual(redis_client.keys(), [b'k14'])

        written_data = json.loads(redis_client.get('k14'))
        expected = {'key_1': 'k1', 'key_2': 'k14', 'aux_data': [1, 14, 141]}
        self.assertEqual(written_data, expected) 
Example 26
Project: sparkly   Author: Tubular   File: test_writer.py   (license)
def test_db(self):
        df = self.spark.createDataFrame(
            data=[
                ('k1', 'k14', [1, 14, 141]),
            ],
            schema=T.StructType([
                T.StructField('key_1', T.StringType()),
                T.StructField('key_2', T.StringType()),
                T.StructField('aux_data', T.ArrayType(T.IntegerType())),
            ])
        )

        df.write_ext.by_url('redis://redis.docker/1?keyBy=key_2&maxPipelineSize=3')

        redis_client = redis.StrictRedis('redis.docker', db=1)

        self.assertEqual(redis_client.keys(), [b'k14'])

        written_data = json.loads(redis_client.get('k14'))
        expected = {'key_1': 'k1', 'key_2': 'k14', 'aux_data': [1, 14, 141]}
        self.assertEqual(written_data, expected) 
Example 27
Project: monasca-transform   Author: openstack   File: transform_utils.py   (Apache License 2.0)
def _get_record_store_df_schema():
        """get instance usage schema."""

        columns = ["event_timestamp_string",
                   "event_type", "event_quantity_name",
                   "event_status", "event_version",
                   "record_type", "resource_uuid", "tenant_id",
                   "user_id", "region", "zone",
                   "host", "project_id", "service_group", "service_id",
                   "event_date", "event_hour", "event_minute",
                   "event_second", "metric_group", "metric_id",
                   "namespace", "pod_name", "app", "container_name",
                   "interface", "deployment", "daemon_set"]

        columns_struct_fields = [StructField(field_name, StringType(), True)
                                 for field_name in columns]

        # Add columns for non-string fields
        columns_struct_fields.insert(0,
                                     StructField("event_timestamp_unix",
                                                 DoubleType(), True))
        columns_struct_fields.insert(0,
                                     StructField("event_quantity",
                                                 DoubleType(), True))

        schema = StructType(columns_struct_fields)

        return schema 
Example 28
Project: monasca-transform   Author: openstack   File: transform_utils.py   (Apache License 2.0)
def _get_pre_transform_specs_df_schema():
        """get pre_transform_specs df schema."""

        # FIXME: change when pre_transform_specs df is finalized

        event_type = StructField("event_type", StringType(), True)

        metric_id_list = StructField("metric_id_list",
                                     ArrayType(StringType(),
                                               containsNull=False),
                                     True)
        required_raw_fields_list = StructField("required_raw_fields_list",
                                               ArrayType(StringType(),
                                                         containsNull=False),
                                               True)
        service_id = StructField("service_id", StringType(), True)

        event_processing_params = \
            StructField("event_processing_params",
                        StructType([StructField("set_default_zone_to",
                                                StringType(), True),
                                    StructField("set_default_geolocation_to",
                                                StringType(), True),
                                    StructField("set_default_region_to",
                                                StringType(), True),
                                    ]), True)

        schema = StructType([event_processing_params, event_type,
                             metric_id_list, required_raw_fields_list,
                             service_id])

        return schema 
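Example 28 nests one StructType inside another ("event_processing_params" is itself a struct). A small illustrative sketch, reusing those field names, of how such a nested schema can be inspected once built:

from pyspark.sql.types import StructType, StructField, StringType

inner = StructType([StructField("set_default_zone_to", StringType(), True)])
schema = StructType([
    StructField("event_processing_params", inner, True),
    StructField("event_type", StringType(), True),
])

# Fields can be looked up by name; a nested StructType is an ordinary StructType.
params_field = schema["event_processing_params"]
print(params_field.dataType.fieldNames())  # ['set_default_zone_to']
print(schema.simpleString())               # struct<event_processing_params:struct<set_default_zone_to:string>,event_type:string>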
Example 29
Project: monasca-transform   Author: openstack   File: transform_utils.py   (Apache License 2.0)
def _get_grouping_results_df_schema(group_by_column_list):
        """get grouping results schema."""

        group_by_field_list = [StructField(field_name, StringType(), True)
                               for field_name in group_by_column_list]

        # Initialize columns for string fields
        columns = ["firstrecord_timestamp_string",
                   "lastrecord_timestamp_string"]

        columns_struct_fields = [StructField(field_name, StringType(), True)
                                 for field_name in columns]

        # Add columns for non-string fields
        columns_struct_fields.append(StructField("firstrecord_timestamp_unix",
                                                 DoubleType(), True))
        columns_struct_fields.append(StructField("lastrecord_timestamp_unix",
                                                 DoubleType(), True))
        columns_struct_fields.append(StructField("firstrecord_quantity",
                                                 DoubleType(), True))
        columns_struct_fields.append(StructField("lastrecord_quantity",
                                                 DoubleType(), True))
        columns_struct_fields.append(StructField("record_count",
                                                 DoubleType(), True))

        instance_usage_schema_part = StructType(columns_struct_fields)

        grouping_results = \
            StructType([StructField("grouping_key",
                                    StringType(), True),
                        StructField("results",
                                    instance_usage_schema_part,
                                    True),
                        StructField("grouping_key_dict",
                                    StructType(group_by_field_list))])

        # schema = \
        #     StructType([StructField("GroupingResults", grouping_results)])
        return grouping_results 
Example 30
Project: Eskapade   Author: KaveIO   File: test_data_conversion.py   (license)
def test_df_schema(self):
        """Test creation of a data-frame schema"""

        # create reference schema
        ref_sub_schema = StructType([StructField(n, t) for n, t in [('long1', LongType()), ('double1', DoubleType())]])
        ref_cols = [('long', LongType()), ('double', DoubleType()), ('string', StringType()),
                    ('timestamp', TimestampType()), ('struct', ref_sub_schema)]
        ref_schema = StructType([StructField(*c) for c in ref_cols])

        # specification with Python types
        specs = {}
        sub_spec = odict([('long1', int), ('double1', float)])
        specs['python'] = odict([('long', int), ('double', float), ('string', str), ('timestamp', datetime.datetime),
                                 ('struct', sub_spec)])

        # specification with Spark types
        sub_spec = odict([('long1', LongType()), ('double1', DoubleType())])
        specs['spark'] = odict([('long', LongType()), ('double', DoubleType()), ('string', StringType()),
                                ('timestamp', TimestampType()), ('struct', sub_spec)])

        # specification with Spark-type classes
        sub_spec = odict([('long1', LongType), ('double1', DoubleType)])
        specs['spark-class'] = odict([('long', LongType), ('double', DoubleType), ('string', StringType),
                                      ('timestamp', TimestampType), ('struct', sub_spec)])

        # specification mixed types
        sub_spec = odict([('long1', LongType), ('double1', float)])
        specs['mixed'] = odict([('long', int), ('double', DoubleType), ('string', str),
                                ('timestamp', TimestampType()), ('struct', sub_spec)])

        # test with different specifications
        for descr, spec in specs.items():
            schema = df_schema(spec)
            self.assertIs(type(schema), type(ref_schema), 'unexpected schema type for "{}"'.format(descr))
            self.assertTupleEqual(tuple(schema), tuple(ref_schema), 'unexpected schema for "{}"'.format(descr))

        # test with incorrect types
        spec = odict([('long', 'long'), ('double', 'double'), ('string', 'string')])
        with self.assertRaises(TypeError):
            df_schema(spec) 
Example 31
Project: MIT-Thesis   Author: alec-heif   File: tests.py   (license)
def test_apply_schema_to_dict_and_rows(self):
        schema = StructType().add("b", StringType()).add("a", IntegerType())
        input = [{"a": 1}, {"b": "coffee"}]
        rdd = self.sc.parallelize(input)
        for verify in [False, True]:
            df = self.spark.createDataFrame(input, schema, verifySchema=verify)
            df2 = self.spark.createDataFrame(rdd, schema, verifySchema=verify)
            self.assertEqual(df.schema, df2.schema)

            rdd = self.sc.parallelize(range(10)).map(lambda x: Row(a=x, b=None))
            df3 = self.spark.createDataFrame(rdd, schema, verifySchema=verify)
            self.assertEqual(10, df3.count())
            input = [Row(a=x, b=str(x)) for x in range(10)]
            df4 = self.spark.createDataFrame(input, schema, verifySchema=verify)
            self.assertEqual(10, df4.count()) 
Example 32
Project: MIT-Thesis   Author: alec-heif   File: tests.py   (license)
def test_create_dataframe_schema_mismatch(self):
        input = [Row(a=1)]
        rdd = self.sc.parallelize(range(3)).map(lambda i: Row(a=i))
        schema = StructType([StructField("a", IntegerType()), StructField("b", StringType())])
        df = self.spark.createDataFrame(rdd, schema)
        self.assertRaises(Exception, lambda: df.show()) 
Example 33
Project: MIT-Thesis   Author: alec-heif   File: tests.py   (license)
def test_apply_schema(self):
        from datetime import date, datetime
        rdd = self.sc.parallelize([(127, -128, -32768, 32767, 2147483647, 1.0,
                                    date(2010, 1, 1), datetime(2010, 1, 1, 1, 1, 1),
                                    {"a": 1}, (2,), [1, 2, 3], None)])
        schema = StructType([
            StructField("byte1", ByteType(), False),
            StructField("byte2", ByteType(), False),
            StructField("short1", ShortType(), False),
            StructField("short2", ShortType(), False),
            StructField("int1", IntegerType(), False),
            StructField("float1", FloatType(), False),
            StructField("date1", DateType(), False),
            StructField("time1", TimestampType(), False),
            StructField("map1", MapType(StringType(), IntegerType(), False), False),
            StructField("struct1", StructType([StructField("b", ShortType(), False)]), False),
            StructField("list1", ArrayType(ByteType(), False), False),
            StructField("null1", DoubleType(), True)])
        df = self.spark.createDataFrame(rdd, schema)
        results = df.rdd.map(lambda x: (x.byte1, x.byte2, x.short1, x.short2, x.int1, x.float1,
                             x.date1, x.time1, x.map1["a"], x.struct1.b, x.list1, x.null1))
        r = (127, -128, -32768, 32767, 2147483647, 1.0, date(2010, 1, 1),
             datetime(2010, 1, 1, 1, 1, 1), 1, 2, [1, 2, 3], None)
        self.assertEqual(r, results.first())

        df.createOrReplaceTempView("table2")
        r = self.spark.sql("SELECT byte1 - 1 AS byte1, byte2 + 1 AS byte2, " +
                           "short1 + 1 AS short1, short2 - 1 AS short2, int1 - 1 AS int1, " +
                           "float1 + 1.5 as float1 FROM table2").first()

        self.assertEqual((126, -127, -32767, 32766, 2147483646, 2.5), tuple(r))

        from pyspark.sql.types import _parse_schema_abstract, _infer_schema_type
        rdd = self.sc.parallelize([(127, -32768, 1.0, datetime(2010, 1, 1, 1, 1, 1),
                                    {"a": 1}, (2,), [1, 2, 3])])
        abstract = "byte1 short1 float1 time1 map1{} struct1(b) list1[]"
        schema = _parse_schema_abstract(abstract)
        typedSchema = _infer_schema_type(rdd.first(), schema)
        df = self.spark.createDataFrame(rdd, typedSchema)
        r = (127, -32768, 1.0, datetime(2010, 1, 1, 1, 1, 1), {"a": 1}, Row(b=2), [1, 2, 3])
        self.assertEqual(r, tuple(df.first())) 
Example 34
Project: MIT-Thesis   Author: alec-heif   File: tests.py   (license)
def test_udt(self):
        from pyspark.sql.types import _parse_datatype_json_string, _infer_type, _verify_type
        from pyspark.sql.tests import ExamplePointUDT, ExamplePoint

        def check_datatype(datatype):
            pickled = pickle.loads(pickle.dumps(datatype))
            assert datatype == pickled
            scala_datatype = self.spark._jsparkSession.parseDataType(datatype.json())
            python_datatype = _parse_datatype_json_string(scala_datatype.json())
            assert datatype == python_datatype

        check_datatype(ExamplePointUDT())
        structtype_with_udt = StructType([StructField("label", DoubleType(), False),
                                          StructField("point", ExamplePointUDT(), False)])
        check_datatype(structtype_with_udt)
        p = ExamplePoint(1.0, 2.0)
        self.assertEqual(_infer_type(p), ExamplePointUDT())
        _verify_type(ExamplePoint(1.0, 2.0), ExamplePointUDT())
        self.assertRaises(ValueError, lambda: _verify_type([1.0, 2.0], ExamplePointUDT()))

        check_datatype(PythonOnlyUDT())
        structtype_with_udt = StructType([StructField("label", DoubleType(), False),
                                          StructField("point", PythonOnlyUDT(), False)])
        check_datatype(structtype_with_udt)
        p = PythonOnlyPoint(1.0, 2.0)
        self.assertEqual(_infer_type(p), PythonOnlyUDT())
        _verify_type(PythonOnlyPoint(1.0, 2.0), PythonOnlyUDT())
        self.assertRaises(ValueError, lambda: _verify_type([1.0, 2.0], PythonOnlyUDT())) 
Example 35
Project: MIT-Thesis   Author: alec-heif   File: tests.py   (license)
def test_simple_udt_in_df(self):
        schema = StructType().add("key", LongType()).add("val", PythonOnlyUDT())
        df = self.spark.createDataFrame(
            [(i % 3, PythonOnlyPoint(float(i), float(i))) for i in range(10)],
            schema=schema)
        df.show() 
Example 36
Project: MIT-Thesis   Author: alec-heif   File: tests.py   (license)
def test_complex_nested_udt_in_df(self):
        from pyspark.sql.functions import udf

        schema = StructType().add("key", LongType()).add("val", PythonOnlyUDT())
        df = self.spark.createDataFrame(
            [(i % 3, PythonOnlyPoint(float(i), float(i))) for i in range(10)],
            schema=schema)
        df.collect()

        gd = df.groupby("key").agg({"val": "collect_list"})
        gd.collect()
        udf = udf(lambda k, v: [(k, v[0])], ArrayType(df.schema))
        gd.select(udf(*gd)).collect() 
Example 37
Project: MIT-Thesis   Author: alec-heif   File: tests.py   (license)
def test_apply_schema_with_udt(self):
        from pyspark.sql.tests import ExamplePoint, ExamplePointUDT
        row = (1.0, ExamplePoint(1.0, 2.0))
        schema = StructType([StructField("label", DoubleType(), False),
                             StructField("point", ExamplePointUDT(), False)])
        df = self.spark.createDataFrame([row], schema)
        point = df.head().point
        self.assertEqual(point, ExamplePoint(1.0, 2.0))

        row = (1.0, PythonOnlyPoint(1.0, 2.0))
        schema = StructType([StructField("label", DoubleType(), False),
                             StructField("point", PythonOnlyUDT(), False)])
        df = self.spark.createDataFrame([row], schema)
        point = df.head().point
        self.assertEqual(point, PythonOnlyPoint(1.0, 2.0)) 
Example 38
Project: MIT-Thesis   Author: alec-heif   File: tests.py   (license)
def test_metadata_null(self):
        from pyspark.sql.types import StructType, StringType, StructField
        schema = StructType([StructField("f1", StringType(), True, None),
                             StructField("f2", StringType(), True, {'a': None})])
        rdd = self.sc.parallelize([["a", "b"], ["c", "d"]])
        self.spark.createDataFrame(rdd, schema) 
Example 39
Project: MIT-Thesis   Author: alec-heif   File: tests.py   (license)
def test_save_and_load_builder(self):
        df = self.df
        tmpPath = tempfile.mkdtemp()
        shutil.rmtree(tmpPath)
        df.write.json(tmpPath)
        actual = self.spark.read.json(tmpPath)
        self.assertEqual(sorted(df.collect()), sorted(actual.collect()))

        schema = StructType([StructField("value", StringType(), True)])
        actual = self.spark.read.json(tmpPath, schema)
        self.assertEqual(sorted(df.select("value").collect()), sorted(actual.collect()))

        df.write.mode("overwrite").json(tmpPath)
        actual = self.spark.read.json(tmpPath)
        self.assertEqual(sorted(df.collect()), sorted(actual.collect()))

        df.write.mode("overwrite").options(noUse="this options will not be used in save.")\
                .option("noUse", "this option will not be used in save.")\
                .format("json").save(path=tmpPath)
        actual =\
            self.spark.read.format("json")\
                           .load(path=tmpPath, noUse="this options will not be used in load.")
        self.assertEqual(sorted(df.collect()), sorted(actual.collect()))

        defaultDataSourceName = self.spark.conf.get("spark.sql.sources.default",
                                                    "org.apache.spark.sql.parquet")
        self.spark.sql("SET spark.sql.sources.default=org.apache.spark.sql.json")
        actual = self.spark.read.load(path=tmpPath)
        self.assertEqual(sorted(df.collect()), sorted(actual.collect()))
        self.spark.sql("SET spark.sql.sources.default=" + defaultDataSourceName)

        shutil.rmtree(tmpPath) 
Example 40
Project: MIT-Thesis   Author: alec-heif   File: tests.py   (license)
def test_stream_read_options(self):
        schema = StructType([StructField("data", StringType(), False)])
        df = self.spark.readStream\
            .format('text')\
            .option('path', 'python/test_support/sql/streaming')\
            .schema(schema)\
            .load()
        self.assertTrue(df.isStreaming)
        self.assertEqual(df.schema.simpleString(), "struct<data:string>") 
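Example 40 attaches an explicit StructType to a streaming reader. As a hedged aside, the same schema-first pattern applies to batch readers too; the sketch below assumes an active SparkSession bound to spark and uses a placeholder path.

from pyspark.sql.types import StructType, StructField, StringType

schema = StructType([StructField("data", StringType(), False)])

# Supplying the schema up front skips inference; the path is a placeholder.
df = spark.read.schema(schema).csv("path/to/csv/files")
print(df.schema.simpleString())  # struct<data:string>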
Example 41
Project: MIT-Thesis   Author: alec-heif   File: tests.py   (license)
def test_stream_read_options_overwrite(self):
        bad_schema = StructType([StructField("test", IntegerType(), False)])
        schema = StructType([StructField("data", StringType(), False)])
        df = self.spark.readStream.format('csv').option('path', 'python/test_support/sql/fake') \
            .schema(bad_schema)\
            .load(path='python/test_support/sql/streaming', schema=schema, format='text')
        self.assertTrue(df.isStreaming)
        self.assertEqual(df.schema.simpleString(), "struct<data:string>") 
Example 42
Project: MIT-Thesis   Author: alec-heif   File: tests.py   (license)
def test_column_name_with_non_ascii(self):
        if sys.version >= '3':
            columnName = "??"
            self.assertTrue(isinstance(columnName, str))
        else:
            columnName = unicode("??", "utf-8")
            self.assertTrue(isinstance(columnName, unicode))
        schema = StructType([StructField(columnName, LongType(), True)])
        df = self.spark.createDataFrame([(1,)], schema)
        self.assertEqual(schema, df.schema)
        self.assertEqual("DataFrame[??: bigint]", str(df))
        self.assertEqual([("??", 'bigint')], df.dtypes)
        self.assertEqual(1, df.select("??").first()[0])
        self.assertEqual(1, df.select(df["??"]).first()[0]) 
Example 43
Project: MIT-Thesis   Author: alec-heif   File: tests.py   (license)
def test_decimal(self):
        from decimal import Decimal
        schema = StructType([StructField("decimal", DecimalType(10, 5))])
        df = self.spark.createDataFrame([(Decimal("3.14159"),)], schema)
        row = df.select(df.decimal + 1).first()
        self.assertEqual(row[0], Decimal("4.14159"))
        tmpPath = tempfile.mkdtemp()
        shutil.rmtree(tmpPath)
        df.write.parquet(tmpPath)
        df2 = self.spark.read.parquet(tmpPath)
        row = df2.first()
        self.assertEqual(row[0], Decimal("3.14159")) 
Example 44
Project: MIT-Thesis   Author: alec-heif   File: tests.py   (license)
def test_fillna(self):
        schema = StructType([
            StructField("name", StringType(), True),
            StructField("age", IntegerType(), True),
            StructField("height", DoubleType(), True)])

        # fillna shouldn't change non-null values
        row = self.spark.createDataFrame([(u'Alice', 10, 80.1)], schema).fillna(50).first()
        self.assertEqual(row.age, 10)

        # fillna with int
        row = self.spark.createDataFrame([(u'Alice', None, None)], schema).fillna(50).first()
        self.assertEqual(row.age, 50)
        self.assertEqual(row.height, 50.0)

        # fillna with double
        row = self.spark.createDataFrame([(u'Alice', None, None)], schema).fillna(50.1).first()
        self.assertEqual(row.age, 50)
        self.assertEqual(row.height, 50.1)

        # fillna with string
        row = self.spark.createDataFrame([(None, None, None)], schema).fillna("hello").first()
        self.assertEqual(row.name, u"hello")
        self.assertEqual(row.age, None)

        # fillna with subset specified for numeric cols
        row = self.spark.createDataFrame(
            [(None, None, None)], schema).fillna(50, subset=['name', 'age']).first()
        self.assertEqual(row.name, None)
        self.assertEqual(row.age, 50)
        self.assertEqual(row.height, None)

        # fillna with subset specified for string cols
        row = self.spark.createDataFrame(
            [(None, None, None)], schema).fillna("haha", subset=['name', 'age']).first()
        self.assertEqual(row.name, "haha")
        self.assertEqual(row.age, None)
        self.assertEqual(row.height, None) 
Example 45
Project: MIT-Thesis   Author: alec-heif   File: tests.py   (license)
def test_BinaryType_serialization(self):
        # Pyrolite version <= 4.9 could not serialize BinaryType with Python3 SPARK-17808
        schema = StructType([StructField('mybytes', BinaryType())])
        data = [[bytearray(b'here is my data')],
                [bytearray(b'and here is some more')]]
        df = self.spark.createDataFrame(data, schema=schema)
        df.collect() 
Example 46
Project: MIT-Thesis   Author: alec-heif   File: tests.py   (license)
def test_save_and_load_table(self):
        df = self.df
        tmpPath = tempfile.mkdtemp()
        shutil.rmtree(tmpPath)
        df.write.saveAsTable("savedJsonTable", "json", "append", path=tmpPath)
        actual = self.spark.createExternalTable("externalJsonTable", tmpPath, "json")
        self.assertEqual(sorted(df.collect()),
                         sorted(self.spark.sql("SELECT * FROM savedJsonTable").collect()))
        self.assertEqual(sorted(df.collect()),
                         sorted(self.spark.sql("SELECT * FROM externalJsonTable").collect()))
        self.assertEqual(sorted(df.collect()), sorted(actual.collect()))
        self.spark.sql("DROP TABLE externalJsonTable")

        df.write.saveAsTable("savedJsonTable", "json", "overwrite", path=tmpPath)
        schema = StructType([StructField("value", StringType(), True)])
        actual = self.spark.createExternalTable("externalJsonTable", source="json",
                                                schema=schema, path=tmpPath,
                                                noUse="this options will not be used")
        self.assertEqual(sorted(df.collect()),
                         sorted(self.spark.sql("SELECT * FROM savedJsonTable").collect()))
        self.assertEqual(sorted(df.select("value").collect()),
                         sorted(self.spark.sql("SELECT * FROM externalJsonTable").collect()))
        self.assertEqual(sorted(df.select("value").collect()), sorted(actual.collect()))
        self.spark.sql("DROP TABLE savedJsonTable")
        self.spark.sql("DROP TABLE externalJsonTable")

        defaultDataSourceName = self.spark.getConf("spark.sql.sources.default",
                                                   "org.apache.spark.sql.parquet")
        self.spark.sql("SET spark.sql.sources.default=org.apache.spark.sql.json")
        df.write.saveAsTable("savedJsonTable", path=tmpPath, mode="overwrite")
        actual = self.spark.createExternalTable("externalJsonTable", path=tmpPath)
        self.assertEqual(sorted(df.collect()),
                         sorted(self.spark.sql("SELECT * FROM savedJsonTable").collect()))
        self.assertEqual(sorted(df.collect()),
                         sorted(self.spark.sql("SELECT * FROM externalJsonTable").collect()))
        self.assertEqual(sorted(df.collect()), sorted(actual.collect()))
        self.spark.sql("DROP TABLE savedJsonTable")
        self.spark.sql("DROP TABLE externalJsonTable")
        self.spark.sql("SET spark.sql.sources.default=" + defaultDataSourceName)

        shutil.rmtree(tmpPath) 
Example 47
Project: MIT-Thesis   Author: alec-heif   File: catalog.py   (license)
def createExternalTable(self, tableName, path=None, source=None, schema=None, **options):
        """Creates an external table based on the dataset in a data source.

        It returns the DataFrame associated with the external table.

        The data source is specified by the ``source`` and a set of ``options``.
        If ``source`` is not specified, the default data source configured by
        ``spark.sql.sources.default`` will be used.

        Optionally, a schema can be provided as the schema of the returned :class:`DataFrame` and
        created external table.

        :return: :class:`DataFrame`
        """
        if path is not None:
            options["path"] = path
        if source is None:
            source = self._sparkSession.conf.get(
                "spark.sql.sources.default", "org.apache.spark.sql.parquet")
        if schema is None:
            df = self._jcatalog.createExternalTable(tableName, source, options)
        else:
            if not isinstance(schema, StructType):
                raise TypeError("schema should be StructType")
            scala_datatype = self._jsparkSession.parseDataType(schema.json())
            df = self._jcatalog.createExternalTable(tableName, source, scala_datatype, options)
        return DataFrame(df, self._sparkSession._wrapped) 
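The catalog helper above hands the schema to the JVM by serialising it with schema.json(). A short sketch, independent of that project, showing that a StructType round-trips through its JSON representation:

import json
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

schema = StructType([
    StructField("id", IntegerType(), False),
    StructField("name", StringType(), True),
])

# json() serialises the schema; StructType.fromJson rebuilds an equal object.
restored = StructType.fromJson(json.loads(schema.json()))
assert restored == schema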
Example 48
Project: MIT-Thesis   Author: alec-heif   File: evaluation.py   (license)
def __init__(self, predictionAndObservations):
        sc = predictionAndObservations.ctx
        sql_ctx = SQLContext.getOrCreate(sc)
        df = sql_ctx.createDataFrame(predictionAndObservations, schema=StructType([
            StructField("prediction", DoubleType(), nullable=False),
            StructField("observation", DoubleType(), nullable=False)]))
        java_class = sc._jvm.org.apache.spark.mllib.evaluation.RegressionMetrics
        java_model = java_class(df._jdf)
        super(RegressionMetrics, self).__init__(java_model) 
Example 49
Project: MIT-Thesis   Author: alec-heif   File: evaluation.py   (license)
def __init__(self, predictionAndLabels):
        sc = predictionAndLabels.ctx
        sql_ctx = SQLContext.getOrCreate(sc)
        df = sql_ctx.createDataFrame(predictionAndLabels, schema=StructType([
            StructField("prediction", DoubleType(), nullable=False),
            StructField("label", DoubleType(), nullable=False)]))
        java_class = sc._jvm.org.apache.spark.mllib.evaluation.MulticlassMetrics
        java_model = java_class(df._jdf)
        super(MulticlassMetrics, self).__init__(java_model) 
Example 50
Project: kafka-spark-influx-csv-analysis   Author: bwsw   File: transformations_validator.py   (license)
def validate(self, transformations):
        new_fields = []
        for transformation in transformations:
            if isinstance(transformation, FieldTransformation):  # it's a transformed name
                if isinstance(transformation.operation, str):  # it's rename
                    renamed_field = self.__get_field(transformation.operation)
                    new_fields.append(types.StructField(transformation.field_name, renamed_field.dataType))
                else:  # it's a SyntaxTree
                    field_type = self._validate_syntax_tree(transformation.operation)
                    new_fields.append(types.StructField(transformation.field_name, field_type))
            else:  # no transforms
                new_fields.append(self.__get_field(transformation))
        return types.StructType(new_fields)