Python pyspark.sql.types.LongType() Examples

The following are 22 code examples of pyspark.sql.types.LongType(), drawn from open-source projects. Each example is preceded by its original project and source file, so you can follow the code in its full context. You may also want to check out all available functions and classes of the pyspark.sql.types module.
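
Before the project examples, here is a minimal, self-contained sketch of the most common use of LongType(): declaring a 64-bit integer column in an explicit DataFrame schema. The column names and sample rows below are invented for illustration.

from pyspark.sql import SparkSession
from pyspark.sql.types import LongType, StringType, StructField, StructType

spark = SparkSession.builder.getOrCreate()

schema = StructType([
    StructField("name", StringType(), True),
    StructField("count", LongType(), True),   # 64-bit signed integer column
])

df = spark.createDataFrame([("a", 1), ("b", 2)], schema=schema)
df.printSchema()   # count: long (nullable = true)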
Example #1
Source File: base.py    From koalas with Apache License 2.0
def numpy_column_op(f):
    @wraps(f)
    def wrapper(self, *args):
        # PySpark does not support NumPy type out of the box. For now, we convert NumPy types
        # into some primitive types understandable in PySpark.
        new_args = []
        for arg in args:
            # TODO: This is a quick hack to support NumPy type. We should revisit this.
            if isinstance(self.spark.data_type, LongType) and isinstance(arg, np.timedelta64):
                new_args.append(float(arg / np.timedelta64(1, "s")))
            else:
                new_args.append(arg)
        return column_op(f)(self, *new_args)

    return wrapper 
Example #2
Source File: dfutil.py    From TensorFlowOnSpark with Apache License 2.0
def infer_schema(example, binary_features=[]):
  """Given a tf.train.Example, infer the Spark DataFrame schema (StructFields).

  Note: TensorFlow represents both strings and binary types as tf.train.BytesList, and we need to
  disambiguate these for the corresponding Spark DataFrame types (StringType vs. BinaryType), so we require
  a "hint" from the caller via the ``binary_features`` argument.

  Args:
    :example: a tf.train.Example
    :binary_features: a list of tf.train.Example features which are expected to be binary/bytearrays.

  Returns:
    A DataFrame StructType schema
  """
  def _infer_sql_type(k, v):
    # special handling for binary features
    if k in binary_features:
      return BinaryType()

    if v.int64_list.value:
      result = v.int64_list.value
      sql_type = LongType()
    elif v.float_list.value:
      result = v.float_list.value
      sql_type = DoubleType()
    else:
      result = v.bytes_list.value
      sql_type = StringType()

    if len(result) > 1:             # represent multi-item tensors as Spark SQL ArrayType() of base types
      return ArrayType(sql_type)
    else:                           # represent everything else as base types (and empty tensors as StringType())
      return sql_type

  return StructType([StructField(k, _infer_sql_type(k, v), True) for k, v in sorted(example.features.feature.items())]) 
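
As a rough usage sketch (not from the project), the helper above can be driven with a hand-built tf.train.Example; an int64 feature comes back as LongType(). This assumes TensorFlow is installed and that infer_schema is importable from TensorFlowOnSpark's dfutil module.

import tensorflow as tf
from tensorflowonspark import dfutil   # assumed import path for the module shown above

example = tf.train.Example(features=tf.train.Features(feature={
    "id": tf.train.Feature(int64_list=tf.train.Int64List(value=[42])),
}))

print(dfutil.infer_schema(example))   # StructType with a single nullable LongType field named 'id'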
Example #3
Source File: hostlinks_to_graph.py    From cc-pyspark with MIT License
def vertices_assign_ids(self, sc, sqlc, edges):
        source = edges.select(edges.s.alias('name'))
        target = edges.select(edges.t.alias('name'))

        ids = source.union(target) \
            .distinct()

        if self.args.validate_host_names:
            is_valid = sqlf.udf(HostLinksToGraph.reverse_host_is_valid,
                                BooleanType())
            ids = ids.filter(is_valid(ids.name))

        if self.args.vertex_partitions == 1:
            ids = ids \
                    .coalesce(1) \
                    .sort('name') \
                    .withColumn('id', sqlf.monotonically_increasing_id())
        else:
            id_rdd = ids.select(ids.name).rdd \
                        .map(lambda row: tuple(row)[0]) \
                        .sortBy(lambda x: x, True,
                                self.args.vertex_partitions) \
                        .zipWithIndex()
            id_schema = StructType([
                StructField("name", StringType(), True),
                StructField("id", LongType(), True)
            ])
            ids = sqlc.createDataFrame(id_rdd, schema=id_schema)

        if self.args.save_as_text is not None:
            ids = ids.persist()
            ids.select(sqlf.concat_ws('\t', ids.id, ids.name)) \
                .write \
                .text(os.path.join(self.args.save_as_text, "vertices"),
                      compression="gzip")
        ids.write \
           .format(self.args.output_format) \
           .option("compression", self.args.output_compression) \
           .saveAsTable(self.args.output + '_vertices')

        return ids 
Example #4
Source File: test_spark.py    From mlflow with Apache License 2.0
def test_spark_udf(spark, model_path):
    mlflow.pyfunc.save_model(
        path=model_path,
        loader_module=__name__,
        code_path=[os.path.dirname(tests.__file__)],
    )
    reloaded_pyfunc_model = mlflow.pyfunc.load_pyfunc(model_path)

    pandas_df = pd.DataFrame(data=np.ones((10, 10)), columns=[str(i) for i in range(10)])
    spark_df = spark.createDataFrame(pandas_df)

    # Test all supported return types
    type_map = {"float": (FloatType(), np.number),
                "int": (IntegerType(), np.int32),
                "double": (DoubleType(), np.number),
                "long": (LongType(), np.int),
                "string": (StringType(), None)}

    for tname, tdef in type_map.items():
        spark_type, np_type = tdef
        prediction_df = reloaded_pyfunc_model.predict(pandas_df)
        for is_array in [True, False]:
            t = ArrayType(spark_type) if is_array else spark_type
            if tname == "string":
                expected = prediction_df.applymap(str)
            else:
                expected = prediction_df.select_dtypes(np_type)
                if tname == "float":
                    expected = expected.astype(np.float32)

            expected = [list(row[1]) if is_array else row[1][0] for row in expected.iterrows()]
            pyfunc_udf = spark_udf(spark, model_path, result_type=t)
            new_df = spark_df.withColumn("prediction", pyfunc_udf(*pandas_df.columns))
            actual = list(new_df.select("prediction").toPandas()['prediction'])
            assert expected == actual
            if not is_array:
                pyfunc_udf = spark_udf(spark, model_path, result_type=tname)
                new_df = spark_df.withColumn("prediction", pyfunc_udf(*pandas_df.columns))
                actual = list(new_df.select("prediction").toPandas()['prediction'])
                assert expected == actual 
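
Outside of the test harness above, the same API is typically called as in the hedged sketch below; the model URI, spark_df, and the feature column names are placeholders, and it assumes an MLflow version whose result_type accepts a pyspark DataType, as the test passes above.

import mlflow.pyfunc
from pyspark.sql.types import LongType

# "models:/my_model/1", spark_df and the feature columns are placeholders for illustration
pyfunc_udf = mlflow.pyfunc.spark_udf(spark, "models:/my_model/1", result_type=LongType())
scored = spark_df.withColumn("prediction", pyfunc_udf("feature_a", "feature_b"))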
Example #5
Source File: reader.py    From HoloClean-Legacy-deprecated with Apache License 2.0
def read(self, file_path, spark_session, indexcol=0, schema=None):
        """
        Creates a dataframe from the csv file

        :param indexcol: if 1, create a tuple id column as auto increment
        :param schema: optional schema of file if known
        :param spark_session: The spark_session we created in Holoclean object
        :param file_path: The path to the file

        :return: dataframe
        """
        if schema is None:
            df = spark_session.read.csv(file_path, header=True)
        else:
            df = spark_session.read.csv(file_path, header=True, schema=schema)

        if indexcol == 0:
            return df

        index_name = GlobalVariables.index_name

        new_cols = df.schema.names + [index_name]
        list_schema = []
        for index_attribute in range(len(df.schema.names)):
            list_schema.append(StructField("_" + str(index_attribute),
                                           df.schema[
                                               index_attribute].dataType,
                                           True))
        list_schema.append(
            StructField("_" + str(len(new_cols)), LongType(), True))

        schema = StructType(list_schema)
        ix_df = df.rdd.zipWithIndex().map(
            lambda row_ix: tuple(row_ix[0]) + (row_ix[1] + 1,)).toDF(schema)
        tmp_cols = ix_df.schema.names
        new_df = reduce(lambda data, idx: data.withColumnRenamed(tmp_cols[idx],
                        new_cols[idx]),
                        range(len(tmp_cols)), ix_df)
        new_df = self.checking_string_size(new_df)
        return new_df 
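
Both this example and Example #3 rely on the same pattern: RDD.zipWithIndex() to manufacture a LongType id column. A stripped-down sketch of that pattern follows (the column name "_tid_" and the sample rows are invented, and spark is assumed to be an active SparkSession).

from pyspark.sql.types import LongType, StructField, StructType

df = spark.createDataFrame([("alice",), ("bob",)], ["name"])

indexed_schema = StructType(df.schema.fields + [StructField("_tid_", LongType(), True)])
indexed_df = (df.rdd
                .zipWithIndex()                                   # yields (Row, index) pairs
                .map(lambda pair: tuple(pair[0]) + (pair[1] + 1,))
                .toDF(indexed_schema))
indexed_df.show()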
Example #6
Source File: strings.py    From koalas with Apache License 2.0
def len(self) -> "ks.Series":
        """
        Computes the length of each element in the Series.

        The element may be a sequence (such as a string, tuple or list).

        Returns
        -------
        Series of int
            A Series of integer values indicating the length of each element in
            the Series.

        Examples
        --------
        Returns the length (number of characters) in a string. Returns the
        number of entries for lists or tuples.

        >>> s1 = ks.Series(['dog', 'monkey'])
        >>> s1.str.len()
        0    3
        1    6
        Name: 0, dtype: int64

        >>> s2 = ks.Series([["a", "b", "c"], []])
        >>> s2.str.len()
        0    3
        1    0
        Name: 0, dtype: int64
        """
        if isinstance(self._data.spark.data_type, (ArrayType, MapType)):
            return column_op(lambda c: F.size(c).cast(LongType()))(self._data).alias(
                self._data.name
            )
        else:
            return column_op(lambda c: F.length(c).cast(LongType()))(self._data).alias(
                self._data.name
            ) 
Example #7
Source File: typehints.py    From koalas with Apache License 2.0
def as_spark_type(tpe) -> types.DataType:
    """
    Given a python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - python3's typing system
    """
    if tpe in (str, "str", "string"):
        return types.StringType()
    elif tpe in (bytes,):
        return types.BinaryType()
    elif tpe in (np.int8, "int8", "byte"):
        return types.ByteType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    elif tpe in (int, "int", np.int, np.int32):
        return types.IntegerType()
    elif tpe in (np.int64, "int64", "long", "bigint"):
        return types.LongType()
    elif tpe in (float, "float", np.float):
        return types.FloatType()
    elif tpe in (np.float64, "float64", "double"):
        return types.DoubleType()
    elif tpe in (datetime.datetime, np.datetime64):
        return types.TimestampType()
    elif tpe in (datetime.date,):
        return types.DateType()
    elif tpe in (bool, "boolean", "bool", np.bool):
        return types.BooleanType()
    elif tpe in (np.ndarray,):
        # TODO: support other child types
        return types.ArrayType(types.StringType())
    else:
        raise TypeError("Type %s was not understood." % tpe) 
Example #8
Source File: indexing.py    From koalas with Apache License 2.0
def _select_rows_by_iterable(
        self, rows_sel: Iterable
    ) -> Tuple[Optional[spark.Column], Optional[int], Optional[int]]:
        sdf = self._internal.spark_frame

        if any(isinstance(key, (int, np.int, np.int64, np.int32)) and key < 0 for key in rows_sel):
            offset = sdf.count()
        else:
            offset = 0

        new_rows_sel = []
        for key in list(rows_sel):
            if not isinstance(key, (int, np.int, np.int64, np.int32)):
                raise TypeError(
                    "cannot do positional indexing with these indexers [{}] of {}".format(
                        key, type(key)
                    )
                )
            if key < 0:
                key = key + offset
            new_rows_sel.append(key)

        if len(new_rows_sel) != len(set(new_rows_sel)):
            raise NotImplementedError(
                "Duplicated row selection is not currently supported; "
                "however, normalised index was [%s]" % new_rows_sel
            )

        sequence_scol = sdf[self._sequence_col]
        cond = []
        for key in new_rows_sel:
            cond.append(sequence_scol == F.lit(int(key)).cast(LongType()))

        if len(cond) == 0:
            cond = [F.lit(False)]
        return reduce(lambda x, y: x | y, cond), None, None 
Example #9
Source File: datetimes.py    From koalas with Apache License 2.0
def week(self) -> "ks.Series":
        """
        The week ordinal of the year.
        """
        return column_op(lambda c: F.weekofyear(c).cast(LongType()))(self._data).alias(
            self._data.name
        ) 
Example #10
Source File: datetimes.py    From koalas with Apache License 2.0
def minute(self) -> "ks.Series":
        """
        The minutes of the datetime.
        """
        return column_op(lambda c: F.minute(c).cast(LongType()))(self._data).alias(self._data.name) 
Example #11
Source File: datetimes.py    From koalas with Apache License 2.0
def hour(self) -> "ks.Series":
        """
        The hours of the datetime.
        """
        return column_op(lambda c: F.hour(c).cast(LongType()))(self._data).alias(self._data.name) 
Example #12
Source File: datetimes.py    From koalas with Apache License 2.0
def day(self) -> "ks.Series":
        """
        The days of the datetime.
        """
        return column_op(lambda c: F.dayofmonth(c).cast(LongType()))(self._data).alias(
            self._data.name
        ) 
Example #13
Source File: datetimes.py    From koalas with Apache License 2.0
def month(self) -> "ks.Series":
        """
        The month of the timestamp as January = 1, December = 12.
        """
        return column_op(lambda c: F.month(c).cast(LongType()))(self._data).alias(self._data.name) 
Example #14
Source File: datetimes.py    From koalas with Apache License 2.0
def year(self) -> "ks.Series":
        """
        The year of the datetime.
        """
        return column_op(lambda c: F.year(c).cast(LongType()))(self._data).alias(self._data.name) 
Example #15
Source File: codecs.py    From petastorm with Apache License 2.0
def encode(self, unischema_field, value):
        # Lazy loading pyspark to avoid creating pyspark dependency on data reading code path
        # (currently works only with make_batch_reader). We should move all pyspark related code into a separate module
        import pyspark.sql.types as sql_types

        # We treat ndarrays with shape=() as scalars
        unsized_numpy_array = isinstance(value, np.ndarray) and value.shape == ()
        # Validate the input to be a scalar (or an unsized numpy array)
        if not unsized_numpy_array and hasattr(value, '__len__') and (not isinstance(value, str)):
            raise TypeError('Expected a scalar as a value for field \'{}\'. '
                            'Got a non-numpy type \'{}\''.format(unischema_field.name, type(value)))

        if unischema_field.shape:
            raise ValueError('The shape field of unischema_field \'%s\' must be an empty tuple (i.e. \'()\') '
                             'to indicate a scalar. However, the actual shape is %s'
                             % (unischema_field.name, unischema_field.shape))
        if isinstance(self._spark_type, (sql_types.ByteType, sql_types.ShortType, sql_types.IntegerType,
                                         sql_types.LongType)):
            return int(value)
        if isinstance(self._spark_type, (sql_types.FloatType, sql_types.DoubleType)):
            return float(value)
        if isinstance(self._spark_type, sql_types.BooleanType):
            return bool(value)
        if isinstance(self._spark_type, sql_types.StringType):
            if not isinstance(value, str):
                raise ValueError(
                    'Expected a string value for field {}. Got type {}'.format(unischema_field.name, type(value)))
            return str(value)

        return value 
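
A hedged sketch of how the encode method above is reached in petastorm: a UnischemaField whose codec is ScalarCodec(LongType()) coerces a numpy scalar to a plain Python int. The field name 'id' and the value are invented.

import numpy as np
from pyspark.sql.types import LongType
from petastorm.codecs import ScalarCodec
from petastorm.unischema import UnischemaField

id_field = UnischemaField('id', np.int64, (), ScalarCodec(LongType()), False)
encoded = id_field.codec.encode(id_field, np.int64(7))
print(encoded, type(encoded))   # 7 <class 'int'>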
Example #16
Source File: test_unischema.py    From petastorm with Apache License 2.0
def test_create_schema_view_fails_validate():
    """ Exercises code paths unischema.create_schema_view ValueError, and unischema.__str__."""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
    ])
    with pytest.raises(ValueError, match='does not belong to the schema'):
        TestSchema.create_schema_view([UnischemaField('id', np.int64, (), ScalarCodec(LongType()), False)]) 
Example #17
Source File: test_end_to_end.py    From petastorm with Apache License 2.0
def test_invalid_schema_field(synthetic_dataset, reader_factory):
    # Let's assume we are selecting columns using a schema which is different from the one
    # stored in the dataset. Would expect to get a reasonable error message
    BogusSchema = Unischema('BogusSchema', [
        UnischemaField('partition_key', np.string_, (), ScalarCodec(StringType()), False),
        UnischemaField('id', np.int64, (), ScalarCodec(LongType()), False),
        UnischemaField('bogus_key', np.int32, (), ScalarCodec(ShortType()), False)])

    expected_values = {'bogus_key': 11, 'id': 1}
    with pytest.raises(ValueError, match='bogus_key'):
        reader_factory(synthetic_dataset.url, schema_fields=BogusSchema.fields.values(),
                       shuffle_row_groups=False,
                       predicate=EqualPredicate(expected_values)) 
Example #18
Source File: unischema.py    From petastorm with Apache License 2.0
def _numpy_to_spark_mapping():
    """Returns a mapping from numpy to pyspark.sql type. Caches the mapping dictionary inorder to avoid instantiation
    of multiple objects in each call."""

    # Refer to the attribute of the function we use to cache the map using a name in the variable instead of a 'dot'
    # notation to avoid copy/paste/typo mistakes
    cache_attr_name = 'cached_numpy_to_pyspark_types_map'
    if not hasattr(_numpy_to_spark_mapping, cache_attr_name):
        import pyspark.sql.types as T

        setattr(_numpy_to_spark_mapping, cache_attr_name,
                {
                    np.int8: T.ByteType(),
                    np.uint8: T.ShortType(),
                    np.int16: T.ShortType(),
                    np.uint16: T.IntegerType(),
                    np.int32: T.IntegerType(),
                    np.int64: T.LongType(),
                    np.float32: T.FloatType(),
                    np.float64: T.DoubleType(),
                    np.string_: T.StringType(),
                    np.str_: T.StringType(),
                    np.unicode_: T.StringType(),
                    np.bool_: T.BooleanType(),
                })

    return getattr(_numpy_to_spark_mapping, cache_attr_name)


# TODO: Changing fields in this class or the UnischemaField will break reading due to the schema being pickled next to
# the dataset on disk 
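
For illustration, the cached mapping can be queried directly; note that it is a private helper, so this is only a sketch:

import numpy as np
from petastorm.unischema import _numpy_to_spark_mapping

print(_numpy_to_spark_mapping()[np.int64])   # LongType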
Example #19
Source File: streaming.py    From incubator-spot with Apache License 2.0
def schema(self):
        '''
            Return the data type that represents a row from the received data list.
        '''
        from pyspark.sql.types import IntegerType, LongType, StringType, StructField, StructType

        return StructType(
            [
                StructField('p_date', StringType(), True),
                StructField('p_time', StringType(), True),
                StructField('clientip', StringType(), True),
                StructField('host', StringType(), True),
                StructField('reqmethod', StringType(), True),
                StructField('useragent', StringType(), True),
                StructField('resconttype', StringType(), True),
                StructField('duration', LongType(), True),
                StructField('username', StringType(), True),
                StructField('authgroup', StringType(), True),
                StructField('exceptionid', StringType(), True),
                StructField('filterresult', StringType(), True),
                StructField('webcat', StringType(), True),
                StructField('referer', StringType(), True),
                StructField('respcode', StringType(), True),
                StructField('action', StringType(), True),
                StructField('urischeme', StringType(), True),
                StructField('uriport', StringType(), True),
                StructField('uripath', StringType(), True),
                StructField('uriquery', StringType(), True),
                StructField('uriextension', StringType(), True),
                StructField('serverip', StringType(), True),
                StructField('scbytes', IntegerType(), True),
                StructField('csbytes', IntegerType(), True),
                StructField('virusid', StringType(), True),
                StructField('bcappname', StringType(), True),
                StructField('bcappoper', StringType(), True),
                StructField('fulluri', StringType(), True),
                StructField('y', StringType(), True),
                StructField('m', StringType(), True),
                StructField('d', StringType(), True),
                StructField('h', StringType(), True)
            ]
        ) 
Example #20
Source File: taar_lite_guidguid.py    From telemetry-airflow with Mozilla Public License 2.0
def transform(longitudinal_addons):
    # Only for logging, not used, but may be interesting for later analysis.
    guid_set_unique = (
        longitudinal_addons.withColumn(
            "exploded", F.explode(longitudinal_addons.installed_addons)
        )
        .select("exploded")  # noqa: E501 - long lines
        .rdd.flatMap(lambda x: x)
        .distinct()
        .collect()
    )
    logging.info(
        "Number of unique guids co-installed in sample: " + str(len(guid_set_unique))
    )

    restructured = longitudinal_addons.rdd.flatMap(
        lambda x: key_all(x.installed_addons)
    ).toDF(["key_addon", "coinstalled_addons"])

    # Explode the list of co-installs and count pair occurrences.
    addon_co_installations = (
        restructured.select(
            "key_addon", F.explode("coinstalled_addons").alias("coinstalled_addon")
        )  # noqa: E501 - long lines
        .groupBy("key_addon", "coinstalled_addon")
        .count()
    )

    # Collect the set of coinstalled_addon, count pairs for each key_addon.
    combine_and_map_cols = F.udf(
        lambda x, y: (x, y),
        StructType([StructField("id", StringType()), StructField("n", LongType())]),
    )

    # Spark functions are sometimes long and unwieldy. Tough luck.
    # Ignore E128 and E501 long line errors
    addon_co_installations_collapsed = (
        addon_co_installations.select(  # noqa: E128
            "key_addon",
            combine_and_map_cols("coinstalled_addon", "count").alias(  # noqa: E501
                "id_n"
            ),
        )
        .groupby("key_addon")
        .agg(F.collect_list("id_n").alias("coinstallation_counts"))
    )
    logging.info(addon_co_installations_collapsed.printSchema())
    logging.info("Collecting final result of co-installations.")

    return addon_co_installations_collapsed 
Example #21
Source File: taar_lite_guidguid.py    From python_mozetl with MIT License
def transform(longitudinal_addons):
    # Only for logging, not used, but may be interesting for later analysis.
    guid_set_unique = (
        longitudinal_addons.withColumn(
            "exploded", F.explode(longitudinal_addons.installed_addons)
        )
        .select("exploded")  # noqa: E501 - long lines
        .rdd.flatMap(lambda x: x)
        .distinct()
        .collect()
    )
    logging.info(
        "Number of unique guids co-installed in sample: " + str(len(guid_set_unique))
    )

    restructured = longitudinal_addons.rdd.flatMap(
        lambda x: key_all(x.installed_addons)
    ).toDF(["key_addon", "coinstalled_addons"])

    # Explode the list of co-installs and count pair occurrences.
    addon_co_installations = (
        restructured.select(
            "key_addon", F.explode("coinstalled_addons").alias("coinstalled_addon")
        )  # noqa: E501 - long lines
        .groupBy("key_addon", "coinstalled_addon")
        .count()
    )

    # Collect the set of coinstalled_addon, count pairs for each key_addon.
    combine_and_map_cols = F.udf(
        lambda x, y: (x, y),
        StructType([StructField("id", StringType()), StructField("n", LongType())]),
    )

    # Spark functions are sometimes long and unwieldy. Tough luck.
    # Ignore E128 and E501 long line errors
    addon_co_installations_collapsed = (
        addon_co_installations.select(  # noqa: E128
            "key_addon",
            combine_and_map_cols("coinstalled_addon", "count").alias(  # noqa: E501
                "id_n"
            ),
        )
        .groupby("key_addon")
        .agg(F.collect_list("id_n").alias("coinstallation_counts"))
    )
    logging.info(addon_co_installations_collapsed.printSchema())
    logging.info("Collecting final result of co-installations.")

    return addon_co_installations_collapsed 
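
Examples #20 and #21 share the same transform; the LongType usage in both is the combine_and_map_cols UDF, which packs a (string, count) pair into a struct column. A self-contained sketch of just that piece, with invented data and spark assumed to be an active SparkSession:

from pyspark.sql import functions as F
from pyspark.sql.types import LongType, StringType, StructField, StructType

pair_type = StructType([StructField("id", StringType()), StructField("n", LongType())])
make_pair = F.udf(lambda addon_id, n: (addon_id, n), pair_type)

df = spark.createDataFrame([("addon-a", 3), ("addon-b", 1)], ["coinstalled_addon", "count"])
df.select(make_pair("coinstalled_addon", "count").alias("id_n")).show(truncate=False)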
Example #22
Source File: test_sync_bookmark.py    From python_mozetl with MIT License
def sync_summary_schema():
    """"Generate a schema for sync_summary. This subset contains enough
    structure for testing bookmark validation. The schema is derived from
    [`telemetry-batch-view`][1].

    [1]: https://git.io/vdQ5A
    """
    failure_type = StructType([StructField("name", StringType(), False)])

    status_type = StructType([StructField("sync", StringType(), True)])

    validation_problems = StructType(
        [
            StructField("name", StringType(), False),
            StructField("count", LongType(), False),
        ]
    )

    validation_type = StructType(
        [
            StructField("version", LongType(), False),
            StructField("checked", LongType(), False),
            StructField("took", LongType(), False),
            StructField("problems", ArrayType(validation_problems, False), True),
        ]
    )

    engine_type = StructType(
        [
            StructField("name", StringType(), False),
            StructField("status", StringType(), False),
            StructField("failure_reason", failure_type, True),
            StructField("validation", validation_type, True),
        ]
    )

    return StructType(
        [
            StructField("app_build_id", StringType(), True),
            StructField("app_version", StringType(), True),
            StructField("app_display_version", StringType(), True),
            StructField("app_name", StringType(), True),
            StructField("app_channel", StringType(), True),
            StructField("uid", StringType(), False),
            StructField("device_id", StringType(), True),
            StructField("when", LongType(), False),
            StructField("failure_reason", failure_type, True),
            StructField("status", status_type, False),
            StructField("engines", ArrayType(engine_type, False), True),
            StructField("submission_date_s3", StringType(), False),
        ]
    )
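
The schema builder above is only exercised indirectly in the tests; a quick way to inspect it is to create an empty DataFrame from it (spark is assumed to be an active SparkSession):

empty = spark.createDataFrame([], schema=sync_summary_schema())
empty.printSchema()   # 'when' and the validation counters appear as long (LongType)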