Python pyspark.sql.functions.concat() Examples

The following are 7 code examples of pyspark.sql.functions.concat(), drawn from open-source projects. Each example lists its source file, the project it comes from, and its license. You may also want to check out the other functions and classes available in the pyspark.sql.functions module.
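For orientation before the examples: F.concat() takes one or more column arguments and joins their values end to end. If any input is NULL, the result is NULL; concat_ws() is the separator-aware alternative that skips NULLs. A minimal sketch with made-up data and column names:

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("Ada", "Lovelace"), ("Alan", None)], ["first", "last"])

# 'Ada Lovelace' for the first row; NULL for the second, because one input is NULL.
df.select(F.concat(F.col("first"), F.lit(" "), F.col("last")).alias("full_name")).show()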
Example #1
Source File: compiler.py    From ibis with Apache License 2.0
from pyspark.sql import functions as F

def compile_string_concat(t, expr, scope, **kwargs):
    op = expr.op()

    # Translate the ibis string-concat arguments into Spark columns,
    # then splat them into F.concat().
    src_columns = t.translate(op.arg, scope)
    return F.concat(*src_columns)
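Because F.concat() is variadic, the translated list of source columns can be splatted straight into the call, as above. A standalone equivalent, reusing df and F from the opening sketch:

cols = [F.col("last"), F.lit(", "), F.col("first")]
df.select(F.concat(*cols).alias("last_first")).show()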
Example #2
Source File: compiler.py    From ibis with Apache License 2.0
from pyspark.sql import functions as F

def compile_array_concat(t, expr, scope, **kwargs):
    op = expr.op()

    # Translate both operands, then let F.concat() join the two array columns.
    left = t.translate(op.left, scope)
    right = t.translate(op.right, scope)
    return F.concat(left, right)
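This compiles ibis array concatenation to the same Spark function: since Spark 2.4, F.concat() also accepts array columns and returns their concatenation. A minimal sketch, reusing the spark session from the opening sketch:

arr_df = spark.createDataFrame([([1, 2], [3])], ["left", "right"])

# Yields a single array column [1, 2, 3] per row.
arr_df.select(F.concat("left", "right").alias("combined")).show()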
Example #3
Source File: Provider.py    From cccatalog with MIT License
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, concat, lit, when

def getData(self):
    spk = SparkSession.builder.getOrCreate()
    dataDF = spk.read.parquet(self.input)

    # Build url = domain + path + optional '?query'; otherwise('') keeps
    # concat() from returning NULL when the query string is empty.
    providerDF = (
        dataDF.select(
            concat(
                concat('provider_domain', 'content_path'),
                when(col('content_query_string') != '',
                     concat(lit('?'), col('content_query_string')))
                .otherwise(lit('')),
            ).alias('url'),
            concat('warc_segment', lit('/warc/'), 'warc_filename').alias('warc_filename'),
            'content_offset',
            'deflate_length',
        )
        .where(col('provider_domain').like('%{}'.format(self.domain)))
        .dropDuplicates(['url'])
    )

    # Convert the dataframe into a list of tab-delimited rows.
    providerData = providerDF.rdd.map(lambda row: '\t'.join([str(field) for field in row])).collect()

    return providerData
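The nested when(...).otherwise(lit('')) above is what keeps the URL NULL-safe: when() without an otherwise() branch returns NULL for non-matching rows, and a single NULL input makes concat() return NULL for the whole URL. A stripped-down sketch of the same pattern (column names hypothetical, using the pyspark.sql.functions imports above):

url_df.select(
    concat(
        col('domain'),
        when(col('qs') != '', concat(lit('?'), col('qs'))).otherwise(lit('')),
    ).alias('url')
)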
Example #4
Source File: base.py    From koalas with Apache License 2.0
def __add__(self, other):
    if isinstance(self.spark.data_type, StringType):
        # Concatenate string columns
        if isinstance(other, IndexOpsMixin) and isinstance(other.spark.data_type, StringType):
            return column_op(F.concat)(self, other)
        # Handle df['col'] + 'literal'
        elif isinstance(other, str):
            return column_op(F.concat)(self, F.lit(other))
        else:
            raise TypeError("string addition can only be applied to string series or literals.")
    else:
        # Non-string columns use Spark's ordinary + semantics.
        return column_op(Column.__add__)(self, other)
Example #5
Source File: base.py    From koalas with Apache License 2.0
def __radd__(self, other):
    # Handle 'literal' + df['col']
    if isinstance(self.spark.data_type, StringType) and isinstance(other, str):
        return self._with_new_scol(F.concat(F.lit(other), self.spark.column))
    else:
        return column_op(Column.__radd__)(self, other)
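Together, __add__ and __radd__ let a Koalas string series support + in either operand order, pandas-style, with both paths compiling down to F.concat(). A hypothetical usage:

import databricks.koalas as ks

s = ks.Series(["a", "b"])
s + "_x"   # __add__:  F.concat(column, lit('_x'))  ->  ['a_x', 'b_x']
"y_" + s   # __radd__: F.concat(lit('y_'), column)  ->  ['y_a', 'y_b']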
Example #6
Source File: series.py    From koalas with Apache License 2.0
def add_prefix(self, prefix):
        """
        Prefix labels with string `prefix`.

        For Series, the row labels are prefixed.
        For DataFrame, the column labels are prefixed.

        Parameters
        ----------
        prefix : str
           The string to add before each label.

        Returns
        -------
        Series
           New Series with updated labels.

        See Also
        --------
        Series.add_suffix: Suffix column labels with string `suffix`.
        DataFrame.add_suffix: Suffix column labels with string `suffix`.
        DataFrame.add_prefix: Prefix column labels with string `prefix`.

        Examples
        --------
        >>> s = ks.Series([1, 2, 3, 4])
        >>> s
        0    1
        1    2
        2    3
        3    4
        Name: 0, dtype: int64

        >>> s.add_prefix('item_')
        item_0    1
        item_1    2
        item_2    3
        item_3    4
        Name: 0, dtype: int64
        """
        assert isinstance(prefix, str)
        internal = self.to_frame()._internal
        sdf = internal.spark_frame.select(
            [
                F.concat(F.lit(prefix), index_spark_column).alias(index_spark_column_name)
                for index_spark_column, index_spark_column_name in zip(
                    internal.index_spark_columns, internal.index_spark_column_names
                )
            ]
            + internal.data_spark_columns
        )
        return first_series(DataFrame(internal.with_new_sdf(sdf))) 
Example #7
Source File: series.py    From koalas with Apache License 2.0
def add_suffix(self, suffix):
        """
        Suffix labels with string `suffix`.

        For Series, the row labels are suffixed.
        For DataFrame, the column labels are suffixed.

        Parameters
        ----------
        suffix : str
           The string to add after each label.

        Returns
        -------
        Series
           New Series with updated labels.

        See Also
        --------
        Series.add_prefix: Prefix row labels with string `prefix`.
        DataFrame.add_prefix: Prefix column labels with string `prefix`.
        DataFrame.add_suffix: Suffix column labels with string `suffix`.

        Examples
        --------
        >>> s = ks.Series([1, 2, 3, 4])
        >>> s
        0    1
        1    2
        2    3
        3    4
        Name: 0, dtype: int64

        >>> s.add_suffix('_item')
        0_item    1
        1_item    2
        2_item    3
        3_item    4
        Name: 0, dtype: int64
        """
        assert isinstance(suffix, str)
        internal = self.to_frame()._internal
        sdf = internal.spark_frame.select(
            [
                F.concat(index_spark_column, F.lit(suffix)).alias(index_spark_column_name)
                for index_spark_column, index_spark_column_name in zip(
                    internal.index_spark_columns, internal.index_spark_column_names
                )
            ]
            + internal.data_spark_columns
        )
        return first_series(DataFrame(internal.with_new_sdf(sdf)))