Python pyspark.sql.Row() Examples

The following are code examples of pyspark.sql.Row(), collected from open-source projects; the original project and source file are noted above each example. You may also want to check out the other available functions and classes of the pyspark.sql module.
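Before the examples, here is a minimal sketch of how a Row behaves on its own (the variable names are illustrative; only pyspark needs to be installed):

from pyspark.sql import Row

# Build a Row with named fields; values are accessible by attribute, by name, or by position.
person = Row(name="Alice", age=2)
print(person.name)       # 'Alice'
print(person["age"])     # 2
print(person.asDict())   # dict of field name -> value
# A list of such Rows can be passed straight to spark.createDataFrame(...), as many of the
# examples below do.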
Example #1
Source File: functions.py    From LearningApacheSpark with MIT License
def randn(seed=None):
    """Generates a column with independent and identically distributed (i.i.d.) samples from
    the standard normal distribution.

    .. note:: The function is non-deterministic in the general case.

    >>> df.withColumn('randn', randn(seed=42)).collect()
    [Row(age=2, name=u'Alice', randn=-0.7556247885860078),
    Row(age=5, name=u'Bob', randn=-0.0861619008451133)]
    """
    sc = SparkContext._active_spark_context
    if seed is not None:
        jc = sc._jvm.functions.randn(seed)
    else:
        jc = sc._jvm.functions.randn()
    return Column(jc) 
Example #2
Source File: session.py    From tidb-docker-compose with Apache License 2.0
def _inferSchemaFromList(self, data, names=None):
        """
        Infer schema from list of Row or tuple.

        :param data: list of Row or tuple
        :param names: list of column names
        :return: :class:`pyspark.sql.types.StructType`
        """
        if not data:
            raise ValueError("can not infer schema from empty dataset")
        first = data[0]
        if type(first) is dict:
            warnings.warn("inferring schema from dict is deprecated,"
                          "please use pyspark.sql.Row instead")
        schema = reduce(_merge_type, (_infer_schema(row, names) for row in data))
        if _has_nulltype(schema):
            raise ValueError("Some of types cannot be determined after inferring")
        return schema 
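The helper above is internal to SparkSession; through the public API the same inference runs when a list of Rows is handed to createDataFrame. A minimal sketch, assuming an active session named spark:

from pyspark.sql import Row

rows = [Row(field1=1, field2="row1"), Row(field1=2, field2="row2")]
df = spark.createDataFrame(rows)   # schema is inferred from the Row fields
df.printSchema()
# root
#  |-- field1: long (nullable = true)
#  |-- field2: string (nullable = true)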
Example #3
Source File: session.py    From tidb-docker-compose with Apache License 2.0
def _test():
    import os
    import sys
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import Row
    import pyspark.sql.session

    os.chdir(os.environ["SPARK_HOME"])

    globs = pyspark.sql.session.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    globs['sc'] = sc
    globs['spark'] = SparkSession(sc)
    globs['rdd'] = rdd = sc.parallelize(
        [Row(field1=1, field2="row1"),
         Row(field1=2, field2="row2"),
         Row(field1=3, field2="row3")])
    globs['df'] = rdd.toDF()
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.session, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
    globs['sc'].stop()
    if failure_count:
        sys.exit(-1) 
Example #4
Source File: tests.py    From LearningApacheSpark with MIT License
def test_string_indexer_handle_invalid(self):
        df = self.spark.createDataFrame([
            (0, "a"),
            (1, "d"),
            (2, None)], ["id", "label"])

        si1 = StringIndexer(inputCol="label", outputCol="indexed", handleInvalid="keep",
                            stringOrderType="alphabetAsc")
        model1 = si1.fit(df)
        td1 = model1.transform(df)
        actual1 = td1.select("id", "indexed").collect()
        expected1 = [Row(id=0, indexed=0.0), Row(id=1, indexed=1.0), Row(id=2, indexed=2.0)]
        self.assertEqual(actual1, expected1)

        si2 = si1.setHandleInvalid("skip")
        model2 = si2.fit(df)
        td2 = model2.transform(df)
        actual2 = td2.select("id", "indexed").collect()
        expected2 = [Row(id=0, indexed=0.0), Row(id=1, indexed=1.0)]
        self.assertEqual(actual2, expected2) 
Example #5
Source File: session.py    From tidb-docker-compose with Apache License 2.0
def _monkey_patch_RDD(sparkSession):
    def toDF(self, schema=None, sampleRatio=None):
        """
        Converts current :class:`RDD` into a :class:`DataFrame`

        This is a shorthand for ``spark.createDataFrame(rdd, schema, sampleRatio)``

        :param schema: a :class:`pyspark.sql.types.StructType` or list of names of columns
        :param sampleRatio: the sample ratio of rows used for inferring
        :return: a DataFrame

        >>> rdd.toDF().collect()
        [Row(name=u'Alice', age=1)]
        """
        return sparkSession.createDataFrame(self, schema, sampleRatio)

    RDD.toDF = toDF 
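The doctest above relies on an rdd of Rows prepared in the test globals; a minimal sketch of the same flow, assuming an active SparkContext sc and SparkSession:

from pyspark.sql import Row

rdd = sc.parallelize([Row(name="Alice", age=1), Row(name="Bob", age=5)])
df = rdd.toDF()    # the patched RDD.toDF, i.e. spark.createDataFrame(rdd)
df.show()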
Example #6
Source File: keras_sql_udf_test.py    From spark-deep-learning with Apache License 2.0
def test_map_blocks_sql_1(self):
        data = [Row(x=float(x)) for x in range(5)]
        df = self.sql.createDataFrame(data)
        with IsolatedSession() as issn:
            # The placeholder that corresponds to column 'x' as a whole column
            x = tf.placeholder(tf.double, shape=[None], name="x")
            # The output that adds 3 to x
            z = tf.add(x, 3, name='z')
            # Let's register these computations in SQL.
            makeGraphUDF(issn.graph, "map_blocks_sql_1", [z], blocked=True)

        # Here we go, for the SQL users, straight from PySpark.
        df2 = df.selectExpr("map_blocks_sql_1(x) AS z")
        print("df2 = %s" % df2)
        data2 = df2.collect()
        assert len(data2) == 5, data2
        assert data2[0].z == 3.0, data2 
Example #7
Source File: keras_sql_udf_test.py    From spark-deep-learning with Apache License 2.0
def test_map_rows_sql_1(self):
        data = [Row(x=float(x)) for x in range(5)]
        df = self.sql.createDataFrame(data)
        with IsolatedSession() as issn:
            # The placeholder that corresponds to column 'x' as a whole column
            x = tf.placeholder(tf.double, shape=[], name="x")
            # The output that adds 3 to x
            z = tf.add(x, 3, name='z')
            # Let's register these computations in SQL.
            makeGraphUDF(issn.graph, "map_rows_sql_1", [z])

        # Here we go, for the SQL users, straight from PySpark.
        df2 = df.selectExpr("map_rows_sql_1(x) AS z")
        print("df2 = %s" % df2)
        data2 = df2.collect()
        assert data2[0].z == 3.0, data2 
Example #8
Source File: tf_transformer_test.py    From spark-deep-learning with Apache License 2.0
def _build_local_features(np_dtype):
    """
    Build numpy array (i.e. local) features.
    """
    # Build local features and DataFrame from it
    local_features = []
    np.random.seed(997)
    for idx in range(100):
        _dict = {'idx': idx}
        for colname, _ in _input_mapping.items():
            colvalue = np.random.randn(_tensor_size) * 100
            _dict[colname] = colvalue.astype(np_dtype).tolist()

        local_features.append(Row(**_dict))

    return local_features 
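Row(**_dict) above expands a plain dict into keyword arguments, a handy way to build Rows when the column set is only known at runtime. A standalone sketch of the pattern (the field names here are made up):

from pyspark.sql import Row

record = {"idx": 0, "colA": [0.1, 0.2]}   # illustrative column names
row = Row(**record)                       # dict keys become Row field names
print(row.idx, row.colA)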
Example #9
Source File: functions.py    From LearningApacheSpark with MIT License
def date_format(date, format):
    """
    Converts a date/timestamp/string to a value of string in the format specified by the date
    format given by the second argument.

    A pattern could be for instance `dd.MM.yyyy` and could return a string like '18.03.1993'. All
    pattern letters of the Java class `java.text.SimpleDateFormat` can be used.

    .. note:: Whenever possible, use specialized functions like `year`; these benefit from a
        specialized implementation.

    >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])
    >>> df.select(date_format('dt', 'MM/dd/yyy').alias('date')).collect()
    [Row(date=u'04/08/2015')]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.date_format(_to_java_column(date), format)) 
Example #10
Source File: session.py    From tidb-docker-compose with Apache License 2.0
def _test():
    import os
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import Row
    import pyspark.sql.session

    os.chdir(os.environ["SPARK_HOME"])

    globs = pyspark.sql.session.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    globs['sc'] = sc
    globs['spark'] = SparkSession(sc)
    globs['rdd'] = rdd = sc.parallelize(
        [Row(field1=1, field2="row1"),
         Row(field1=2, field2="row2"),
         Row(field1=3, field2="row3")])
    globs['df'] = rdd.toDF()
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.session, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
    globs['sc'].stop()
    if failure_count:
        exit(-1) 
Example #11
Source File: functions.py    From LearningApacheSpark with MIT License
def approx_count_distinct(col, rsd=None):
    """Aggregate function: returns a new :class:`Column` for approximate distinct count of
    column `col`.

    :param rsd: maximum estimation error allowed (default = 0.05). For rsd < 0.01, it is more
        efficient to use :func:`countDistinct`

    >>> df.agg(approx_count_distinct(df.age).alias('distinct_ages')).collect()
    [Row(distinct_ages=2)]
    """
    sc = SparkContext._active_spark_context
    if rsd is None:
        jc = sc._jvm.functions.approx_count_distinct(_to_java_column(col))
    else:
        jc = sc._jvm.functions.approx_count_distinct(_to_java_column(col), rsd)
    return Column(jc) 
Example #12
Source File: test_spark.py    From snorkel with Apache License 2.0
def test_decorator_mapper_memoized_none(self) -> None:
        square_hit_tracker = SquareHitTracker()

        @lambda_mapper(memoize=True)
        def square(x: DataPoint) -> DataPoint:
            fields = x.asDict()
            fields["num_squared"] = square_hit_tracker(x.num)
            if x.num == 21:
                return None
            return Row(**fields)

        x21 = self._get_x(21)
        x21_mapped = square(x21)
        self.assertIsNone(x21_mapped)
        self.assertEqual(square_hit_tracker.n_hits, 1)
        x21_mapped = square(x21)
        self.assertIsNone(x21_mapped)
        self.assertEqual(square_hit_tracker.n_hits, 1) 
Example #13
Source File: make_folds.py    From search-MjoLniR with MIT License
def convert_svmrank_to_xgboost(df: DataFrame) -> DataFrame:
    def convert_one(row: Row) -> Row:
        # For now place the .xgb right next to the svmrank files. Naming/path
        # options could be added if needed later.
        out_path = row.path + '.xgb'
        _convert_xgboost_remote(row.path, out_path)
        return Row(**dict(
            row.asDict(),
            vec_format='xgboost',
            path=out_path))

    # Each row represents potentially gigabytes, convince spark
    # to create a partition per row.
    rdd_xgb = mt.partition_per_row(df.rdd).map(convert_one)
    df_xgb = df.sql_ctx.createDataFrame(rdd_xgb, df.schema)  # type: ignore
    # Return both the xgb and svmrank datasets since
    # we aren't purging the related files. df is safe to reuse since
    # svmrank conversion returns a new dataframe with no lineage.
    return df.union(df_xgb) 
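convert_one above uses a common idiom for producing a modified copy of an immutable Row: dump it to a dict with asDict(), merge the overrides into that dict, and rebuild with Row(**...). A small sketch of the idiom on its own (the values are illustrative):

from pyspark.sql import Row

original = Row(path="/tmp/feature.svmrank", vec_format="svmrank")
updated = Row(**dict(original.asDict(), vec_format="xgboost", path=original.path + ".xgb"))
print(updated.path, updated.vec_format)   # /tmp/feature.svmrank.xgb xgboost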
Example #14
Source File: test_spark.py    From snorkel with Apache License 2.0
def test_lf_applier_spark_preprocessor_memoized(self) -> None:
        sc = SparkContext.getOrCreate()
        sql = SQLContext(sc)

        @preprocessor(memoize=True)
        def square_memoize(x: DataPoint) -> DataPoint:
            return Row(num=x.num, num_squared=x.num ** 2)

        @labeling_function(pre=[square_memoize])
        def fp_memoized(x: DataPoint) -> int:
            return 0 if x.num_squared > 42 else -1

        df = pd.DataFrame(dict(num=DATA))
        rdd = sql.createDataFrame(df).rdd
        applier = SparkLFApplier([f, fp_memoized])
        L = applier.apply(rdd)
        np.testing.assert_equal(L, L_PREPROCESS_EXPECTED) 
Example #15
Source File: functions.py    From LearningApacheSpark with MIT License
def monotonically_increasing_id():
    """A column that generates monotonically increasing 64-bit integers.

    The generated ID is guaranteed to be monotonically increasing and unique, but not consecutive.
    The current implementation puts the partition ID in the upper 31 bits, and the record number
    within each partition in the lower 33 bits. The assumption is that the data frame has
    less than 1 billion partitions, and each partition has less than 8 billion records.

    .. note:: The function is non-deterministic because its result depends on partition IDs.

    As an example, consider a :class:`DataFrame` with two partitions, each with 3 records.
    This expression would return the following IDs:
    0, 1, 2, 8589934592 (1L << 33), 8589934593, 8589934594.

    >>> df0 = sc.parallelize(range(2), 2).mapPartitions(lambda x: [(1,), (2,), (3,)]).toDF(['col1'])
    >>> df0.select(monotonically_increasing_id().alias('id')).collect()
    [Row(id=0), Row(id=1), Row(id=2), Row(id=8589934592), Row(id=8589934593), Row(id=8589934594)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.monotonically_increasing_id()) 
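Given the layout described in the docstring (partition ID in the upper bits, per-partition record number in the lower 33 bits), a generated ID can be decomposed with plain bit arithmetic; a quick sketch using the last ID from the doctest above:

generated_id = 8589934594                        # third record of the second partition
partition_id = generated_id >> 33                # -> 1
record_number = generated_id & ((1 << 33) - 1)   # -> 2
print(partition_id, record_number)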
Example #16
Source File: medline_spark.py    From pubmed_parser with MIT License
def process_file(date_update):
    """Process downloaded MEDLINE folder to parquet file"""
    print("Process MEDLINE file to parquet")
    # remove if folder still exist
    if glob(os.path.join(save_dir, 'medline_*.parquet')):
        # expand the glob in Python; subprocess does not go through a shell
        subprocess.call(['rm', '-rf'] + glob(os.path.join(save_dir, 'medline_*.parquet')))

    date_update_str = date_update.strftime("%Y_%m_%d")
    path_rdd = sc.parallelize(glob(os.path.join(download_dir, 'medline*.xml.gz')), numSlices=1000)
    parse_results_rdd = path_rdd.\
        flatMap(lambda x: [Row(file_name=os.path.basename(x), **publication_dict)
                           for publication_dict in pp.parse_medline_xml(x)])
    medline_df = parse_results_rdd.toDF()
    medline_df.write.parquet(os.path.join(save_dir, 'medline_raw_%s.parquet' % date_update_str),
                             mode='overwrite')

    window = Window.partitionBy(['pmid']).orderBy(desc('file_name'))
    windowed_df = medline_df.select(
        max('delete').over(window).alias('is_deleted'),
        rank().over(window).alias('pos'),
        '*')
    windowed_df.\
        where('is_deleted = False and pos = 1').\
        write.parquet(os.path.join(save_dir, 'medline_lastview_%s.parquet' % date_update_str),
                      mode='overwrite')

    # parse grant database
    parse_grant_rdd = path_rdd.flatMap(lambda x: pp.parse_medline_grant_id(x))\
        .filter(lambda x: x is not None)\
        .map(lambda x: Row(**x))
    grant_df = parse_grant_rdd.toDF()
    grant_df.write.parquet(os.path.join(save_dir, 'medline_grant_%s.parquet' % date_update_str),
                           mode='overwrite') 
Example #17
Source File: tests.py    From LearningApacheSpark with MIT License
def test_infer_schema(self):
        rdd = self.sc.parallelize([Row(label=1.0, features=self.dv1),
                                   Row(label=0.0, features=self.sv1)])
        df = rdd.toDF()
        schema = df.schema
        field = [f for f in schema.fields if f.name == "features"][0]
        self.assertEqual(field.dataType, self.udt)
        vectors = df.rdd.map(lambda p: p.features).collect()
        self.assertEqual(len(vectors), 2)
        for v in vectors:
            if isinstance(v, SparseVector):
                self.assertEqual(v, self.sv1)
            elif isinstance(v, DenseVector):
                self.assertEqual(v, self.dv1)
            else:
                raise TypeError("expecting a vector but got %r of type %r" % (v, type(v))) 
Example #18
Source File: functions.py    From LearningApacheSpark with MIT License
def countDistinct(col, *cols):
    """Returns a new :class:`Column` for distinct count of ``col`` or ``cols``.

    >>> df.agg(countDistinct(df.age, df.name).alias('c')).collect()
    [Row(c=2)]

    >>> df.agg(countDistinct("age", "name").alias('c')).collect()
    [Row(c=2)]
    """
    sc = SparkContext._active_spark_context
    jc = sc._jvm.functions.countDistinct(_to_java_column(col), _to_seq(sc, cols, _to_java_column))
    return Column(jc) 
Example #19
Source File: functions.py    From LearningApacheSpark with MIT License
def covar_pop(col1, col2):
    """Returns a new :class:`Column` for the population covariance of ``col1`` and ``col2``.

    >>> a = [1] * 10
    >>> b = [1] * 10
    >>> df = spark.createDataFrame(zip(a, b), ["a", "b"])
    >>> df.agg(covar_pop("a", "b").alias('c')).collect()
    [Row(c=0.0)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.covar_pop(_to_java_column(col1), _to_java_column(col2))) 
Example #20
Source File: functions.py    From LearningApacheSpark with MIT License
def isnull(col):
    """An expression that returns true iff the column is null.

    >>> df = spark.createDataFrame([(1, None), (None, 2)], ("a", "b"))
    >>> df.select(isnull("a").alias("r1"), isnull(df.a).alias("r2")).collect()
    [Row(r1=False, r2=False), Row(r1=True, r2=True)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.isnull(_to_java_column(col))) 
Example #21
Source File: functions.py    From LearningApacheSpark with MIT License
def corr(col1, col2):
    """Returns a new :class:`Column` for the Pearson Correlation Coefficient for ``col1``
    and ``col2``.

    >>> a = range(20)
    >>> b = [2 * x for x in range(20)]
    >>> df = spark.createDataFrame(zip(a, b), ["a", "b"])
    >>> df.agg(corr("a", "b").alias('c')).collect()
    [Row(c=1.0)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.corr(_to_java_column(col1), _to_java_column(col2))) 
Example #22
Source File: tests.py    From LearningApacheSpark with MIT License
def test_stopwordsremover(self):
        dataset = self.spark.createDataFrame([Row(input=["a", "panda"])])
        stopWordRemover = StopWordsRemover(inputCol="input", outputCol="output")
        # Default
        self.assertEqual(stopWordRemover.getInputCol(), "input")
        transformedDF = stopWordRemover.transform(dataset)
        self.assertEqual(transformedDF.head().output, ["panda"])
        self.assertEqual(type(stopWordRemover.getStopWords()), list)
        self.assertTrue(isinstance(stopWordRemover.getStopWords()[0], basestring))
        # Custom
        stopwords = ["panda"]
        stopWordRemover.setStopWords(stopwords)
        self.assertEqual(stopWordRemover.getInputCol(), "input")
        self.assertEqual(stopWordRemover.getStopWords(), stopwords)
        transformedDF = stopWordRemover.transform(dataset)
        self.assertEqual(transformedDF.head().output, ["a"])
        # with language selection
        stopwords = StopWordsRemover.loadDefaultStopWords("turkish")
        dataset = self.spark.createDataFrame([Row(input=["acaba", "ama", "biri"])])
        stopWordRemover.setStopWords(stopwords)
        self.assertEqual(stopWordRemover.getStopWords(), stopwords)
        transformedDF = stopWordRemover.transform(dataset)
        self.assertEqual(transformedDF.head().output, [])
        # with locale
        stopwords = ["BELKÄ°"]
        dataset = self.spark.createDataFrame([Row(input=["belki"])])
        stopWordRemover.setStopWords(stopwords).setLocale("tr")
        self.assertEqual(stopWordRemover.getStopWords(), stopwords)
        transformedDF = stopWordRemover.transform(dataset)
        self.assertEqual(transformedDF.head().output, []) 
Example #23
Source File: tests.py    From LearningApacheSpark with MIT License
def test_ngram(self):
        dataset = self.spark.createDataFrame([
            Row(input=["a", "b", "c", "d", "e"])])
        ngram0 = NGram(n=4, inputCol="input", outputCol="output")
        self.assertEqual(ngram0.getN(), 4)
        self.assertEqual(ngram0.getInputCol(), "input")
        self.assertEqual(ngram0.getOutputCol(), "output")
        transformedDF = ngram0.transform(dataset)
        self.assertEqual(transformedDF.head().output, ["a b c d", "b c d e"]) 
Example #24
Source File: tests.py    From LearningApacheSpark with MIT License
def test_string_indexer_from_labels(self):
        model = StringIndexerModel.from_labels(["a", "b", "c"], inputCol="label",
                                               outputCol="indexed", handleInvalid="keep")
        self.assertEqual(model.labels, ["a", "b", "c"])

        df1 = self.spark.createDataFrame([
            (0, "a"),
            (1, "c"),
            (2, None),
            (3, "b"),
            (4, "b")], ["id", "label"])

        result1 = model.transform(df1)
        actual1 = result1.select("id", "indexed").collect()
        expected1 = [Row(id=0, indexed=0.0), Row(id=1, indexed=2.0), Row(id=2, indexed=3.0),
                     Row(id=3, indexed=1.0), Row(id=4, indexed=1.0)]
        self.assertEqual(actual1, expected1)

        model_empty_labels = StringIndexerModel.from_labels(
            [], inputCol="label", outputCol="indexed", handleInvalid="keep")
        actual2 = model_empty_labels.transform(df1).select("id", "indexed").collect()
        expected2 = [Row(id=0, indexed=0.0), Row(id=1, indexed=0.0), Row(id=2, indexed=0.0),
                     Row(id=3, indexed=0.0), Row(id=4, indexed=0.0)]
        self.assertEqual(actual2, expected2)

        # Test model with default settings can transform
        model_default = StringIndexerModel.from_labels(["a", "b", "c"], inputCol="label")
        df2 = self.spark.createDataFrame([
            (0, "a"),
            (1, "c"),
            (2, "b"),
            (3, "b"),
            (4, "b")], ["id", "label"])
        transformed_list = model_default.transform(df2)\
            .select(model_default.getOrDefault(model_default.outputCol)).collect()
        self.assertEqual(len(transformed_list), 5) 
Example #25
Source File: functions.py    From LearningApacheSpark with MIT License
def nanvl(col1, col2):
    """Returns col1 if it is not NaN, or col2 if col1 is NaN.

    Both inputs should be floating point columns (:class:`DoubleType` or :class:`FloatType`).

    >>> df = spark.createDataFrame([(1.0, float('nan')), (float('nan'), 2.0)], ("a", "b"))
    >>> df.select(nanvl("a", "b").alias("r1"), nanvl(df.a, df.b).alias("r2")).collect()
    [Row(r1=1.0, r2=1.0), Row(r1=2.0, r2=2.0)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.nanvl(_to_java_column(col1), _to_java_column(col2))) 
Example #26
Source File: functions.py    From LearningApacheSpark with MIT License
def dayofyear(col):
    """
    Extract the day of the year of a given date as integer.

    >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])
    >>> df.select(dayofyear('dt').alias('day')).collect()
    [Row(day=98)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.dayofyear(_to_java_column(col))) 
Example #27
Source File: tests.py    From LearningApacheSpark with MIT License
def test_read_images(self):
        data_path = 'data/mllib/images/origin/kittens'
        df = ImageSchema.readImages(data_path, recursive=True, dropImageFailures=True)
        self.assertEqual(df.count(), 4)
        first_row = df.take(1)[0][0]
        array = ImageSchema.toNDArray(first_row)
        self.assertEqual(len(array), first_row[1])
        self.assertEqual(ImageSchema.toImage(array, origin=first_row[0]), first_row)
        self.assertEqual(df.schema, ImageSchema.imageSchema)
        self.assertEqual(df.schema["image"].dataType, ImageSchema.columnSchema)
        expected = {'CV_8UC3': 16, 'Undefined': -1, 'CV_8U': 0, 'CV_8UC1': 0, 'CV_8UC4': 24}
        self.assertEqual(ImageSchema.ocvTypes, expected)
        expected = ['origin', 'height', 'width', 'nChannels', 'mode', 'data']
        self.assertEqual(ImageSchema.imageFields, expected)
        self.assertEqual(ImageSchema.undefinedImageType, "Undefined")

        with QuietTest(self.sc):
            self.assertRaisesRegexp(
                TypeError,
                "image argument should be pyspark.sql.types.Row; however",
                lambda: ImageSchema.toNDArray("a"))

        with QuietTest(self.sc):
            self.assertRaisesRegexp(
                ValueError,
                "image argument should have attributes specified in",
                lambda: ImageSchema.toNDArray(Row(a=1)))

        with QuietTest(self.sc):
            self.assertRaisesRegexp(
                TypeError,
                "array argument should be numpy.ndarray; however, it got",
                lambda: ImageSchema.toImage("a")) 
Example #28
Source File: functions.py    From LearningApacheSpark with MIT License
def isnan(col):
    """An expression that returns true iff the column is NaN.

    >>> df = spark.createDataFrame([(1.0, float('nan')), (float('nan'), 2.0)], ("a", "b"))
    >>> df.select(isnan("a").alias("r1"), isnan(df.a).alias("r2")).collect()
    [Row(r1=False, r2=False), Row(r1=True, r2=True)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.isnan(_to_java_column(col)))