/*
 * Copyright (c) 2015 Couchbase, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Putting this in this Spark package to be able to access internalCreateDataFrame
// See my (currently unanswered) SO post for context:
// https://stackoverflow.com/questions/56183811/how-to-create-a-custom-structured-streaming-source-for-apache-spark-2-3-0
package org.apache.spark.sql

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.types.StructType

/** Helpers to create streaming DataFrames.
  *
  * `SQLContext.internalCreateDataFrame` is `private[sql]`, which is why this
  * object lives in the `org.apache.spark.sql` package (see the file header).
  */
object DataFrameCreation {

  /** Converts an `RDD[Row]` into a DataFrame flagged as streaming.
    *
    * @param sqlContext active SQL context used to build the DataFrame
    * @param rdd        rows to encode; each row must conform to `schema`
    * @param schema     schema describing every row in `rdd`
    * @return a streaming DataFrame (`isStreaming = true`)
    */
  def createStreamingDataFrame(sqlContext: SQLContext,
                               rdd: RDD[Row],
                               schema: StructType): DataFrame = {
    // internalCreateDataFrame requires an RDD[InternalRow].
    //
    // Create one encoder per partition instead of sharing a single
    // driver-built instance across all tasks: the ExpressionEncoder returned
    // by RowEncoder keeps mutable conversion state and is not thread-safe,
    // so a shared copy can be corrupted by tasks running concurrently in the
    // same executor JVM.
    val encoded: RDD[InternalRow] = rdd.mapPartitions { rows =>
      val encoder = RowEncoder(schema)
      rows.map(encoder.toRow)
    }
    sqlContext.internalCreateDataFrame(encoded, schema, isStreaming = true)
  }

  /** Converts a (batch) DataFrame into a DataFrame flagged as streaming.
    *
    * @param sqlContext active SQL context used to build the DataFrame
    * @param df         source DataFrame; its rows must conform to `schema`
    * @param schema     schema describing every row of `df`
    * @return a streaming DataFrame (`isStreaming = true`)
    */
  def createStreamingDataFrame(sqlContext: SQLContext,
                               df: DataFrame,
                               schema: StructType): DataFrame =
    // Delegate to the RDD overload rather than duplicating the encoding
    // logic, so both entry points stay in sync.
    createStreamingDataFrame(sqlContext, df.rdd, schema)
}