package net.jgp.labs.spark.l020_dstream.l010_filesystem_text_dataframe; import java.io.Serializable; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.function.Function; import org.apache.spark.api.java.function.VoidFunction; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.types.DataTypes; import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; import org.apache.spark.streaming.Durations; import org.apache.spark.streaming.api.java.JavaDStream; import org.apache.spark.streaming.api.java.JavaStreamingContext; import net.jgp.labs.spark.x.utils.streaming.JavaSparkSessionSingleton; import net.jgp.labs.spark.x.utils.streaming.StreamingUtils; public class StreamingIngestionFileSystemTextFileToDataframeApp implements Serializable { private static final long serialVersionUID = 6795623748995704732L; public static void main(String[] args) { StreamingIngestionFileSystemTextFileToDataframeApp app = new StreamingIngestionFileSystemTextFileToDataframeApp(); app.start(); } private void start() { // Create a local StreamingContext with two working thread and batch // interval of // 1 second SparkConf conf = new SparkConf().setMaster("local[2]").setAppName( "Streaming Ingestion File System Text File to Dataframe"); JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations .seconds(5)); JavaDStream<String> msgDataStream = jssc.textFileStream(StreamingUtils .getInputDirectory()); msgDataStream.print(); // Create JavaRDD<Row> msgDataStream.foreachRDD(new VoidFunction<JavaRDD<String>>() { private static final long serialVersionUID = -590010339928376829L; @Override public void call(JavaRDD<String> rdd) { JavaRDD<Row> rowRDD = rdd.map(new Function<String, Row>() { private static final long serialVersionUID = 5167089361335095997L; @Override public Row call(String msg) { Row row = RowFactory.create(msg); return row; } }); // Create Schema StructType schema = DataTypes.createStructType( new StructField[] { DataTypes.createStructField("Message", DataTypes.StringType, true) }); // Get Spark 2.0 session SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context() .getConf()); Dataset<Row> msgDataFrame = spark.createDataFrame(rowRDD, schema); msgDataFrame.show(); } }); jssc.start(); try { jssc.awaitTermination(); } catch (InterruptedException e) { // TODO Auto-generated catch block e.printStackTrace(); } } }